xref: /openbmc/linux/fs/read_write.c (revision 95e9fd10)
1 /*
2  *  linux/fs/read_write.c
3  *
4  *  Copyright (C) 1991, 1992  Linus Torvalds
5  */
6 
7 #include <linux/slab.h>
8 #include <linux/stat.h>
9 #include <linux/fcntl.h>
10 #include <linux/file.h>
11 #include <linux/uio.h>
12 #include <linux/fsnotify.h>
13 #include <linux/security.h>
14 #include <linux/export.h>
15 #include <linux/syscalls.h>
16 #include <linux/pagemap.h>
17 #include <linux/splice.h>
18 #include "read_write.h"
19 
20 #include <asm/uaccess.h>
21 #include <asm/unistd.h>
22 
23 const struct file_operations generic_ro_fops = {
24 	.llseek		= generic_file_llseek,
25 	.read		= do_sync_read,
26 	.aio_read	= generic_file_aio_read,
27 	.mmap		= generic_file_readonly_mmap,
28 	.splice_read	= generic_file_splice_read,
29 };
30 
31 EXPORT_SYMBOL(generic_ro_fops);
32 
33 static inline int unsigned_offsets(struct file *file)
34 {
35 	return file->f_mode & FMODE_UNSIGNED_OFFSET;
36 }
37 
38 static loff_t lseek_execute(struct file *file, struct inode *inode,
39 		loff_t offset, loff_t maxsize)
40 {
41 	if (offset < 0 && !unsigned_offsets(file))
42 		return -EINVAL;
43 	if (offset > maxsize)
44 		return -EINVAL;
45 
46 	if (offset != file->f_pos) {
47 		file->f_pos = offset;
48 		file->f_version = 0;
49 	}
50 	return offset;
51 }
52 
53 /**
54  * generic_file_llseek_size - generic llseek implementation for regular files
55  * @file:	file structure to seek on
56  * @offset:	file offset to seek to
57  * @origin:	type of seek
58  * @size:	max size of this file in file system
59  * @eof:	offset used for SEEK_END position
60  *
61  * This is a variant of generic_file_llseek that allows passing in a custom
62  * maximum file size and a custom EOF position, for e.g. hashed directories
63  *
64  * Synchronization:
65  * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
66  * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
67  * read/writes behave like SEEK_SET against seeks.
68  */
69 loff_t
70 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
71 		loff_t maxsize, loff_t eof)
72 {
73 	struct inode *inode = file->f_mapping->host;
74 
75 	switch (origin) {
76 	case SEEK_END:
77 		offset += eof;
78 		break;
79 	case SEEK_CUR:
80 		/*
81 		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
82 		 * position-querying operation.  Avoid rewriting the "same"
83 		 * f_pos value back to the file because a concurrent read(),
84 		 * write() or lseek() might have altered it
85 		 */
86 		if (offset == 0)
87 			return file->f_pos;
88 		/*
89 		 * f_lock protects against read/modify/write race with other
90 		 * SEEK_CURs. Note that parallel writes and reads behave
91 		 * like SEEK_SET.
92 		 */
93 		spin_lock(&file->f_lock);
94 		offset = lseek_execute(file, inode, file->f_pos + offset,
95 				       maxsize);
96 		spin_unlock(&file->f_lock);
97 		return offset;
98 	case SEEK_DATA:
99 		/*
100 		 * In the generic case the entire file is data, so as long as
101 		 * offset isn't at the end of the file then the offset is data.
102 		 */
103 		if (offset >= eof)
104 			return -ENXIO;
105 		break;
106 	case SEEK_HOLE:
107 		/*
108 		 * There is a virtual hole at the end of the file, so as long as
109 		 * offset isn't i_size or larger, return i_size.
110 		 */
111 		if (offset >= eof)
112 			return -ENXIO;
113 		offset = eof;
114 		break;
115 	}
116 
117 	return lseek_execute(file, inode, offset, maxsize);
118 }
119 EXPORT_SYMBOL(generic_file_llseek_size);
120 
121 /**
122  * generic_file_llseek - generic llseek implementation for regular files
123  * @file:	file structure to seek on
124  * @offset:	file offset to seek to
125  * @origin:	type of seek
126  *
127  * This is a generic implemenation of ->llseek useable for all normal local
128  * filesystems.  It just updates the file offset to the value specified by
129  * @offset and @origin under i_mutex.
130  */
131 loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
132 {
133 	struct inode *inode = file->f_mapping->host;
134 
135 	return generic_file_llseek_size(file, offset, origin,
136 					inode->i_sb->s_maxbytes,
137 					i_size_read(inode));
138 }
139 EXPORT_SYMBOL(generic_file_llseek);
140 
141 /**
142  * noop_llseek - No Operation Performed llseek implementation
143  * @file:	file structure to seek on
144  * @offset:	file offset to seek to
145  * @origin:	type of seek
146  *
147  * This is an implementation of ->llseek useable for the rare special case when
148  * userspace expects the seek to succeed but the (device) file is actually not
149  * able to perform the seek. In this case you use noop_llseek() instead of
150  * falling back to the default implementation of ->llseek.
151  */
152 loff_t noop_llseek(struct file *file, loff_t offset, int origin)
153 {
154 	return file->f_pos;
155 }
156 EXPORT_SYMBOL(noop_llseek);
157 
158 loff_t no_llseek(struct file *file, loff_t offset, int origin)
159 {
160 	return -ESPIPE;
161 }
162 EXPORT_SYMBOL(no_llseek);
163 
164 loff_t default_llseek(struct file *file, loff_t offset, int origin)
165 {
166 	struct inode *inode = file->f_path.dentry->d_inode;
167 	loff_t retval;
168 
169 	mutex_lock(&inode->i_mutex);
170 	switch (origin) {
171 		case SEEK_END:
172 			offset += i_size_read(inode);
173 			break;
174 		case SEEK_CUR:
175 			if (offset == 0) {
176 				retval = file->f_pos;
177 				goto out;
178 			}
179 			offset += file->f_pos;
180 			break;
181 		case SEEK_DATA:
182 			/*
183 			 * In the generic case the entire file is data, so as
184 			 * long as offset isn't at the end of the file then the
185 			 * offset is data.
186 			 */
187 			if (offset >= inode->i_size) {
188 				retval = -ENXIO;
189 				goto out;
190 			}
191 			break;
192 		case SEEK_HOLE:
193 			/*
194 			 * There is a virtual hole at the end of the file, so
195 			 * as long as offset isn't i_size or larger, return
196 			 * i_size.
197 			 */
198 			if (offset >= inode->i_size) {
199 				retval = -ENXIO;
200 				goto out;
201 			}
202 			offset = inode->i_size;
203 			break;
204 	}
205 	retval = -EINVAL;
206 	if (offset >= 0 || unsigned_offsets(file)) {
207 		if (offset != file->f_pos) {
208 			file->f_pos = offset;
209 			file->f_version = 0;
210 		}
211 		retval = offset;
212 	}
213 out:
214 	mutex_unlock(&inode->i_mutex);
215 	return retval;
216 }
217 EXPORT_SYMBOL(default_llseek);
218 
219 loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
220 {
221 	loff_t (*fn)(struct file *, loff_t, int);
222 
223 	fn = no_llseek;
224 	if (file->f_mode & FMODE_LSEEK) {
225 		if (file->f_op && file->f_op->llseek)
226 			fn = file->f_op->llseek;
227 	}
228 	return fn(file, offset, origin);
229 }
230 EXPORT_SYMBOL(vfs_llseek);
231 
232 SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin)
233 {
234 	off_t retval;
235 	struct file * file;
236 	int fput_needed;
237 
238 	retval = -EBADF;
239 	file = fget_light(fd, &fput_needed);
240 	if (!file)
241 		goto bad;
242 
243 	retval = -EINVAL;
244 	if (origin <= SEEK_MAX) {
245 		loff_t res = vfs_llseek(file, offset, origin);
246 		retval = res;
247 		if (res != (loff_t)retval)
248 			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
249 	}
250 	fput_light(file, fput_needed);
251 bad:
252 	return retval;
253 }
254 
#ifdef __ARCH_WANT_SYS_LLSEEK
/*
 * llseek(2): seek with the offset passed as two halves and the resulting
 * position written to *@result.
 *
 * Returns 0 on success, -EBADF when @fd has no open file, -EINVAL when
 * @origin is above SEEK_MAX, -EFAULT when @result cannot be written, or
 * the (negative) error from vfs_llseek().
 */
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
		unsigned long, offset_low, loff_t __user *, result,
		unsigned int, origin)
{
	struct file *file;
	loff_t offset;
	int fput_needed;
	int retval;

	file = fget_light(fd, &fput_needed);
	if (!file)
		return -EBADF;

	if (origin > SEEK_MAX) {
		retval = -EINVAL;
		goto out_putf;
	}

	offset = vfs_llseek(file, ((loff_t) offset_high << 32) | offset_low,
			origin);
	if (offset < 0) {
		/* A negative result is an errno; hand it back as is. */
		retval = (int)offset;
		goto out_putf;
	}

	retval = copy_to_user(result, &offset, sizeof(offset)) ? -EFAULT : 0;
out_putf:
	fput_light(file, fput_needed);
	return retval;
}
#endif
289 
290 
291 /*
292  * rw_verify_area doesn't like huge counts. We limit
293  * them to something that fits in "int" so that others
294  * won't have to do range checks all the time.
295  */
296 int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
297 {
298 	struct inode *inode;
299 	loff_t pos;
300 	int retval = -EINVAL;
301 
302 	inode = file->f_path.dentry->d_inode;
303 	if (unlikely((ssize_t) count < 0))
304 		return retval;
305 	pos = *ppos;
306 	if (unlikely(pos < 0)) {
307 		if (!unsigned_offsets(file))
308 			return retval;
309 		if (count >= -pos) /* both values are in 0..LLONG_MAX */
310 			return -EOVERFLOW;
311 	} else if (unlikely((loff_t) (pos + count) < 0)) {
312 		if (!unsigned_offsets(file))
313 			return retval;
314 	}
315 
316 	if (unlikely(inode->i_flock && mandatory_lock(inode))) {
317 		retval = locks_mandatory_area(
318 			read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
319 			inode, file, pos, count);
320 		if (retval < 0)
321 			return retval;
322 	}
323 	retval = security_file_permission(file,
324 				read_write == READ ? MAY_READ : MAY_WRITE);
325 	if (retval)
326 		return retval;
327 	return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
328 }
329 
330 static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
331 {
332 	set_current_state(TASK_UNINTERRUPTIBLE);
333 	if (!kiocbIsKicked(iocb))
334 		schedule();
335 	else
336 		kiocbClearKicked(iocb);
337 	__set_current_state(TASK_RUNNING);
338 }
339 
340 ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
341 {
342 	struct iovec iov = { .iov_base = buf, .iov_len = len };
343 	struct kiocb kiocb;
344 	ssize_t ret;
345 
346 	init_sync_kiocb(&kiocb, filp);
347 	kiocb.ki_pos = *ppos;
348 	kiocb.ki_left = len;
349 	kiocb.ki_nbytes = len;
350 
351 	for (;;) {
352 		ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
353 		if (ret != -EIOCBRETRY)
354 			break;
355 		wait_on_retry_sync_kiocb(&kiocb);
356 	}
357 
358 	if (-EIOCBQUEUED == ret)
359 		ret = wait_on_sync_kiocb(&kiocb);
360 	*ppos = kiocb.ki_pos;
361 	return ret;
362 }
363 
364 EXPORT_SYMBOL(do_sync_read);
365 
366 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
367 {
368 	ssize_t ret;
369 
370 	if (!(file->f_mode & FMODE_READ))
371 		return -EBADF;
372 	if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
373 		return -EINVAL;
374 	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
375 		return -EFAULT;
376 
377 	ret = rw_verify_area(READ, file, pos, count);
378 	if (ret >= 0) {
379 		count = ret;
380 		if (file->f_op->read)
381 			ret = file->f_op->read(file, buf, count, pos);
382 		else
383 			ret = do_sync_read(file, buf, count, pos);
384 		if (ret > 0) {
385 			fsnotify_access(file);
386 			add_rchar(current, ret);
387 		}
388 		inc_syscr(current);
389 	}
390 
391 	return ret;
392 }
393 
394 EXPORT_SYMBOL(vfs_read);
395 
396 ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
397 {
398 	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
399 	struct kiocb kiocb;
400 	ssize_t ret;
401 
402 	init_sync_kiocb(&kiocb, filp);
403 	kiocb.ki_pos = *ppos;
404 	kiocb.ki_left = len;
405 	kiocb.ki_nbytes = len;
406 
407 	for (;;) {
408 		ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
409 		if (ret != -EIOCBRETRY)
410 			break;
411 		wait_on_retry_sync_kiocb(&kiocb);
412 	}
413 
414 	if (-EIOCBQUEUED == ret)
415 		ret = wait_on_sync_kiocb(&kiocb);
416 	*ppos = kiocb.ki_pos;
417 	return ret;
418 }
419 
420 EXPORT_SYMBOL(do_sync_write);
421 
422 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
423 {
424 	ssize_t ret;
425 
426 	if (!(file->f_mode & FMODE_WRITE))
427 		return -EBADF;
428 	if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
429 		return -EINVAL;
430 	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
431 		return -EFAULT;
432 
433 	ret = rw_verify_area(WRITE, file, pos, count);
434 	if (ret >= 0) {
435 		count = ret;
436 		if (file->f_op->write)
437 			ret = file->f_op->write(file, buf, count, pos);
438 		else
439 			ret = do_sync_write(file, buf, count, pos);
440 		if (ret > 0) {
441 			fsnotify_modify(file);
442 			add_wchar(current, ret);
443 		}
444 		inc_syscw(current);
445 	}
446 
447 	return ret;
448 }
449 
450 EXPORT_SYMBOL(vfs_write);
451 
452 static inline loff_t file_pos_read(struct file *file)
453 {
454 	return file->f_pos;
455 }
456 
457 static inline void file_pos_write(struct file *file, loff_t pos)
458 {
459 	file->f_pos = pos;
460 }
461 
462 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
463 {
464 	struct file *file;
465 	ssize_t ret = -EBADF;
466 	int fput_needed;
467 
468 	file = fget_light(fd, &fput_needed);
469 	if (file) {
470 		loff_t pos = file_pos_read(file);
471 		ret = vfs_read(file, buf, count, &pos);
472 		file_pos_write(file, pos);
473 		fput_light(file, fput_needed);
474 	}
475 
476 	return ret;
477 }
478 
479 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
480 		size_t, count)
481 {
482 	struct file *file;
483 	ssize_t ret = -EBADF;
484 	int fput_needed;
485 
486 	file = fget_light(fd, &fput_needed);
487 	if (file) {
488 		loff_t pos = file_pos_read(file);
489 		ret = vfs_write(file, buf, count, &pos);
490 		file_pos_write(file, pos);
491 		fput_light(file, fput_needed);
492 	}
493 
494 	return ret;
495 }
496 
497 SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf,
498 			size_t count, loff_t pos)
499 {
500 	struct file *file;
501 	ssize_t ret = -EBADF;
502 	int fput_needed;
503 
504 	if (pos < 0)
505 		return -EINVAL;
506 
507 	file = fget_light(fd, &fput_needed);
508 	if (file) {
509 		ret = -ESPIPE;
510 		if (file->f_mode & FMODE_PREAD)
511 			ret = vfs_read(file, buf, count, &pos);
512 		fput_light(file, fput_needed);
513 	}
514 
515 	return ret;
516 }
517 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
518 asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos)
519 {
520 	return SYSC_pread64((unsigned int) fd, (char __user *) buf,
521 			    (size_t) count, pos);
522 }
523 SYSCALL_ALIAS(sys_pread64, SyS_pread64);
524 #endif
525 
526 SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,
527 			 size_t count, loff_t pos)
528 {
529 	struct file *file;
530 	ssize_t ret = -EBADF;
531 	int fput_needed;
532 
533 	if (pos < 0)
534 		return -EINVAL;
535 
536 	file = fget_light(fd, &fput_needed);
537 	if (file) {
538 		ret = -ESPIPE;
539 		if (file->f_mode & FMODE_PWRITE)
540 			ret = vfs_write(file, buf, count, &pos);
541 		fput_light(file, fput_needed);
542 	}
543 
544 	return ret;
545 }
546 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
547 asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos)
548 {
549 	return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf,
550 			     (size_t) count, pos);
551 }
552 SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64);
553 #endif
554 
/*
 * Reduce an iovec's length in-place so that it describes at most @to bytes.
 *
 * Segments are walked in order; the first one that reaches or crosses @to
 * has its iov_len trimmed to the remaining byte budget and the walk stops.
 * Returns the resulting number of segments: the index of the trimmed
 * segment plus one, or @nr_segs when the whole vector is shorter than @to.
 */
unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
{
	unsigned long idx;
	size_t used = 0;

	for (idx = 0; idx < nr_segs; idx++) {
		if (used + iov[idx].iov_len >= to) {
			iov[idx].iov_len = to - used;
			return idx + 1;
		}
		used += iov[idx].iov_len;
	}
	return idx;
}
574 EXPORT_SYMBOL(iov_shorten);
575 
576 ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
577 		unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
578 {
579 	struct kiocb kiocb;
580 	ssize_t ret;
581 
582 	init_sync_kiocb(&kiocb, filp);
583 	kiocb.ki_pos = *ppos;
584 	kiocb.ki_left = len;
585 	kiocb.ki_nbytes = len;
586 
587 	for (;;) {
588 		ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
589 		if (ret != -EIOCBRETRY)
590 			break;
591 		wait_on_retry_sync_kiocb(&kiocb);
592 	}
593 
594 	if (ret == -EIOCBQUEUED)
595 		ret = wait_on_sync_kiocb(&kiocb);
596 	*ppos = kiocb.ki_pos;
597 	return ret;
598 }
599 
600 /* Do it by hand, with file-ops */
601 ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
602 		unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
603 {
604 	struct iovec *vector = iov;
605 	ssize_t ret = 0;
606 
607 	while (nr_segs > 0) {
608 		void __user *base;
609 		size_t len;
610 		ssize_t nr;
611 
612 		base = vector->iov_base;
613 		len = vector->iov_len;
614 		vector++;
615 		nr_segs--;
616 
617 		nr = fn(filp, base, len, ppos);
618 
619 		if (nr < 0) {
620 			if (!ret)
621 				ret = nr;
622 			break;
623 		}
624 		ret += nr;
625 		if (nr != len)
626 			break;
627 	}
628 
629 	return ret;
630 }
631 
632 /* A write operation does a read from user space and vice versa */
633 #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
634 
635 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
636 			      unsigned long nr_segs, unsigned long fast_segs,
637 			      struct iovec *fast_pointer,
638 			      struct iovec **ret_pointer)
639 {
640 	unsigned long seg;
641 	ssize_t ret;
642 	struct iovec *iov = fast_pointer;
643 
644 	/*
645 	 * SuS says "The readv() function *may* fail if the iovcnt argument
646 	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
647 	 * traditionally returned zero for zero segments, so...
648 	 */
649 	if (nr_segs == 0) {
650 		ret = 0;
651 		goto out;
652 	}
653 
654 	/*
655 	 * First get the "struct iovec" from user memory and
656 	 * verify all the pointers
657 	 */
658 	if (nr_segs > UIO_MAXIOV) {
659 		ret = -EINVAL;
660 		goto out;
661 	}
662 	if (nr_segs > fast_segs) {
663 		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
664 		if (iov == NULL) {
665 			ret = -ENOMEM;
666 			goto out;
667 		}
668 	}
669 	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
670 		ret = -EFAULT;
671 		goto out;
672 	}
673 
674 	/*
675 	 * According to the Single Unix Specification we should return EINVAL
676 	 * if an element length is < 0 when cast to ssize_t or if the
677 	 * total length would overflow the ssize_t return value of the
678 	 * system call.
679 	 *
680 	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
681 	 * overflow case.
682 	 */
683 	ret = 0;
684 	for (seg = 0; seg < nr_segs; seg++) {
685 		void __user *buf = iov[seg].iov_base;
686 		ssize_t len = (ssize_t)iov[seg].iov_len;
687 
688 		/* see if we we're about to use an invalid len or if
689 		 * it's about to overflow ssize_t */
690 		if (len < 0) {
691 			ret = -EINVAL;
692 			goto out;
693 		}
694 		if (type >= 0
695 		    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
696 			ret = -EFAULT;
697 			goto out;
698 		}
699 		if (len > MAX_RW_COUNT - ret) {
700 			len = MAX_RW_COUNT - ret;
701 			iov[seg].iov_len = len;
702 		}
703 		ret += len;
704 	}
705 out:
706 	*ret_pointer = iov;
707 	return ret;
708 }
709 
710 static ssize_t do_readv_writev(int type, struct file *file,
711 			       const struct iovec __user * uvector,
712 			       unsigned long nr_segs, loff_t *pos)
713 {
714 	size_t tot_len;
715 	struct iovec iovstack[UIO_FASTIOV];
716 	struct iovec *iov = iovstack;
717 	ssize_t ret;
718 	io_fn_t fn;
719 	iov_fn_t fnv;
720 
721 	if (!file->f_op) {
722 		ret = -EINVAL;
723 		goto out;
724 	}
725 
726 	ret = rw_copy_check_uvector(type, uvector, nr_segs,
727 				    ARRAY_SIZE(iovstack), iovstack, &iov);
728 	if (ret <= 0)
729 		goto out;
730 
731 	tot_len = ret;
732 	ret = rw_verify_area(type, file, pos, tot_len);
733 	if (ret < 0)
734 		goto out;
735 
736 	fnv = NULL;
737 	if (type == READ) {
738 		fn = file->f_op->read;
739 		fnv = file->f_op->aio_read;
740 	} else {
741 		fn = (io_fn_t)file->f_op->write;
742 		fnv = file->f_op->aio_write;
743 	}
744 
745 	if (fnv)
746 		ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
747 						pos, fnv);
748 	else
749 		ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
750 
751 out:
752 	if (iov != iovstack)
753 		kfree(iov);
754 	if ((ret + (type == READ)) > 0) {
755 		if (type == READ)
756 			fsnotify_access(file);
757 		else
758 			fsnotify_modify(file);
759 	}
760 	return ret;
761 }
762 
763 ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
764 		  unsigned long vlen, loff_t *pos)
765 {
766 	if (!(file->f_mode & FMODE_READ))
767 		return -EBADF;
768 	if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
769 		return -EINVAL;
770 
771 	return do_readv_writev(READ, file, vec, vlen, pos);
772 }
773 
774 EXPORT_SYMBOL(vfs_readv);
775 
776 ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
777 		   unsigned long vlen, loff_t *pos)
778 {
779 	if (!(file->f_mode & FMODE_WRITE))
780 		return -EBADF;
781 	if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
782 		return -EINVAL;
783 
784 	return do_readv_writev(WRITE, file, vec, vlen, pos);
785 }
786 
787 EXPORT_SYMBOL(vfs_writev);
788 
789 SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
790 		unsigned long, vlen)
791 {
792 	struct file *file;
793 	ssize_t ret = -EBADF;
794 	int fput_needed;
795 
796 	file = fget_light(fd, &fput_needed);
797 	if (file) {
798 		loff_t pos = file_pos_read(file);
799 		ret = vfs_readv(file, vec, vlen, &pos);
800 		file_pos_write(file, pos);
801 		fput_light(file, fput_needed);
802 	}
803 
804 	if (ret > 0)
805 		add_rchar(current, ret);
806 	inc_syscr(current);
807 	return ret;
808 }
809 
810 SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
811 		unsigned long, vlen)
812 {
813 	struct file *file;
814 	ssize_t ret = -EBADF;
815 	int fput_needed;
816 
817 	file = fget_light(fd, &fput_needed);
818 	if (file) {
819 		loff_t pos = file_pos_read(file);
820 		ret = vfs_writev(file, vec, vlen, &pos);
821 		file_pos_write(file, pos);
822 		fput_light(file, fput_needed);
823 	}
824 
825 	if (ret > 0)
826 		add_wchar(current, ret);
827 	inc_syscw(current);
828 	return ret;
829 }
830 
831 static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
832 {
833 #define HALF_LONG_BITS (BITS_PER_LONG / 2)
834 	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
835 }
836 
837 SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
838 		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
839 {
840 	loff_t pos = pos_from_hilo(pos_h, pos_l);
841 	struct file *file;
842 	ssize_t ret = -EBADF;
843 	int fput_needed;
844 
845 	if (pos < 0)
846 		return -EINVAL;
847 
848 	file = fget_light(fd, &fput_needed);
849 	if (file) {
850 		ret = -ESPIPE;
851 		if (file->f_mode & FMODE_PREAD)
852 			ret = vfs_readv(file, vec, vlen, &pos);
853 		fput_light(file, fput_needed);
854 	}
855 
856 	if (ret > 0)
857 		add_rchar(current, ret);
858 	inc_syscr(current);
859 	return ret;
860 }
861 
862 SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
863 		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
864 {
865 	loff_t pos = pos_from_hilo(pos_h, pos_l);
866 	struct file *file;
867 	ssize_t ret = -EBADF;
868 	int fput_needed;
869 
870 	if (pos < 0)
871 		return -EINVAL;
872 
873 	file = fget_light(fd, &fput_needed);
874 	if (file) {
875 		ret = -ESPIPE;
876 		if (file->f_mode & FMODE_PWRITE)
877 			ret = vfs_writev(file, vec, vlen, &pos);
878 		fput_light(file, fput_needed);
879 	}
880 
881 	if (ret > 0)
882 		add_wchar(current, ret);
883 	inc_syscw(current);
884 	return ret;
885 }
886 
887 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
888 			   size_t count, loff_t max)
889 {
890 	struct file * in_file, * out_file;
891 	struct inode * in_inode, * out_inode;
892 	loff_t pos;
893 	ssize_t retval;
894 	int fput_needed_in, fput_needed_out, fl;
895 
896 	/*
897 	 * Get input file, and verify that it is ok..
898 	 */
899 	retval = -EBADF;
900 	in_file = fget_light(in_fd, &fput_needed_in);
901 	if (!in_file)
902 		goto out;
903 	if (!(in_file->f_mode & FMODE_READ))
904 		goto fput_in;
905 	retval = -ESPIPE;
906 	if (!ppos)
907 		ppos = &in_file->f_pos;
908 	else
909 		if (!(in_file->f_mode & FMODE_PREAD))
910 			goto fput_in;
911 	retval = rw_verify_area(READ, in_file, ppos, count);
912 	if (retval < 0)
913 		goto fput_in;
914 	count = retval;
915 
916 	/*
917 	 * Get output file, and verify that it is ok..
918 	 */
919 	retval = -EBADF;
920 	out_file = fget_light(out_fd, &fput_needed_out);
921 	if (!out_file)
922 		goto fput_in;
923 	if (!(out_file->f_mode & FMODE_WRITE))
924 		goto fput_out;
925 	retval = -EINVAL;
926 	in_inode = in_file->f_path.dentry->d_inode;
927 	out_inode = out_file->f_path.dentry->d_inode;
928 	retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
929 	if (retval < 0)
930 		goto fput_out;
931 	count = retval;
932 
933 	if (!max)
934 		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
935 
936 	pos = *ppos;
937 	if (unlikely(pos + count > max)) {
938 		retval = -EOVERFLOW;
939 		if (pos >= max)
940 			goto fput_out;
941 		count = max - pos;
942 	}
943 
944 	fl = 0;
945 #if 0
946 	/*
947 	 * We need to debate whether we can enable this or not. The
948 	 * man page documents EAGAIN return for the output at least,
949 	 * and the application is arguably buggy if it doesn't expect
950 	 * EAGAIN on a non-blocking file descriptor.
951 	 */
952 	if (in_file->f_flags & O_NONBLOCK)
953 		fl = SPLICE_F_NONBLOCK;
954 #endif
955 	retval = do_splice_direct(in_file, ppos, out_file, count, fl);
956 
957 	if (retval > 0) {
958 		add_rchar(current, retval);
959 		add_wchar(current, retval);
960 	}
961 
962 	inc_syscr(current);
963 	inc_syscw(current);
964 	if (*ppos > max)
965 		retval = -EOVERFLOW;
966 
967 fput_out:
968 	fput_light(out_file, fput_needed_out);
969 fput_in:
970 	fput_light(in_file, fput_needed_in);
971 out:
972 	return retval;
973 }
974 
975 SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
976 {
977 	loff_t pos;
978 	off_t off;
979 	ssize_t ret;
980 
981 	if (offset) {
982 		if (unlikely(get_user(off, offset)))
983 			return -EFAULT;
984 		pos = off;
985 		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
986 		if (unlikely(put_user(pos, offset)))
987 			return -EFAULT;
988 		return ret;
989 	}
990 
991 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
992 }
993 
994 SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
995 {
996 	loff_t pos;
997 	ssize_t ret;
998 
999 	if (offset) {
1000 		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1001 			return -EFAULT;
1002 		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1003 		if (unlikely(put_user(pos, offset)))
1004 			return -EFAULT;
1005 		return ret;
1006 	}
1007 
1008 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1009 }
1010