xref: /openbmc/linux/fs/read_write.c (revision 3a0d89d3)
1 /*
2  *  linux/fs/read_write.c
3  *
4  *  Copyright (C) 1991, 1992  Linus Torvalds
5  */
6 
7 #include <linux/slab.h>
8 #include <linux/stat.h>
9 #include <linux/fcntl.h>
10 #include <linux/file.h>
11 #include <linux/uio.h>
12 #include <linux/aio.h>
13 #include <linux/fsnotify.h>
14 #include <linux/security.h>
15 #include <linux/export.h>
16 #include <linux/syscalls.h>
17 #include <linux/pagemap.h>
18 #include <linux/splice.h>
19 #include <linux/compat.h>
20 #include "internal.h"
21 
22 #include <asm/uaccess.h>
23 #include <asm/unistd.h>
24 
/*
 * io_fn_t:  signature of the per-buffer hook (->read, or ->write after a
 *           cast) that do_loop_readv_writev() calls once per iovec segment.
 * iov_fn_t: signature of the vectored hook (->aio_read / ->aio_write) that
 *           do_sync_readv_writev() calls once for the whole iovec array.
 */
typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
		unsigned long, loff_t);
28 
/*
 * File operations table that wires up only the seek, read, aio_read, mmap
 * and splice_read hooks; no write hook is set.
 */
const struct file_operations generic_ro_fops = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.aio_read	= generic_file_aio_read,
	.mmap		= generic_file_readonly_mmap,
	.splice_read	= generic_file_splice_read,
};

EXPORT_SYMBOL(generic_ro_fops);
38 
39 static inline int unsigned_offsets(struct file *file)
40 {
41 	return file->f_mode & FMODE_UNSIGNED_OFFSET;
42 }
43 
44 /**
45  * vfs_setpos - update the file offset for lseek
46  * @file:	file structure in question
47  * @offset:	file offset to seek to
48  * @maxsize:	maximum file size
49  *
50  * This is a low-level filesystem helper for updating the file offset to
51  * the value specified by @offset if the given offset is valid and it is
52  * not equal to the current file offset.
53  *
54  * Return the specified offset on success and -EINVAL on invalid offset.
55  */
56 loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
57 {
58 	if (offset < 0 && !unsigned_offsets(file))
59 		return -EINVAL;
60 	if (offset > maxsize)
61 		return -EINVAL;
62 
63 	if (offset != file->f_pos) {
64 		file->f_pos = offset;
65 		file->f_version = 0;
66 	}
67 	return offset;
68 }
69 EXPORT_SYMBOL(vfs_setpos);
70 
71 /**
72  * generic_file_llseek_size - generic llseek implementation for regular files
73  * @file:	file structure to seek on
74  * @offset:	file offset to seek to
75  * @whence:	type of seek
76  * @size:	max size of this file in file system
77  * @eof:	offset used for SEEK_END position
78  *
79  * This is a variant of generic_file_llseek that allows passing in a custom
80  * maximum file size and a custom EOF position, for e.g. hashed directories
81  *
82  * Synchronization:
83  * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
84  * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
85  * read/writes behave like SEEK_SET against seeks.
86  */
87 loff_t
88 generic_file_llseek_size(struct file *file, loff_t offset, int whence,
89 		loff_t maxsize, loff_t eof)
90 {
91 	switch (whence) {
92 	case SEEK_END:
93 		offset += eof;
94 		break;
95 	case SEEK_CUR:
96 		/*
97 		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
98 		 * position-querying operation.  Avoid rewriting the "same"
99 		 * f_pos value back to the file because a concurrent read(),
100 		 * write() or lseek() might have altered it
101 		 */
102 		if (offset == 0)
103 			return file->f_pos;
104 		/*
105 		 * f_lock protects against read/modify/write race with other
106 		 * SEEK_CURs. Note that parallel writes and reads behave
107 		 * like SEEK_SET.
108 		 */
109 		spin_lock(&file->f_lock);
110 		offset = vfs_setpos(file, file->f_pos + offset, maxsize);
111 		spin_unlock(&file->f_lock);
112 		return offset;
113 	case SEEK_DATA:
114 		/*
115 		 * In the generic case the entire file is data, so as long as
116 		 * offset isn't at the end of the file then the offset is data.
117 		 */
118 		if (offset >= eof)
119 			return -ENXIO;
120 		break;
121 	case SEEK_HOLE:
122 		/*
123 		 * There is a virtual hole at the end of the file, so as long as
124 		 * offset isn't i_size or larger, return i_size.
125 		 */
126 		if (offset >= eof)
127 			return -ENXIO;
128 		offset = eof;
129 		break;
130 	}
131 
132 	return vfs_setpos(file, offset, maxsize);
133 }
134 EXPORT_SYMBOL(generic_file_llseek_size);
135 
136 /**
137  * generic_file_llseek - generic llseek implementation for regular files
138  * @file:	file structure to seek on
139  * @offset:	file offset to seek to
140  * @whence:	type of seek
141  *
142  * This is a generic implemenation of ->llseek useable for all normal local
143  * filesystems.  It just updates the file offset to the value specified by
144  * @offset and @whence.
145  */
146 loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
147 {
148 	struct inode *inode = file->f_mapping->host;
149 
150 	return generic_file_llseek_size(file, offset, whence,
151 					inode->i_sb->s_maxbytes,
152 					i_size_read(inode));
153 }
154 EXPORT_SYMBOL(generic_file_llseek);
155 
156 /**
157  * fixed_size_llseek - llseek implementation for fixed-sized devices
158  * @file:	file structure to seek on
159  * @offset:	file offset to seek to
160  * @whence:	type of seek
161  * @size:	size of the file
162  *
163  */
164 loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
165 {
166 	switch (whence) {
167 	case SEEK_SET: case SEEK_CUR: case SEEK_END:
168 		return generic_file_llseek_size(file, offset, whence,
169 						size, size);
170 	default:
171 		return -EINVAL;
172 	}
173 }
174 EXPORT_SYMBOL(fixed_size_llseek);
175 
176 /**
177  * noop_llseek - No Operation Performed llseek implementation
178  * @file:	file structure to seek on
179  * @offset:	file offset to seek to
180  * @whence:	type of seek
181  *
182  * This is an implementation of ->llseek useable for the rare special case when
183  * userspace expects the seek to succeed but the (device) file is actually not
184  * able to perform the seek. In this case you use noop_llseek() instead of
185  * falling back to the default implementation of ->llseek.
186  */
187 loff_t noop_llseek(struct file *file, loff_t offset, int whence)
188 {
189 	return file->f_pos;
190 }
191 EXPORT_SYMBOL(noop_llseek);
192 
/*
 * no_llseek - llseek implementation that always fails
 *
 * Ignores all arguments and returns -ESPIPE.  vfs_llseek() uses this as its
 * fallback when the file lacks FMODE_LSEEK or has no ->llseek hook.
 */
loff_t no_llseek(struct file *file, loff_t offset, int whence)
{
	return -ESPIPE;
}
EXPORT_SYMBOL(no_llseek);
198 
199 loff_t default_llseek(struct file *file, loff_t offset, int whence)
200 {
201 	struct inode *inode = file_inode(file);
202 	loff_t retval;
203 
204 	mutex_lock(&inode->i_mutex);
205 	switch (whence) {
206 		case SEEK_END:
207 			offset += i_size_read(inode);
208 			break;
209 		case SEEK_CUR:
210 			if (offset == 0) {
211 				retval = file->f_pos;
212 				goto out;
213 			}
214 			offset += file->f_pos;
215 			break;
216 		case SEEK_DATA:
217 			/*
218 			 * In the generic case the entire file is data, so as
219 			 * long as offset isn't at the end of the file then the
220 			 * offset is data.
221 			 */
222 			if (offset >= inode->i_size) {
223 				retval = -ENXIO;
224 				goto out;
225 			}
226 			break;
227 		case SEEK_HOLE:
228 			/*
229 			 * There is a virtual hole at the end of the file, so
230 			 * as long as offset isn't i_size or larger, return
231 			 * i_size.
232 			 */
233 			if (offset >= inode->i_size) {
234 				retval = -ENXIO;
235 				goto out;
236 			}
237 			offset = inode->i_size;
238 			break;
239 	}
240 	retval = -EINVAL;
241 	if (offset >= 0 || unsigned_offsets(file)) {
242 		if (offset != file->f_pos) {
243 			file->f_pos = offset;
244 			file->f_version = 0;
245 		}
246 		retval = offset;
247 	}
248 out:
249 	mutex_unlock(&inode->i_mutex);
250 	return retval;
251 }
252 EXPORT_SYMBOL(default_llseek);
253 
254 loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
255 {
256 	loff_t (*fn)(struct file *, loff_t, int);
257 
258 	fn = no_llseek;
259 	if (file->f_mode & FMODE_LSEEK) {
260 		if (file->f_op->llseek)
261 			fn = file->f_op->llseek;
262 	}
263 	return fn(file, offset, whence);
264 }
265 EXPORT_SYMBOL(vfs_llseek);
266 
267 static inline struct fd fdget_pos(int fd)
268 {
269 	return __to_fd(__fdget_pos(fd));
270 }
271 
272 static inline void fdput_pos(struct fd f)
273 {
274 	if (f.flags & FDPUT_POS_UNLOCK)
275 		mutex_unlock(&f.file->f_pos_lock);
276 	fdput(f);
277 }
278 
279 SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
280 {
281 	off_t retval;
282 	struct fd f = fdget_pos(fd);
283 	if (!f.file)
284 		return -EBADF;
285 
286 	retval = -EINVAL;
287 	if (whence <= SEEK_MAX) {
288 		loff_t res = vfs_llseek(f.file, offset, whence);
289 		retval = res;
290 		if (res != (loff_t)retval)
291 			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
292 	}
293 	fdput_pos(f);
294 	return retval;
295 }
296 
297 #ifdef CONFIG_COMPAT
/* Compat entry point for lseek(2): forwards its arguments to sys_lseek(). */
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
	return sys_lseek(fd, offset, whence);
}
302 #endif
303 
304 #ifdef __ARCH_WANT_SYS_LLSEEK
305 SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
306 		unsigned long, offset_low, loff_t __user *, result,
307 		unsigned int, whence)
308 {
309 	int retval;
310 	struct fd f = fdget_pos(fd);
311 	loff_t offset;
312 
313 	if (!f.file)
314 		return -EBADF;
315 
316 	retval = -EINVAL;
317 	if (whence > SEEK_MAX)
318 		goto out_putf;
319 
320 	offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
321 			whence);
322 
323 	retval = (int)offset;
324 	if (offset >= 0) {
325 		retval = -EFAULT;
326 		if (!copy_to_user(result, &offset, sizeof(offset)))
327 			retval = 0;
328 	}
329 out_putf:
330 	fdput_pos(f);
331 	return retval;
332 }
333 #endif
334 
/*
 * rw_verify_area - sanity-check a read or write request before it is issued
 *
 * rw_verify_area doesn't like huge counts. We limit
 * them to something that fits in "int" so that others
 * won't have to do range checks all the time.
 *
 * @read_write:	READ selects the read checks; any other value selects the
 *		write checks
 * @file:	file the request targets
 * @ppos:	position the request starts at (only read, never written)
 * @count:	number of bytes requested
 *
 * Returns @count clamped to MAX_RW_COUNT on success.  Returns -EINVAL when
 * @count is negative as an ssize_t, or when the position is negative (or
 * pos + count wraps negative) on a file without unsigned offsets; -EOVERFLOW
 * when a negative position on an unsigned-offset file would wrap; otherwise
 * the error from locks_mandatory_area() or security_file_permission().
 */
int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
	struct inode *inode;
	loff_t pos;
	int retval = -EINVAL;

	inode = file_inode(file);
	if (unlikely((ssize_t) count < 0))
		return retval;
	pos = *ppos;
	if (unlikely(pos < 0)) {
		if (!unsigned_offsets(file))
			return retval;
		if (count >= -pos) /* both values are in 0..LLONG_MAX */
			return -EOVERFLOW;
	} else if (unlikely((loff_t) (pos + count) < 0)) {
		if (!unsigned_offsets(file))
			return retval;
	}

	/* Mandatory locking is only consulted when the inode has file locks. */
	if (unlikely(inode->i_flock && mandatory_lock(inode))) {
		retval = locks_mandatory_area(
			read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
			inode, file, pos, count);
		if (retval < 0)
			return retval;
	}
	retval = security_file_permission(file,
				read_write == READ ? MAY_READ : MAY_WRITE);
	if (retval)
		return retval;
	return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
}
373 
374 ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
375 {
376 	struct iovec iov = { .iov_base = buf, .iov_len = len };
377 	struct kiocb kiocb;
378 	ssize_t ret;
379 
380 	init_sync_kiocb(&kiocb, filp);
381 	kiocb.ki_pos = *ppos;
382 	kiocb.ki_nbytes = len;
383 
384 	ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
385 	if (-EIOCBQUEUED == ret)
386 		ret = wait_on_sync_kiocb(&kiocb);
387 	*ppos = kiocb.ki_pos;
388 	return ret;
389 }
390 
391 EXPORT_SYMBOL(do_sync_read);
392 
393 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
394 {
395 	ssize_t ret;
396 
397 	if (!(file->f_mode & FMODE_READ))
398 		return -EBADF;
399 	if (!file->f_op->read && !file->f_op->aio_read)
400 		return -EINVAL;
401 	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
402 		return -EFAULT;
403 
404 	ret = rw_verify_area(READ, file, pos, count);
405 	if (ret >= 0) {
406 		count = ret;
407 		if (file->f_op->read)
408 			ret = file->f_op->read(file, buf, count, pos);
409 		else
410 			ret = do_sync_read(file, buf, count, pos);
411 		if (ret > 0) {
412 			fsnotify_access(file);
413 			add_rchar(current, ret);
414 		}
415 		inc_syscr(current);
416 	}
417 
418 	return ret;
419 }
420 
421 EXPORT_SYMBOL(vfs_read);
422 
423 ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
424 {
425 	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
426 	struct kiocb kiocb;
427 	ssize_t ret;
428 
429 	init_sync_kiocb(&kiocb, filp);
430 	kiocb.ki_pos = *ppos;
431 	kiocb.ki_nbytes = len;
432 
433 	ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
434 	if (-EIOCBQUEUED == ret)
435 		ret = wait_on_sync_kiocb(&kiocb);
436 	*ppos = kiocb.ki_pos;
437 	return ret;
438 }
439 
440 EXPORT_SYMBOL(do_sync_write);
441 
442 ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
443 {
444 	mm_segment_t old_fs;
445 	const char __user *p;
446 	ssize_t ret;
447 
448 	if (!file->f_op->write && !file->f_op->aio_write)
449 		return -EINVAL;
450 
451 	old_fs = get_fs();
452 	set_fs(get_ds());
453 	p = (__force const char __user *)buf;
454 	if (count > MAX_RW_COUNT)
455 		count =  MAX_RW_COUNT;
456 	if (file->f_op->write)
457 		ret = file->f_op->write(file, p, count, pos);
458 	else
459 		ret = do_sync_write(file, p, count, pos);
460 	set_fs(old_fs);
461 	if (ret > 0) {
462 		fsnotify_modify(file);
463 		add_wchar(current, ret);
464 	}
465 	inc_syscw(current);
466 	return ret;
467 }
468 
469 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
470 {
471 	ssize_t ret;
472 
473 	if (!(file->f_mode & FMODE_WRITE))
474 		return -EBADF;
475 	if (!file->f_op->write && !file->f_op->aio_write)
476 		return -EINVAL;
477 	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
478 		return -EFAULT;
479 
480 	ret = rw_verify_area(WRITE, file, pos, count);
481 	if (ret >= 0) {
482 		count = ret;
483 		file_start_write(file);
484 		if (file->f_op->write)
485 			ret = file->f_op->write(file, buf, count, pos);
486 		else
487 			ret = do_sync_write(file, buf, count, pos);
488 		if (ret > 0) {
489 			fsnotify_modify(file);
490 			add_wchar(current, ret);
491 		}
492 		inc_syscw(current);
493 		file_end_write(file);
494 	}
495 
496 	return ret;
497 }
498 
499 EXPORT_SYMBOL(vfs_write);
500 
/* Return the current position (f_pos) of @file. */
static inline loff_t file_pos_read(struct file *file)
{
	return file->f_pos;
}
505 
/* Store @pos as the current position (f_pos) of @file. */
static inline void file_pos_write(struct file *file, loff_t pos)
{
	file->f_pos = pos;
}
510 
511 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
512 {
513 	struct fd f = fdget_pos(fd);
514 	ssize_t ret = -EBADF;
515 
516 	if (f.file) {
517 		loff_t pos = file_pos_read(f.file);
518 		ret = vfs_read(f.file, buf, count, &pos);
519 		if (ret >= 0)
520 			file_pos_write(f.file, pos);
521 		fdput_pos(f);
522 	}
523 	return ret;
524 }
525 
526 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
527 		size_t, count)
528 {
529 	struct fd f = fdget_pos(fd);
530 	ssize_t ret = -EBADF;
531 
532 	if (f.file) {
533 		loff_t pos = file_pos_read(f.file);
534 		ret = vfs_write(f.file, buf, count, &pos);
535 		if (ret >= 0)
536 			file_pos_write(f.file, pos);
537 		fdput_pos(f);
538 	}
539 
540 	return ret;
541 }
542 
543 SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
544 			size_t, count, loff_t, pos)
545 {
546 	struct fd f;
547 	ssize_t ret = -EBADF;
548 
549 	if (pos < 0)
550 		return -EINVAL;
551 
552 	f = fdget(fd);
553 	if (f.file) {
554 		ret = -ESPIPE;
555 		if (f.file->f_mode & FMODE_PREAD)
556 			ret = vfs_read(f.file, buf, count, &pos);
557 		fdput(f);
558 	}
559 
560 	return ret;
561 }
562 
563 SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
564 			 size_t, count, loff_t, pos)
565 {
566 	struct fd f;
567 	ssize_t ret = -EBADF;
568 
569 	if (pos < 0)
570 		return -EINVAL;
571 
572 	f = fdget(fd);
573 	if (f.file) {
574 		ret = -ESPIPE;
575 		if (f.file->f_mode & FMODE_PWRITE)
576 			ret = vfs_write(f.file, buf, count, &pos);
577 		fdput(f);
578 	}
579 
580 	return ret;
581 }
582 
/*
 * Trim an iovec array in place so that it describes at most @to bytes.
 * The segment in which the running total reaches @to has its length cut to
 * the remainder; later segments are left untouched.  Returns the number of
 * segments that remain in use (all of them when the total stays below @to).
 */
unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
{
	unsigned long idx;
	size_t consumed = 0;

	for (idx = 0; idx < nr_segs; idx++) {
		struct iovec *cur = &iov[idx];

		if (consumed + cur->iov_len >= to) {
			cur->iov_len = to - consumed;
			return idx + 1;
		}
		consumed += cur->iov_len;
	}
	return nr_segs;
}
602 EXPORT_SYMBOL(iov_shorten);
603 
604 static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
605 		unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
606 {
607 	struct kiocb kiocb;
608 	ssize_t ret;
609 
610 	init_sync_kiocb(&kiocb, filp);
611 	kiocb.ki_pos = *ppos;
612 	kiocb.ki_nbytes = len;
613 
614 	ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
615 	if (ret == -EIOCBQUEUED)
616 		ret = wait_on_sync_kiocb(&kiocb);
617 	*ppos = kiocb.ki_pos;
618 	return ret;
619 }
620 
621 /* Do it by hand, with file-ops */
622 static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
623 		unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
624 {
625 	struct iovec *vector = iov;
626 	ssize_t ret = 0;
627 
628 	while (nr_segs > 0) {
629 		void __user *base;
630 		size_t len;
631 		ssize_t nr;
632 
633 		base = vector->iov_base;
634 		len = vector->iov_len;
635 		vector++;
636 		nr_segs--;
637 
638 		nr = fn(filp, base, len, ppos);
639 
640 		if (nr < 0) {
641 			if (!ret)
642 				ret = nr;
643 			break;
644 		}
645 		ret += nr;
646 		if (nr != len)
647 			break;
648 	}
649 
650 	return ret;
651 }
652 
/* A write operation does a read from user space and vice versa */
#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)

/*
 * rw_copy_check_uvector - copy an iovec array from user space and validate it
 *
 * @type:	READ or WRITE to have each buffer checked with access_ok();
 *		a negative value skips that check
 * @uvector:	user-space iovec array
 * @nr_segs:	number of entries in @uvector
 * @fast_segs:	capacity of @fast_pointer
 * @fast_pointer: caller-provided array used when @nr_segs fits in it
 * @ret_pointer: receives the array actually used
 *
 * Returns the total byte length of all segments (capped at MAX_RW_COUNT by
 * shortening the segment that crosses the limit and zeroing later ones),
 * 0 for zero segments, or -EINVAL / -ENOMEM / -EFAULT.
 *
 * *@ret_pointer is set on every path, including errors.  When it differs
 * from @fast_pointer the array was kmalloc()ed (or is NULL after a failed
 * allocation) and the caller is expected to kfree() it.
 */
ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
			      unsigned long nr_segs, unsigned long fast_segs,
			      struct iovec *fast_pointer,
			      struct iovec **ret_pointer)
{
	unsigned long seg;
	ssize_t ret;
	struct iovec *iov = fast_pointer;

	/*
	 * SuS says "The readv() function *may* fail if the iovcnt argument
	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
	 * traditionally returned zero for zero segments, so...
	 */
	if (nr_segs == 0) {
		ret = 0;
		goto out;
	}

	/*
	 * First get the "struct iovec" from user memory and
	 * verify all the pointers
	 */
	if (nr_segs > UIO_MAXIOV) {
		ret = -EINVAL;
		goto out;
	}
	if (nr_segs > fast_segs) {
		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
		if (iov == NULL) {
			ret = -ENOMEM;
			goto out;
		}
	}
	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
		ret = -EFAULT;
		goto out;
	}

	/*
	 * According to the Single Unix Specification we should return EINVAL
	 * if an element length is < 0 when cast to ssize_t or if the
	 * total length would overflow the ssize_t return value of the
	 * system call.
	 *
	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
	 * overflow case.
	 */
	ret = 0;
	for (seg = 0; seg < nr_segs; seg++) {
		void __user *buf = iov[seg].iov_base;
		ssize_t len = (ssize_t)iov[seg].iov_len;

		/* see if we're about to use an invalid len or if
		 * it's about to overflow ssize_t */
		if (len < 0) {
			ret = -EINVAL;
			goto out;
		}
		if (type >= 0
		    && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
			ret = -EFAULT;
			goto out;
		}
		/* Shorten this segment so the running total stops at the cap. */
		if (len > MAX_RW_COUNT - ret) {
			len = MAX_RW_COUNT - ret;
			iov[seg].iov_len = len;
		}
		ret += len;
	}
out:
	*ret_pointer = iov;
	return ret;
}
730 
/*
 * do_readv_writev - common implementation behind vfs_readv() / vfs_writev()
 *
 * Copies and validates the user iovec, checks the request with
 * rw_verify_area(), then issues it through the vectored hook (->aio_read /
 * ->aio_write) when one exists, or segment by segment through ->read /
 * ->write otherwise.  Writes are bracketed by file_start_write() and
 * file_end_write().
 *
 * Returns the hook's result, or the zero/negative result of the iovec copy,
 * or the error from rw_verify_area().
 */
static ssize_t do_readv_writev(int type, struct file *file,
			       const struct iovec __user * uvector,
			       unsigned long nr_segs, loff_t *pos)
{
	size_t tot_len;
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	ssize_t ret;
	io_fn_t fn;
	iov_fn_t fnv;

	ret = rw_copy_check_uvector(type, uvector, nr_segs,
				    ARRAY_SIZE(iovstack), iovstack, &iov);
	if (ret <= 0)
		goto out;

	tot_len = ret;
	ret = rw_verify_area(type, file, pos, tot_len);
	if (ret < 0)
		goto out;

	fnv = NULL;
	if (type == READ) {
		fn = file->f_op->read;
		fnv = file->f_op->aio_read;
	} else {
		/* ->write takes a const buffer; cast it to the shared type. */
		fn = (io_fn_t)file->f_op->write;
		fnv = file->f_op->aio_write;
		file_start_write(file);
	}

	if (fnv)
		ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
						pos, fnv);
	else
		ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);

	if (type != READ)
		file_end_write(file);

out:
	/* The iovec only needs freeing when it did not fit on the stack. */
	if (iov != iovstack)
		kfree(iov);
	/*
	 * Reads notify for ret >= 0, writes only for ret > 0.  This is the
	 * shared exit path, so it also runs after the early "goto out"s.
	 */
	if ((ret + (type == READ)) > 0) {
		if (type == READ)
			fsnotify_access(file);
		else
			fsnotify_modify(file);
	}
	return ret;
}
782 
783 ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
784 		  unsigned long vlen, loff_t *pos)
785 {
786 	if (!(file->f_mode & FMODE_READ))
787 		return -EBADF;
788 	if (!file->f_op->aio_read && !file->f_op->read)
789 		return -EINVAL;
790 
791 	return do_readv_writev(READ, file, vec, vlen, pos);
792 }
793 
794 EXPORT_SYMBOL(vfs_readv);
795 
796 ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
797 		   unsigned long vlen, loff_t *pos)
798 {
799 	if (!(file->f_mode & FMODE_WRITE))
800 		return -EBADF;
801 	if (!file->f_op->aio_write && !file->f_op->write)
802 		return -EINVAL;
803 
804 	return do_readv_writev(WRITE, file, vec, vlen, pos);
805 }
806 
807 EXPORT_SYMBOL(vfs_writev);
808 
809 SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
810 		unsigned long, vlen)
811 {
812 	struct fd f = fdget_pos(fd);
813 	ssize_t ret = -EBADF;
814 
815 	if (f.file) {
816 		loff_t pos = file_pos_read(f.file);
817 		ret = vfs_readv(f.file, vec, vlen, &pos);
818 		if (ret >= 0)
819 			file_pos_write(f.file, pos);
820 		fdput_pos(f);
821 	}
822 
823 	if (ret > 0)
824 		add_rchar(current, ret);
825 	inc_syscr(current);
826 	return ret;
827 }
828 
829 SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
830 		unsigned long, vlen)
831 {
832 	struct fd f = fdget_pos(fd);
833 	ssize_t ret = -EBADF;
834 
835 	if (f.file) {
836 		loff_t pos = file_pos_read(f.file);
837 		ret = vfs_writev(f.file, vec, vlen, &pos);
838 		if (ret >= 0)
839 			file_pos_write(f.file, pos);
840 		fdput_pos(f);
841 	}
842 
843 	if (ret > 0)
844 		add_wchar(current, ret);
845 	inc_syscw(current);
846 	return ret;
847 }
848 
849 static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
850 {
851 #define HALF_LONG_BITS (BITS_PER_LONG / 2)
852 	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
853 }
854 
855 SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
856 		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
857 {
858 	loff_t pos = pos_from_hilo(pos_h, pos_l);
859 	struct fd f;
860 	ssize_t ret = -EBADF;
861 
862 	if (pos < 0)
863 		return -EINVAL;
864 
865 	f = fdget(fd);
866 	if (f.file) {
867 		ret = -ESPIPE;
868 		if (f.file->f_mode & FMODE_PREAD)
869 			ret = vfs_readv(f.file, vec, vlen, &pos);
870 		fdput(f);
871 	}
872 
873 	if (ret > 0)
874 		add_rchar(current, ret);
875 	inc_syscr(current);
876 	return ret;
877 }
878 
879 SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
880 		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
881 {
882 	loff_t pos = pos_from_hilo(pos_h, pos_l);
883 	struct fd f;
884 	ssize_t ret = -EBADF;
885 
886 	if (pos < 0)
887 		return -EINVAL;
888 
889 	f = fdget(fd);
890 	if (f.file) {
891 		ret = -ESPIPE;
892 		if (f.file->f_mode & FMODE_PWRITE)
893 			ret = vfs_writev(f.file, vec, vlen, &pos);
894 		fdput(f);
895 	}
896 
897 	if (ret > 0)
898 		add_wchar(current, ret);
899 	inc_syscw(current);
900 	return ret;
901 }
902 
903 #ifdef CONFIG_COMPAT
904 
/*
 * compat_do_readv_writev - compat counterpart of do_readv_writev()
 *
 * Same flow as do_readv_writev(), except that the user iovec is a
 * struct compat_iovec array and is copied and validated with
 * compat_rw_copy_check_uvector().
 *
 * Returns the hook's result, or the zero/negative result of the iovec copy,
 * or the error from rw_verify_area().
 */
static ssize_t compat_do_readv_writev(int type, struct file *file,
			       const struct compat_iovec __user *uvector,
			       unsigned long nr_segs, loff_t *pos)
{
	compat_ssize_t tot_len;
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	ssize_t ret;
	io_fn_t fn;
	iov_fn_t fnv;

	ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
					       UIO_FASTIOV, iovstack, &iov);
	if (ret <= 0)
		goto out;

	tot_len = ret;
	ret = rw_verify_area(type, file, pos, tot_len);
	if (ret < 0)
		goto out;

	fnv = NULL;
	if (type == READ) {
		fn = file->f_op->read;
		fnv = file->f_op->aio_read;
	} else {
		/* ->write takes a const buffer; cast it to the shared type. */
		fn = (io_fn_t)file->f_op->write;
		fnv = file->f_op->aio_write;
		file_start_write(file);
	}

	if (fnv)
		ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
						pos, fnv);
	else
		ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);

	if (type != READ)
		file_end_write(file);

out:
	/* The iovec only needs freeing when it did not fit on the stack. */
	if (iov != iovstack)
		kfree(iov);
	/*
	 * Reads notify for ret >= 0, writes only for ret > 0.  This is the
	 * shared exit path, so it also runs after the early "goto out"s.
	 */
	if ((ret + (type == READ)) > 0) {
		if (type == READ)
			fsnotify_access(file);
		else
			fsnotify_modify(file);
	}
	return ret;
}
956 
957 static size_t compat_readv(struct file *file,
958 			   const struct compat_iovec __user *vec,
959 			   unsigned long vlen, loff_t *pos)
960 {
961 	ssize_t ret = -EBADF;
962 
963 	if (!(file->f_mode & FMODE_READ))
964 		goto out;
965 
966 	ret = -EINVAL;
967 	if (!file->f_op->aio_read && !file->f_op->read)
968 		goto out;
969 
970 	ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
971 
972 out:
973 	if (ret > 0)
974 		add_rchar(current, ret);
975 	inc_syscr(current);
976 	return ret;
977 }
978 
979 COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
980 		const struct compat_iovec __user *,vec,
981 		compat_ulong_t, vlen)
982 {
983 	struct fd f = fdget_pos(fd);
984 	ssize_t ret;
985 	loff_t pos;
986 
987 	if (!f.file)
988 		return -EBADF;
989 	pos = f.file->f_pos;
990 	ret = compat_readv(f.file, vec, vlen, &pos);
991 	if (ret >= 0)
992 		f.file->f_pos = pos;
993 	fdput_pos(f);
994 	return ret;
995 }
996 
997 static long __compat_sys_preadv64(unsigned long fd,
998 				  const struct compat_iovec __user *vec,
999 				  unsigned long vlen, loff_t pos)
1000 {
1001 	struct fd f;
1002 	ssize_t ret;
1003 
1004 	if (pos < 0)
1005 		return -EINVAL;
1006 	f = fdget(fd);
1007 	if (!f.file)
1008 		return -EBADF;
1009 	ret = -ESPIPE;
1010 	if (f.file->f_mode & FMODE_PREAD)
1011 		ret = compat_readv(f.file, vec, vlen, &pos);
1012 	fdput(f);
1013 	return ret;
1014 }
1015 
1016 #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
/*
 * Compat preadv64: the position arrives as a single loff_t, so the call is
 * forwarded unchanged to __compat_sys_preadv64().
 */
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos)
{
	return __compat_sys_preadv64(fd, vec, vlen, pos);
}
1023 #endif
1024 
1025 COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
1026 		const struct compat_iovec __user *,vec,
1027 		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1028 {
1029 	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1030 
1031 	return __compat_sys_preadv64(fd, vec, vlen, pos);
1032 }
1033 
1034 static size_t compat_writev(struct file *file,
1035 			    const struct compat_iovec __user *vec,
1036 			    unsigned long vlen, loff_t *pos)
1037 {
1038 	ssize_t ret = -EBADF;
1039 
1040 	if (!(file->f_mode & FMODE_WRITE))
1041 		goto out;
1042 
1043 	ret = -EINVAL;
1044 	if (!file->f_op->aio_write && !file->f_op->write)
1045 		goto out;
1046 
1047 	ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
1048 
1049 out:
1050 	if (ret > 0)
1051 		add_wchar(current, ret);
1052 	inc_syscw(current);
1053 	return ret;
1054 }
1055 
1056 COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
1057 		const struct compat_iovec __user *, vec,
1058 		compat_ulong_t, vlen)
1059 {
1060 	struct fd f = fdget_pos(fd);
1061 	ssize_t ret;
1062 	loff_t pos;
1063 
1064 	if (!f.file)
1065 		return -EBADF;
1066 	pos = f.file->f_pos;
1067 	ret = compat_writev(f.file, vec, vlen, &pos);
1068 	if (ret >= 0)
1069 		f.file->f_pos = pos;
1070 	fdput_pos(f);
1071 	return ret;
1072 }
1073 
1074 static long __compat_sys_pwritev64(unsigned long fd,
1075 				   const struct compat_iovec __user *vec,
1076 				   unsigned long vlen, loff_t pos)
1077 {
1078 	struct fd f;
1079 	ssize_t ret;
1080 
1081 	if (pos < 0)
1082 		return -EINVAL;
1083 	f = fdget(fd);
1084 	if (!f.file)
1085 		return -EBADF;
1086 	ret = -ESPIPE;
1087 	if (f.file->f_mode & FMODE_PWRITE)
1088 		ret = compat_writev(f.file, vec, vlen, &pos);
1089 	fdput(f);
1090 	return ret;
1091 }
1092 
1093 #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
/*
 * Compat pwritev64: the position arrives as a single loff_t, so the call is
 * forwarded unchanged to __compat_sys_pwritev64().
 */
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
		const struct compat_iovec __user *,vec,
		unsigned long, vlen, loff_t, pos)
{
	return __compat_sys_pwritev64(fd, vec, vlen, pos);
}
1100 #endif
1101 
1102 COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
1103 		const struct compat_iovec __user *,vec,
1104 		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
1105 {
1106 	loff_t pos = ((loff_t)pos_high << 32) | pos_low;
1107 
1108 	return __compat_sys_pwritev64(fd, vec, vlen, pos);
1109 }
1110 #endif
1111 
/*
 * do_sendfile - common implementation of the sendfile system calls
 *
 * @out_fd:	descriptor to write to; its f_pos is used as the output
 *		position and updated after a successful transfer
 * @in_fd:	descriptor to read from
 * @ppos:	if non-NULL, the input position to use and update (the input
 *		file must then have FMODE_PREAD); if NULL, the input file's
 *		f_pos is used and updated instead
 * @count:	number of bytes requested
 * @max:	upper bound for the input position; 0 selects the smaller
 *		s_maxbytes of the two files' superblocks
 *
 * Returns the result of do_splice_direct(), or -EBADF, -ESPIPE, -EOVERFLOW,
 * or an error from rw_verify_area().
 */
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
		  	   size_t count, loff_t max)
{
	struct fd in, out;
	struct inode *in_inode, *out_inode;
	loff_t pos;
	loff_t out_pos;
	ssize_t retval;
	int fl;

	/*
	 * Get input file, and verify that it is ok..
	 */
	retval = -EBADF;
	in = fdget(in_fd);
	if (!in.file)
		goto out;
	if (!(in.file->f_mode & FMODE_READ))
		goto fput_in;
	retval = -ESPIPE;
	if (!ppos) {
		pos = in.file->f_pos;
	} else {
		pos = *ppos;
		if (!(in.file->f_mode & FMODE_PREAD))
			goto fput_in;
	}
	retval = rw_verify_area(READ, in.file, &pos, count);
	if (retval < 0)
		goto fput_in;
	count = retval;

	/*
	 * Get output file, and verify that it is ok..
	 */
	retval = -EBADF;
	out = fdget(out_fd);
	if (!out.file)
		goto fput_in;
	if (!(out.file->f_mode & FMODE_WRITE))
		goto fput_out;
	/*
	 * NOTE(review): this -EINVAL is overwritten by rw_verify_area() below
	 * before any exit path can read it.
	 */
	retval = -EINVAL;
	in_inode = file_inode(in.file);
	out_inode = file_inode(out.file);
	out_pos = out.file->f_pos;
	retval = rw_verify_area(WRITE, out.file, &out_pos, count);
	if (retval < 0)
		goto fput_out;
	count = retval;

	if (!max)
		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);

	/* Trim the request so the input position cannot run past max. */
	if (unlikely(pos + count > max)) {
		retval = -EOVERFLOW;
		if (pos >= max)
			goto fput_out;
		count = max - pos;
	}

	fl = 0;
#if 0
	/*
	 * We need to debate whether we can enable this or not. The
	 * man page documents EAGAIN return for the output at least,
	 * and the application is arguably buggy if it doesn't expect
	 * EAGAIN on a non-blocking file descriptor.
	 */
	if (in.file->f_flags & O_NONBLOCK)
		fl = SPLICE_F_NONBLOCK;
#endif
	file_start_write(out.file);
	retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
	file_end_write(out.file);

	/* Positions and accounting are only updated when data was moved. */
	if (retval > 0) {
		add_rchar(current, retval);
		add_wchar(current, retval);
		fsnotify_access(in.file);
		fsnotify_modify(out.file);
		out.file->f_pos = out_pos;
		if (ppos)
			*ppos = pos;
		else
			in.file->f_pos = pos;
	}

	inc_syscr(current);
	inc_syscw(current);
	if (pos > max)
		retval = -EOVERFLOW;

fput_out:
	fdput(out);
fput_in:
	fdput(in);
out:
	return retval;
}
1211 
1212 SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
1213 {
1214 	loff_t pos;
1215 	off_t off;
1216 	ssize_t ret;
1217 
1218 	if (offset) {
1219 		if (unlikely(get_user(off, offset)))
1220 			return -EFAULT;
1221 		pos = off;
1222 		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1223 		if (unlikely(put_user(pos, offset)))
1224 			return -EFAULT;
1225 		return ret;
1226 	}
1227 
1228 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1229 }
1230 
1231 SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
1232 {
1233 	loff_t pos;
1234 	ssize_t ret;
1235 
1236 	if (offset) {
1237 		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1238 			return -EFAULT;
1239 		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1240 		if (unlikely(put_user(pos, offset)))
1241 			return -EFAULT;
1242 		return ret;
1243 	}
1244 
1245 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1246 }
1247 
1248 #ifdef CONFIG_COMPAT
1249 COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
1250 		compat_off_t __user *, offset, compat_size_t, count)
1251 {
1252 	loff_t pos;
1253 	off_t off;
1254 	ssize_t ret;
1255 
1256 	if (offset) {
1257 		if (unlikely(get_user(off, offset)))
1258 			return -EFAULT;
1259 		pos = off;
1260 		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
1261 		if (unlikely(put_user(pos, offset)))
1262 			return -EFAULT;
1263 		return ret;
1264 	}
1265 
1266 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1267 }
1268 
1269 COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
1270 		compat_loff_t __user *, offset, compat_size_t, count)
1271 {
1272 	loff_t pos;
1273 	ssize_t ret;
1274 
1275 	if (offset) {
1276 		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1277 			return -EFAULT;
1278 		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
1279 		if (unlikely(put_user(pos, offset)))
1280 			return -EFAULT;
1281 		return ret;
1282 	}
1283 
1284 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
1285 }
1286 #endif
1287