xref: /openbmc/linux/fs/read_write.c (revision 81d67439)
1 /*
2  *  linux/fs/read_write.c
3  *
4  *  Copyright (C) 1991, 1992  Linus Torvalds
5  */
6 
7 #include <linux/slab.h>
8 #include <linux/stat.h>
9 #include <linux/fcntl.h>
10 #include <linux/file.h>
11 #include <linux/uio.h>
12 #include <linux/fsnotify.h>
13 #include <linux/security.h>
14 #include <linux/module.h>
15 #include <linux/syscalls.h>
16 #include <linux/pagemap.h>
17 #include <linux/splice.h>
18 #include "read_write.h"
19 
20 #include <asm/uaccess.h>
21 #include <asm/unistd.h>
22 
23 const struct file_operations generic_ro_fops = {
24 	.llseek		= generic_file_llseek,
25 	.read		= do_sync_read,
26 	.aio_read	= generic_file_aio_read,
27 	.mmap		= generic_file_readonly_mmap,
28 	.splice_read	= generic_file_splice_read,
29 };
30 
31 EXPORT_SYMBOL(generic_ro_fops);
32 
33 static inline int unsigned_offsets(struct file *file)
34 {
35 	return file->f_mode & FMODE_UNSIGNED_OFFSET;
36 }
37 
38 /**
39  * generic_file_llseek_unlocked - lockless generic llseek implementation
40  * @file:	file structure to seek on
41  * @offset:	file offset to seek to
42  * @origin:	type of seek
43  *
44  * Updates the file offset to the value specified by @offset and @origin.
45  * Locking must be provided by the caller.
46  */
47 loff_t
48 generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
49 {
50 	struct inode *inode = file->f_mapping->host;
51 
52 	switch (origin) {
53 	case SEEK_END:
54 		offset += inode->i_size;
55 		break;
56 	case SEEK_CUR:
57 		/*
58 		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
59 		 * position-querying operation.  Avoid rewriting the "same"
60 		 * f_pos value back to the file because a concurrent read(),
61 		 * write() or lseek() might have altered it
62 		 */
63 		if (offset == 0)
64 			return file->f_pos;
65 		offset += file->f_pos;
66 		break;
67 	case SEEK_DATA:
68 		/*
69 		 * In the generic case the entire file is data, so as long as
70 		 * offset isn't at the end of the file then the offset is data.
71 		 */
72 		if (offset >= inode->i_size)
73 			return -ENXIO;
74 		break;
75 	case SEEK_HOLE:
76 		/*
77 		 * There is a virtual hole at the end of the file, so as long as
78 		 * offset isn't i_size or larger, return i_size.
79 		 */
80 		if (offset >= inode->i_size)
81 			return -ENXIO;
82 		offset = inode->i_size;
83 		break;
84 	}
85 
86 	if (offset < 0 && !unsigned_offsets(file))
87 		return -EINVAL;
88 	if (offset > inode->i_sb->s_maxbytes)
89 		return -EINVAL;
90 
91 	/* Special lock needed here? */
92 	if (offset != file->f_pos) {
93 		file->f_pos = offset;
94 		file->f_version = 0;
95 	}
96 
97 	return offset;
98 }
99 EXPORT_SYMBOL(generic_file_llseek_unlocked);
100 
101 /**
102  * generic_file_llseek - generic llseek implementation for regular files
103  * @file:	file structure to seek on
104  * @offset:	file offset to seek to
105  * @origin:	type of seek
106  *
107  * This is a generic implemenation of ->llseek useable for all normal local
108  * filesystems.  It just updates the file offset to the value specified by
109  * @offset and @origin under i_mutex.
110  */
111 loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
112 {
113 	loff_t rval;
114 
115 	mutex_lock(&file->f_dentry->d_inode->i_mutex);
116 	rval = generic_file_llseek_unlocked(file, offset, origin);
117 	mutex_unlock(&file->f_dentry->d_inode->i_mutex);
118 
119 	return rval;
120 }
121 EXPORT_SYMBOL(generic_file_llseek);
122 
123 /**
124  * noop_llseek - No Operation Performed llseek implementation
125  * @file:	file structure to seek on
126  * @offset:	file offset to seek to
127  * @origin:	type of seek
128  *
129  * This is an implementation of ->llseek useable for the rare special case when
130  * userspace expects the seek to succeed but the (device) file is actually not
131  * able to perform the seek. In this case you use noop_llseek() instead of
132  * falling back to the default implementation of ->llseek.
133  */
134 loff_t noop_llseek(struct file *file, loff_t offset, int origin)
135 {
136 	return file->f_pos;
137 }
138 EXPORT_SYMBOL(noop_llseek);
139 
140 loff_t no_llseek(struct file *file, loff_t offset, int origin)
141 {
142 	return -ESPIPE;
143 }
144 EXPORT_SYMBOL(no_llseek);
145 
146 loff_t default_llseek(struct file *file, loff_t offset, int origin)
147 {
148 	struct inode *inode = file->f_path.dentry->d_inode;
149 	loff_t retval;
150 
151 	mutex_lock(&inode->i_mutex);
152 	switch (origin) {
153 		case SEEK_END:
154 			offset += i_size_read(inode);
155 			break;
156 		case SEEK_CUR:
157 			if (offset == 0) {
158 				retval = file->f_pos;
159 				goto out;
160 			}
161 			offset += file->f_pos;
162 			break;
163 		case SEEK_DATA:
164 			/*
165 			 * In the generic case the entire file is data, so as
166 			 * long as offset isn't at the end of the file then the
167 			 * offset is data.
168 			 */
169 			if (offset >= inode->i_size)
170 				return -ENXIO;
171 			break;
172 		case SEEK_HOLE:
173 			/*
174 			 * There is a virtual hole at the end of the file, so
175 			 * as long as offset isn't i_size or larger, return
176 			 * i_size.
177 			 */
178 			if (offset >= inode->i_size)
179 				return -ENXIO;
180 			offset = inode->i_size;
181 			break;
182 	}
183 	retval = -EINVAL;
184 	if (offset >= 0 || unsigned_offsets(file)) {
185 		if (offset != file->f_pos) {
186 			file->f_pos = offset;
187 			file->f_version = 0;
188 		}
189 		retval = offset;
190 	}
191 out:
192 	mutex_unlock(&inode->i_mutex);
193 	return retval;
194 }
195 EXPORT_SYMBOL(default_llseek);
196 
197 loff_t vfs_llseek(struct file *file, loff_t offset, int origin)
198 {
199 	loff_t (*fn)(struct file *, loff_t, int);
200 
201 	fn = no_llseek;
202 	if (file->f_mode & FMODE_LSEEK) {
203 		if (file->f_op && file->f_op->llseek)
204 			fn = file->f_op->llseek;
205 	}
206 	return fn(file, offset, origin);
207 }
208 EXPORT_SYMBOL(vfs_llseek);
209 
210 SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin)
211 {
212 	off_t retval;
213 	struct file * file;
214 	int fput_needed;
215 
216 	retval = -EBADF;
217 	file = fget_light(fd, &fput_needed);
218 	if (!file)
219 		goto bad;
220 
221 	retval = -EINVAL;
222 	if (origin <= SEEK_MAX) {
223 		loff_t res = vfs_llseek(file, offset, origin);
224 		retval = res;
225 		if (res != (loff_t)retval)
226 			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
227 	}
228 	fput_light(file, fput_needed);
229 bad:
230 	return retval;
231 }
232 
#ifdef __ARCH_WANT_SYS_LLSEEK
/*
 * llseek system call: the target offset is passed as two halves and the
 * resulting position is copied out through @result.  Returns 0 on
 * success, the (negative) error from vfs_llseek(), -EINVAL for an origin
 * above SEEK_MAX, -EBADF for a bad descriptor or -EFAULT when the copy
 * to user space fails.
 */
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
		unsigned long, offset_low, loff_t __user *, result,
		unsigned int, origin)
{
	struct file *file;
	loff_t offset;
	int retval;
	int fput_needed;

	file = fget_light(fd, &fput_needed);
	if (!file)
		return -EBADF;

	if (origin > SEEK_MAX) {
		retval = -EINVAL;
		goto out_putf;
	}

	offset = vfs_llseek(file, ((loff_t) offset_high << 32) | offset_low,
			origin);

	if (offset < 0)
		retval = (int)offset;
	else if (copy_to_user(result, &offset, sizeof(offset)))
		retval = -EFAULT;
	else
		retval = 0;
out_putf:
	fput_light(file, fput_needed);
	return retval;
}
#endif
267 
268 
269 /*
270  * rw_verify_area doesn't like huge counts. We limit
271  * them to something that fits in "int" so that others
272  * won't have to do range checks all the time.
273  */
274 int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count)
275 {
276 	struct inode *inode;
277 	loff_t pos;
278 	int retval = -EINVAL;
279 
280 	inode = file->f_path.dentry->d_inode;
281 	if (unlikely((ssize_t) count < 0))
282 		return retval;
283 	pos = *ppos;
284 	if (unlikely(pos < 0)) {
285 		if (!unsigned_offsets(file))
286 			return retval;
287 		if (count >= -pos) /* both values are in 0..LLONG_MAX */
288 			return -EOVERFLOW;
289 	} else if (unlikely((loff_t) (pos + count) < 0)) {
290 		if (!unsigned_offsets(file))
291 			return retval;
292 	}
293 
294 	if (unlikely(inode->i_flock && mandatory_lock(inode))) {
295 		retval = locks_mandatory_area(
296 			read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
297 			inode, file, pos, count);
298 		if (retval < 0)
299 			return retval;
300 	}
301 	retval = security_file_permission(file,
302 				read_write == READ ? MAY_READ : MAY_WRITE);
303 	if (retval)
304 		return retval;
305 	return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
306 }
307 
308 static void wait_on_retry_sync_kiocb(struct kiocb *iocb)
309 {
310 	set_current_state(TASK_UNINTERRUPTIBLE);
311 	if (!kiocbIsKicked(iocb))
312 		schedule();
313 	else
314 		kiocbClearKicked(iocb);
315 	__set_current_state(TASK_RUNNING);
316 }
317 
318 ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
319 {
320 	struct iovec iov = { .iov_base = buf, .iov_len = len };
321 	struct kiocb kiocb;
322 	ssize_t ret;
323 
324 	init_sync_kiocb(&kiocb, filp);
325 	kiocb.ki_pos = *ppos;
326 	kiocb.ki_left = len;
327 	kiocb.ki_nbytes = len;
328 
329 	for (;;) {
330 		ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
331 		if (ret != -EIOCBRETRY)
332 			break;
333 		wait_on_retry_sync_kiocb(&kiocb);
334 	}
335 
336 	if (-EIOCBQUEUED == ret)
337 		ret = wait_on_sync_kiocb(&kiocb);
338 	*ppos = kiocb.ki_pos;
339 	return ret;
340 }
341 
342 EXPORT_SYMBOL(do_sync_read);
343 
344 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
345 {
346 	ssize_t ret;
347 
348 	if (!(file->f_mode & FMODE_READ))
349 		return -EBADF;
350 	if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read))
351 		return -EINVAL;
352 	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
353 		return -EFAULT;
354 
355 	ret = rw_verify_area(READ, file, pos, count);
356 	if (ret >= 0) {
357 		count = ret;
358 		if (file->f_op->read)
359 			ret = file->f_op->read(file, buf, count, pos);
360 		else
361 			ret = do_sync_read(file, buf, count, pos);
362 		if (ret > 0) {
363 			fsnotify_access(file);
364 			add_rchar(current, ret);
365 		}
366 		inc_syscr(current);
367 	}
368 
369 	return ret;
370 }
371 
372 EXPORT_SYMBOL(vfs_read);
373 
374 ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
375 {
376 	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
377 	struct kiocb kiocb;
378 	ssize_t ret;
379 
380 	init_sync_kiocb(&kiocb, filp);
381 	kiocb.ki_pos = *ppos;
382 	kiocb.ki_left = len;
383 	kiocb.ki_nbytes = len;
384 
385 	for (;;) {
386 		ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
387 		if (ret != -EIOCBRETRY)
388 			break;
389 		wait_on_retry_sync_kiocb(&kiocb);
390 	}
391 
392 	if (-EIOCBQUEUED == ret)
393 		ret = wait_on_sync_kiocb(&kiocb);
394 	*ppos = kiocb.ki_pos;
395 	return ret;
396 }
397 
398 EXPORT_SYMBOL(do_sync_write);
399 
400 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
401 {
402 	ssize_t ret;
403 
404 	if (!(file->f_mode & FMODE_WRITE))
405 		return -EBADF;
406 	if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write))
407 		return -EINVAL;
408 	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
409 		return -EFAULT;
410 
411 	ret = rw_verify_area(WRITE, file, pos, count);
412 	if (ret >= 0) {
413 		count = ret;
414 		if (file->f_op->write)
415 			ret = file->f_op->write(file, buf, count, pos);
416 		else
417 			ret = do_sync_write(file, buf, count, pos);
418 		if (ret > 0) {
419 			fsnotify_modify(file);
420 			add_wchar(current, ret);
421 		}
422 		inc_syscw(current);
423 	}
424 
425 	return ret;
426 }
427 
428 EXPORT_SYMBOL(vfs_write);
429 
430 static inline loff_t file_pos_read(struct file *file)
431 {
432 	return file->f_pos;
433 }
434 
435 static inline void file_pos_write(struct file *file, loff_t pos)
436 {
437 	file->f_pos = pos;
438 }
439 
440 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
441 {
442 	struct file *file;
443 	ssize_t ret = -EBADF;
444 	int fput_needed;
445 
446 	file = fget_light(fd, &fput_needed);
447 	if (file) {
448 		loff_t pos = file_pos_read(file);
449 		ret = vfs_read(file, buf, count, &pos);
450 		file_pos_write(file, pos);
451 		fput_light(file, fput_needed);
452 	}
453 
454 	return ret;
455 }
456 
457 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
458 		size_t, count)
459 {
460 	struct file *file;
461 	ssize_t ret = -EBADF;
462 	int fput_needed;
463 
464 	file = fget_light(fd, &fput_needed);
465 	if (file) {
466 		loff_t pos = file_pos_read(file);
467 		ret = vfs_write(file, buf, count, &pos);
468 		file_pos_write(file, pos);
469 		fput_light(file, fput_needed);
470 	}
471 
472 	return ret;
473 }
474 
475 SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf,
476 			size_t count, loff_t pos)
477 {
478 	struct file *file;
479 	ssize_t ret = -EBADF;
480 	int fput_needed;
481 
482 	if (pos < 0)
483 		return -EINVAL;
484 
485 	file = fget_light(fd, &fput_needed);
486 	if (file) {
487 		ret = -ESPIPE;
488 		if (file->f_mode & FMODE_PREAD)
489 			ret = vfs_read(file, buf, count, &pos);
490 		fput_light(file, fput_needed);
491 	}
492 
493 	return ret;
494 }
495 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
496 asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos)
497 {
498 	return SYSC_pread64((unsigned int) fd, (char __user *) buf,
499 			    (size_t) count, pos);
500 }
501 SYSCALL_ALIAS(sys_pread64, SyS_pread64);
502 #endif
503 
504 SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,
505 			 size_t count, loff_t pos)
506 {
507 	struct file *file;
508 	ssize_t ret = -EBADF;
509 	int fput_needed;
510 
511 	if (pos < 0)
512 		return -EINVAL;
513 
514 	file = fget_light(fd, &fput_needed);
515 	if (file) {
516 		ret = -ESPIPE;
517 		if (file->f_mode & FMODE_PWRITE)
518 			ret = vfs_write(file, buf, count, &pos);
519 		fput_light(file, fput_needed);
520 	}
521 
522 	return ret;
523 }
524 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
525 asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos)
526 {
527 	return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf,
528 			     (size_t) count, pos);
529 }
530 SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64);
531 #endif
532 
/*
 * Reduce an iovec's length in-place so that it describes at most @to
 * bytes.  The segment in which the running total reaches @to is trimmed
 * and the walk stops there.
 *
 * Returns the resulting number of segments: the count up to and
 * including the trimmed segment, or @nr_segs when the whole vector is
 * shorter than @to.
 */
unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
{
	unsigned long idx;
	size_t used = 0;

	for (idx = 0; idx < nr_segs; idx++) {
		struct iovec *cur = &iov[idx];

		if (used + cur->iov_len >= to) {
			cur->iov_len = to - used;
			return idx + 1;
		}
		used += cur->iov_len;
	}
	return idx;
}
552 EXPORT_SYMBOL(iov_shorten);
553 
554 ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
555 		unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
556 {
557 	struct kiocb kiocb;
558 	ssize_t ret;
559 
560 	init_sync_kiocb(&kiocb, filp);
561 	kiocb.ki_pos = *ppos;
562 	kiocb.ki_left = len;
563 	kiocb.ki_nbytes = len;
564 
565 	for (;;) {
566 		ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
567 		if (ret != -EIOCBRETRY)
568 			break;
569 		wait_on_retry_sync_kiocb(&kiocb);
570 	}
571 
572 	if (ret == -EIOCBQUEUED)
573 		ret = wait_on_sync_kiocb(&kiocb);
574 	*ppos = kiocb.ki_pos;
575 	return ret;
576 }
577 
578 /* Do it by hand, with file-ops */
579 ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
580 		unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
581 {
582 	struct iovec *vector = iov;
583 	ssize_t ret = 0;
584 
585 	while (nr_segs > 0) {
586 		void __user *base;
587 		size_t len;
588 		ssize_t nr;
589 
590 		base = vector->iov_base;
591 		len = vector->iov_len;
592 		vector++;
593 		nr_segs--;
594 
595 		nr = fn(filp, base, len, ppos);
596 
597 		if (nr < 0) {
598 			if (!ret)
599 				ret = nr;
600 			break;
601 		}
602 		ret += nr;
603 		if (nr != len)
604 			break;
605 	}
606 
607 	return ret;
608 }
609 
/*
 * A write operation reads from user space and a read operation writes
 * to it, so the access_ok() direction is the opposite of the I/O type.
 */
#define vrfy_dir(type) ((type) != READ ? VERIFY_READ : VERIFY_WRITE)
612 
613 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
614 			      unsigned long nr_segs, unsigned long fast_segs,
615 			      struct iovec *fast_pointer,
616 			      struct iovec **ret_pointer)
617 {
618 	unsigned long seg;
619 	ssize_t ret;
620 	struct iovec *iov = fast_pointer;
621 
622 	/*
623 	 * SuS says "The readv() function *may* fail if the iovcnt argument
624 	 * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
625 	 * traditionally returned zero for zero segments, so...
626 	 */
627 	if (nr_segs == 0) {
628 		ret = 0;
629 		goto out;
630 	}
631 
632 	/*
633 	 * First get the "struct iovec" from user memory and
634 	 * verify all the pointers
635 	 */
636 	if (nr_segs > UIO_MAXIOV) {
637 		ret = -EINVAL;
638 		goto out;
639 	}
640 	if (nr_segs > fast_segs) {
641 		iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
642 		if (iov == NULL) {
643 			ret = -ENOMEM;
644 			goto out;
645 		}
646 	}
647 	if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
648 		ret = -EFAULT;
649 		goto out;
650 	}
651 
652 	/*
653 	 * According to the Single Unix Specification we should return EINVAL
654 	 * if an element length is < 0 when cast to ssize_t or if the
655 	 * total length would overflow the ssize_t return value of the
656 	 * system call.
657 	 *
658 	 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
659 	 * overflow case.
660 	 */
661 	ret = 0;
662 	for (seg = 0; seg < nr_segs; seg++) {
663 		void __user *buf = iov[seg].iov_base;
664 		ssize_t len = (ssize_t)iov[seg].iov_len;
665 
666 		/* see if we we're about to use an invalid len or if
667 		 * it's about to overflow ssize_t */
668 		if (len < 0) {
669 			ret = -EINVAL;
670 			goto out;
671 		}
672 		if (unlikely(!access_ok(vrfy_dir(type), buf, len))) {
673 			ret = -EFAULT;
674 			goto out;
675 		}
676 		if (len > MAX_RW_COUNT - ret) {
677 			len = MAX_RW_COUNT - ret;
678 			iov[seg].iov_len = len;
679 		}
680 		ret += len;
681 	}
682 out:
683 	*ret_pointer = iov;
684 	return ret;
685 }
686 
687 static ssize_t do_readv_writev(int type, struct file *file,
688 			       const struct iovec __user * uvector,
689 			       unsigned long nr_segs, loff_t *pos)
690 {
691 	size_t tot_len;
692 	struct iovec iovstack[UIO_FASTIOV];
693 	struct iovec *iov = iovstack;
694 	ssize_t ret;
695 	io_fn_t fn;
696 	iov_fn_t fnv;
697 
698 	if (!file->f_op) {
699 		ret = -EINVAL;
700 		goto out;
701 	}
702 
703 	ret = rw_copy_check_uvector(type, uvector, nr_segs,
704 			ARRAY_SIZE(iovstack), iovstack, &iov);
705 	if (ret <= 0)
706 		goto out;
707 
708 	tot_len = ret;
709 	ret = rw_verify_area(type, file, pos, tot_len);
710 	if (ret < 0)
711 		goto out;
712 
713 	fnv = NULL;
714 	if (type == READ) {
715 		fn = file->f_op->read;
716 		fnv = file->f_op->aio_read;
717 	} else {
718 		fn = (io_fn_t)file->f_op->write;
719 		fnv = file->f_op->aio_write;
720 	}
721 
722 	if (fnv)
723 		ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
724 						pos, fnv);
725 	else
726 		ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
727 
728 out:
729 	if (iov != iovstack)
730 		kfree(iov);
731 	if ((ret + (type == READ)) > 0) {
732 		if (type == READ)
733 			fsnotify_access(file);
734 		else
735 			fsnotify_modify(file);
736 	}
737 	return ret;
738 }
739 
740 ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
741 		  unsigned long vlen, loff_t *pos)
742 {
743 	if (!(file->f_mode & FMODE_READ))
744 		return -EBADF;
745 	if (!file->f_op || (!file->f_op->aio_read && !file->f_op->read))
746 		return -EINVAL;
747 
748 	return do_readv_writev(READ, file, vec, vlen, pos);
749 }
750 
751 EXPORT_SYMBOL(vfs_readv);
752 
753 ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
754 		   unsigned long vlen, loff_t *pos)
755 {
756 	if (!(file->f_mode & FMODE_WRITE))
757 		return -EBADF;
758 	if (!file->f_op || (!file->f_op->aio_write && !file->f_op->write))
759 		return -EINVAL;
760 
761 	return do_readv_writev(WRITE, file, vec, vlen, pos);
762 }
763 
764 EXPORT_SYMBOL(vfs_writev);
765 
766 SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
767 		unsigned long, vlen)
768 {
769 	struct file *file;
770 	ssize_t ret = -EBADF;
771 	int fput_needed;
772 
773 	file = fget_light(fd, &fput_needed);
774 	if (file) {
775 		loff_t pos = file_pos_read(file);
776 		ret = vfs_readv(file, vec, vlen, &pos);
777 		file_pos_write(file, pos);
778 		fput_light(file, fput_needed);
779 	}
780 
781 	if (ret > 0)
782 		add_rchar(current, ret);
783 	inc_syscr(current);
784 	return ret;
785 }
786 
787 SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
788 		unsigned long, vlen)
789 {
790 	struct file *file;
791 	ssize_t ret = -EBADF;
792 	int fput_needed;
793 
794 	file = fget_light(fd, &fput_needed);
795 	if (file) {
796 		loff_t pos = file_pos_read(file);
797 		ret = vfs_writev(file, vec, vlen, &pos);
798 		file_pos_write(file, pos);
799 		fput_light(file, fput_needed);
800 	}
801 
802 	if (ret > 0)
803 		add_wchar(current, ret);
804 	inc_syscw(current);
805 	return ret;
806 }
807 
808 static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
809 {
810 #define HALF_LONG_BITS (BITS_PER_LONG / 2)
811 	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
812 }
813 
814 SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
815 		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
816 {
817 	loff_t pos = pos_from_hilo(pos_h, pos_l);
818 	struct file *file;
819 	ssize_t ret = -EBADF;
820 	int fput_needed;
821 
822 	if (pos < 0)
823 		return -EINVAL;
824 
825 	file = fget_light(fd, &fput_needed);
826 	if (file) {
827 		ret = -ESPIPE;
828 		if (file->f_mode & FMODE_PREAD)
829 			ret = vfs_readv(file, vec, vlen, &pos);
830 		fput_light(file, fput_needed);
831 	}
832 
833 	if (ret > 0)
834 		add_rchar(current, ret);
835 	inc_syscr(current);
836 	return ret;
837 }
838 
839 SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
840 		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
841 {
842 	loff_t pos = pos_from_hilo(pos_h, pos_l);
843 	struct file *file;
844 	ssize_t ret = -EBADF;
845 	int fput_needed;
846 
847 	if (pos < 0)
848 		return -EINVAL;
849 
850 	file = fget_light(fd, &fput_needed);
851 	if (file) {
852 		ret = -ESPIPE;
853 		if (file->f_mode & FMODE_PWRITE)
854 			ret = vfs_writev(file, vec, vlen, &pos);
855 		fput_light(file, fput_needed);
856 	}
857 
858 	if (ret > 0)
859 		add_wchar(current, ret);
860 	inc_syscw(current);
861 	return ret;
862 }
863 
864 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
865 			   size_t count, loff_t max)
866 {
867 	struct file * in_file, * out_file;
868 	struct inode * in_inode, * out_inode;
869 	loff_t pos;
870 	ssize_t retval;
871 	int fput_needed_in, fput_needed_out, fl;
872 
873 	/*
874 	 * Get input file, and verify that it is ok..
875 	 */
876 	retval = -EBADF;
877 	in_file = fget_light(in_fd, &fput_needed_in);
878 	if (!in_file)
879 		goto out;
880 	if (!(in_file->f_mode & FMODE_READ))
881 		goto fput_in;
882 	retval = -ESPIPE;
883 	if (!ppos)
884 		ppos = &in_file->f_pos;
885 	else
886 		if (!(in_file->f_mode & FMODE_PREAD))
887 			goto fput_in;
888 	retval = rw_verify_area(READ, in_file, ppos, count);
889 	if (retval < 0)
890 		goto fput_in;
891 	count = retval;
892 
893 	/*
894 	 * Get output file, and verify that it is ok..
895 	 */
896 	retval = -EBADF;
897 	out_file = fget_light(out_fd, &fput_needed_out);
898 	if (!out_file)
899 		goto fput_in;
900 	if (!(out_file->f_mode & FMODE_WRITE))
901 		goto fput_out;
902 	retval = -EINVAL;
903 	in_inode = in_file->f_path.dentry->d_inode;
904 	out_inode = out_file->f_path.dentry->d_inode;
905 	retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
906 	if (retval < 0)
907 		goto fput_out;
908 	count = retval;
909 
910 	if (!max)
911 		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
912 
913 	pos = *ppos;
914 	if (unlikely(pos + count > max)) {
915 		retval = -EOVERFLOW;
916 		if (pos >= max)
917 			goto fput_out;
918 		count = max - pos;
919 	}
920 
921 	fl = 0;
922 #if 0
923 	/*
924 	 * We need to debate whether we can enable this or not. The
925 	 * man page documents EAGAIN return for the output at least,
926 	 * and the application is arguably buggy if it doesn't expect
927 	 * EAGAIN on a non-blocking file descriptor.
928 	 */
929 	if (in_file->f_flags & O_NONBLOCK)
930 		fl = SPLICE_F_NONBLOCK;
931 #endif
932 	retval = do_splice_direct(in_file, ppos, out_file, count, fl);
933 
934 	if (retval > 0) {
935 		add_rchar(current, retval);
936 		add_wchar(current, retval);
937 	}
938 
939 	inc_syscr(current);
940 	inc_syscw(current);
941 	if (*ppos > max)
942 		retval = -EOVERFLOW;
943 
944 fput_out:
945 	fput_light(out_file, fput_needed_out);
946 fput_in:
947 	fput_light(in_file, fput_needed_in);
948 out:
949 	return retval;
950 }
951 
952 SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
953 {
954 	loff_t pos;
955 	off_t off;
956 	ssize_t ret;
957 
958 	if (offset) {
959 		if (unlikely(get_user(off, offset)))
960 			return -EFAULT;
961 		pos = off;
962 		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
963 		if (unlikely(put_user(pos, offset)))
964 			return -EFAULT;
965 		return ret;
966 	}
967 
968 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
969 }
970 
971 SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
972 {
973 	loff_t pos;
974 	ssize_t ret;
975 
976 	if (offset) {
977 		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
978 			return -EFAULT;
979 		ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
980 		if (unlikely(put_user(pos, offset)))
981 			return -EFAULT;
982 		return ret;
983 	}
984 
985 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
986 }
987