xref: /openbmc/linux/fs/read_write.c (revision c900529f3d9161bfde5cca0754f83b4d3c3e0220)
1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   *  linux/fs/read_write.c
4   *
5   *  Copyright (C) 1991, 1992  Linus Torvalds
6   */
7  
8  #include <linux/slab.h>
9  #include <linux/stat.h>
10  #include <linux/sched/xacct.h>
11  #include <linux/fcntl.h>
12  #include <linux/file.h>
13  #include <linux/uio.h>
14  #include <linux/fsnotify.h>
15  #include <linux/security.h>
16  #include <linux/export.h>
17  #include <linux/syscalls.h>
18  #include <linux/pagemap.h>
19  #include <linux/splice.h>
20  #include <linux/compat.h>
21  #include <linux/mount.h>
22  #include <linux/fs.h>
23  #include "internal.h"
24  
25  #include <linux/uaccess.h>
26  #include <asm/unistd.h>
27  
28  const struct file_operations generic_ro_fops = {
29  	.llseek		= generic_file_llseek,
30  	.read_iter	= generic_file_read_iter,
31  	.mmap		= generic_file_readonly_mmap,
32  	.splice_read	= filemap_splice_read,
33  };
34  
35  EXPORT_SYMBOL(generic_ro_fops);
36  
unsigned_offsets(struct file * file)37  static inline bool unsigned_offsets(struct file *file)
38  {
39  	return file->f_mode & FMODE_UNSIGNED_OFFSET;
40  }
41  
42  /**
43   * vfs_setpos - update the file offset for lseek
44   * @file:	file structure in question
45   * @offset:	file offset to seek to
46   * @maxsize:	maximum file size
47   *
48   * This is a low-level filesystem helper for updating the file offset to
49   * the value specified by @offset if the given offset is valid and it is
50   * not equal to the current file offset.
51   *
52   * Return the specified offset on success and -EINVAL on invalid offset.
53   */
vfs_setpos(struct file * file,loff_t offset,loff_t maxsize)54  loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
55  {
56  	if (offset < 0 && !unsigned_offsets(file))
57  		return -EINVAL;
58  	if (offset > maxsize)
59  		return -EINVAL;
60  
61  	if (offset != file->f_pos) {
62  		file->f_pos = offset;
63  		file->f_version = 0;
64  	}
65  	return offset;
66  }
67  EXPORT_SYMBOL(vfs_setpos);
68  
69  /**
70   * generic_file_llseek_size - generic llseek implementation for regular files
71   * @file:	file structure to seek on
72   * @offset:	file offset to seek to
73   * @whence:	type of seek
74   * @maxsize:	max size of this file in file system
75   * @eof:	offset used for SEEK_END position
76   *
77   * This is a variant of generic_file_llseek that allows passing in a custom
78   * maximum file size and a custom EOF position, for e.g. hashed directories
79   *
80   * Synchronization:
81   * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
82   * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
83   * read/writes behave like SEEK_SET against seeks.
84   */
85  loff_t
generic_file_llseek_size(struct file * file,loff_t offset,int whence,loff_t maxsize,loff_t eof)86  generic_file_llseek_size(struct file *file, loff_t offset, int whence,
87  		loff_t maxsize, loff_t eof)
88  {
89  	switch (whence) {
90  	case SEEK_END:
91  		offset += eof;
92  		break;
93  	case SEEK_CUR:
94  		/*
95  		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
96  		 * position-querying operation.  Avoid rewriting the "same"
97  		 * f_pos value back to the file because a concurrent read(),
98  		 * write() or lseek() might have altered it
99  		 */
100  		if (offset == 0)
101  			return file->f_pos;
102  		/*
103  		 * f_lock protects against read/modify/write race with other
104  		 * SEEK_CURs. Note that parallel writes and reads behave
105  		 * like SEEK_SET.
106  		 */
107  		spin_lock(&file->f_lock);
108  		offset = vfs_setpos(file, file->f_pos + offset, maxsize);
109  		spin_unlock(&file->f_lock);
110  		return offset;
111  	case SEEK_DATA:
112  		/*
113  		 * In the generic case the entire file is data, so as long as
114  		 * offset isn't at the end of the file then the offset is data.
115  		 */
116  		if ((unsigned long long)offset >= eof)
117  			return -ENXIO;
118  		break;
119  	case SEEK_HOLE:
120  		/*
121  		 * There is a virtual hole at the end of the file, so as long as
122  		 * offset isn't i_size or larger, return i_size.
123  		 */
124  		if ((unsigned long long)offset >= eof)
125  			return -ENXIO;
126  		offset = eof;
127  		break;
128  	}
129  
130  	return vfs_setpos(file, offset, maxsize);
131  }
132  EXPORT_SYMBOL(generic_file_llseek_size);
133  
134  /**
135   * generic_file_llseek - generic llseek implementation for regular files
136   * @file:	file structure to seek on
137   * @offset:	file offset to seek to
138   * @whence:	type of seek
139   *
140   * This is a generic implemenation of ->llseek useable for all normal local
141   * filesystems.  It just updates the file offset to the value specified by
142   * @offset and @whence.
143   */
generic_file_llseek(struct file * file,loff_t offset,int whence)144  loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
145  {
146  	struct inode *inode = file->f_mapping->host;
147  
148  	return generic_file_llseek_size(file, offset, whence,
149  					inode->i_sb->s_maxbytes,
150  					i_size_read(inode));
151  }
152  EXPORT_SYMBOL(generic_file_llseek);
153  
154  /**
155   * fixed_size_llseek - llseek implementation for fixed-sized devices
156   * @file:	file structure to seek on
157   * @offset:	file offset to seek to
158   * @whence:	type of seek
159   * @size:	size of the file
160   *
161   */
fixed_size_llseek(struct file * file,loff_t offset,int whence,loff_t size)162  loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
163  {
164  	switch (whence) {
165  	case SEEK_SET: case SEEK_CUR: case SEEK_END:
166  		return generic_file_llseek_size(file, offset, whence,
167  						size, size);
168  	default:
169  		return -EINVAL;
170  	}
171  }
172  EXPORT_SYMBOL(fixed_size_llseek);
173  
174  /**
175   * no_seek_end_llseek - llseek implementation for fixed-sized devices
176   * @file:	file structure to seek on
177   * @offset:	file offset to seek to
178   * @whence:	type of seek
179   *
180   */
no_seek_end_llseek(struct file * file,loff_t offset,int whence)181  loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
182  {
183  	switch (whence) {
184  	case SEEK_SET: case SEEK_CUR:
185  		return generic_file_llseek_size(file, offset, whence,
186  						OFFSET_MAX, 0);
187  	default:
188  		return -EINVAL;
189  	}
190  }
191  EXPORT_SYMBOL(no_seek_end_llseek);
192  
193  /**
194   * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
195   * @file:	file structure to seek on
196   * @offset:	file offset to seek to
197   * @whence:	type of seek
198   * @size:	maximal offset allowed
199   *
200   */
no_seek_end_llseek_size(struct file * file,loff_t offset,int whence,loff_t size)201  loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
202  {
203  	switch (whence) {
204  	case SEEK_SET: case SEEK_CUR:
205  		return generic_file_llseek_size(file, offset, whence,
206  						size, 0);
207  	default:
208  		return -EINVAL;
209  	}
210  }
211  EXPORT_SYMBOL(no_seek_end_llseek_size);
212  
213  /**
214   * noop_llseek - No Operation Performed llseek implementation
215   * @file:	file structure to seek on
216   * @offset:	file offset to seek to
217   * @whence:	type of seek
218   *
219   * This is an implementation of ->llseek useable for the rare special case when
220   * userspace expects the seek to succeed but the (device) file is actually not
221   * able to perform the seek. In this case you use noop_llseek() instead of
222   * falling back to the default implementation of ->llseek.
223   */
noop_llseek(struct file * file,loff_t offset,int whence)224  loff_t noop_llseek(struct file *file, loff_t offset, int whence)
225  {
226  	return file->f_pos;
227  }
228  EXPORT_SYMBOL(noop_llseek);
229  
/*
 * default_llseek - llseek implementation that runs under the inode lock.
 * @file:	file structure to seek on
 * @offset:	file offset to seek to
 * @whence:	type of seek
 *
 * The inode size is always obtained through i_size_read(), matching the
 * SEEK_END case and generic_file_llseek(), instead of reading
 * inode->i_size directly.
 *
 * Returns the new offset; -ENXIO for SEEK_DATA/SEEK_HOLE at or beyond the
 * inode size; -EINVAL when the resulting offset is negative and the file
 * does not have FMODE_UNSIGNED_OFFSET set.
 */
loff_t default_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file_inode(file);
	loff_t size;
	loff_t retval;

	inode_lock(inode);
	switch (whence) {
	case SEEK_END:
		offset += i_size_read(inode);
		break;
	case SEEK_CUR:
		/* Pure position query: report f_pos without rewriting it. */
		if (offset == 0) {
			retval = file->f_pos;
			goto out;
		}
		offset += file->f_pos;
		break;
	case SEEK_DATA:
		/*
		 * In the generic case the entire file is data, so as
		 * long as offset isn't at the end of the file then the
		 * offset is data.
		 */
		if (offset >= i_size_read(inode)) {
			retval = -ENXIO;
			goto out;
		}
		break;
	case SEEK_HOLE:
		/*
		 * There is a virtual hole at the end of the file, so
		 * as long as offset isn't i_size or larger, return
		 * i_size.  Snapshot the size once so the comparison and
		 * the assignment use the same value.
		 */
		size = i_size_read(inode);
		if (offset >= size) {
			retval = -ENXIO;
			goto out;
		}
		offset = size;
		break;
	}
	retval = -EINVAL;
	if (offset >= 0 || unsigned_offsets(file)) {
		if (offset != file->f_pos) {
			file->f_pos = offset;
			file->f_version = 0;
		}
		retval = offset;
	}
out:
	inode_unlock(inode);
	return retval;
}
EXPORT_SYMBOL(default_llseek);
284  
/*
 * Dispatch a seek to the file's ->llseek method.  Files without
 * FMODE_LSEEK in f_mode get -ESPIPE.
 */
loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
{
	if (file->f_mode & FMODE_LSEEK)
		return file->f_op->llseek(file, offset, whence);

	return -ESPIPE;
}
EXPORT_SYMBOL(vfs_llseek);
292  
/*
 * Common implementation of the lseek entry points.
 *
 * Returns the new offset, -EBADF for a bad descriptor, -EINVAL for a
 * @whence above SEEK_MAX, or -EOVERFLOW when the resulting loff_t does not
 * fit into off_t.
 */
static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
{
	struct fd f = fdget_pos(fd);
	off_t retval = -EINVAL;
	loff_t res;

	if (!f.file)
		return -EBADF;

	if (whence > SEEK_MAX)
		goto out_put;

	res = vfs_llseek(f.file, offset, whence);
	retval = res;
	/* LFS: truncation should only happen on 32 bit platforms */
	if ((loff_t)retval != res)
		retval = -EOVERFLOW;
out_put:
	fdput_pos(f);
	return retval;
}
310  
/* lseek system call entry point; all work is done by ksys_lseek(). */
SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
	off_t pos = ksys_lseek(fd, offset, whence);

	return pos;
}
315  
#ifdef CONFIG_COMPAT
/* Compat lseek entry point; shares ksys_lseek() with the native one. */
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
	off_t pos = ksys_lseek(fd, offset, whence);

	return pos;
}
#endif
322  
323  #if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \
324  	defined(__ARCH_WANT_SYS_LLSEEK)
SYSCALL_DEFINE5(llseek,unsigned int,fd,unsigned long,offset_high,unsigned long,offset_low,loff_t __user *,result,unsigned int,whence)325  SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
326  		unsigned long, offset_low, loff_t __user *, result,
327  		unsigned int, whence)
328  {
329  	int retval;
330  	struct fd f = fdget_pos(fd);
331  	loff_t offset;
332  
333  	if (!f.file)
334  		return -EBADF;
335  
336  	retval = -EINVAL;
337  	if (whence > SEEK_MAX)
338  		goto out_putf;
339  
340  	offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
341  			whence);
342  
343  	retval = (int)offset;
344  	if (offset >= 0) {
345  		retval = -EFAULT;
346  		if (!copy_to_user(result, &offset, sizeof(offset)))
347  			retval = 0;
348  	}
349  out_putf:
350  	fdput_pos(f);
351  	return retval;
352  }
353  #endif
354  
/*
 * Sanity-check a read or write request before it is issued.
 *
 * @read_write:	READ or anything else (treated as a write)
 * @file:	file the request targets
 * @ppos:	position of the request, or NULL to skip the position checks
 * @count:	number of bytes requested
 *
 * Returns -EINVAL when @count is negative as an ssize_t, or when the
 * position (or position + count) is negative and the file does not allow
 * unsigned offsets; -EOVERFLOW when a negative position plus @count would
 * wrap; otherwise the result of security_file_permission().
 */
int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
	int mask;

	if (unlikely((ssize_t) count < 0))
		return -EINVAL;

	if (ppos) {
		const loff_t start = *ppos;

		if (unlikely(start < 0)) {
			if (!unsigned_offsets(file))
				return -EINVAL;
			/* both values are in 0..LLONG_MAX */
			if (count >= -start)
				return -EOVERFLOW;
		} else if (unlikely((loff_t) (start + count) < 0) &&
			   !unsigned_offsets(file)) {
			return -EINVAL;
		}
	}

	mask = MAY_WRITE;
	if (read_write == READ)
		mask = MAY_READ;

	return security_file_permission(file, mask);
}
EXPORT_SYMBOL(rw_verify_area);
378  
new_sync_read(struct file * filp,char __user * buf,size_t len,loff_t * ppos)379  static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
380  {
381  	struct kiocb kiocb;
382  	struct iov_iter iter;
383  	ssize_t ret;
384  
385  	init_sync_kiocb(&kiocb, filp);
386  	kiocb.ki_pos = (ppos ? *ppos : 0);
387  	iov_iter_ubuf(&iter, ITER_DEST, buf, len);
388  
389  	ret = call_read_iter(filp, &kiocb, &iter);
390  	BUG_ON(ret == -EIOCBQUEUED);
391  	if (ppos)
392  		*ppos = kiocb.ki_pos;
393  	return ret;
394  }
395  
warn_unsupported(struct file * file,const char * op)396  static int warn_unsupported(struct file *file, const char *op)
397  {
398  	pr_warn_ratelimited(
399  		"kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
400  		op, file, current->pid, current->comm);
401  	return -EINVAL;
402  }
403  
__kernel_read(struct file * file,void * buf,size_t count,loff_t * pos)404  ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
405  {
406  	struct kvec iov = {
407  		.iov_base	= buf,
408  		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
409  	};
410  	struct kiocb kiocb;
411  	struct iov_iter iter;
412  	ssize_t ret;
413  
414  	if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ)))
415  		return -EINVAL;
416  	if (!(file->f_mode & FMODE_CAN_READ))
417  		return -EINVAL;
418  	/*
419  	 * Also fail if ->read_iter and ->read are both wired up as that
420  	 * implies very convoluted semantics.
421  	 */
422  	if (unlikely(!file->f_op->read_iter || file->f_op->read))
423  		return warn_unsupported(file, "read");
424  
425  	init_sync_kiocb(&kiocb, file);
426  	kiocb.ki_pos = pos ? *pos : 0;
427  	iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len);
428  	ret = file->f_op->read_iter(&kiocb, &iter);
429  	if (ret > 0) {
430  		if (pos)
431  			*pos = kiocb.ki_pos;
432  		fsnotify_access(file);
433  		add_rchar(current, ret);
434  	}
435  	inc_syscr(current);
436  	return ret;
437  }
438  
kernel_read(struct file * file,void * buf,size_t count,loff_t * pos)439  ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos)
440  {
441  	ssize_t ret;
442  
443  	ret = rw_verify_area(READ, file, pos, count);
444  	if (ret)
445  		return ret;
446  	return __kernel_read(file, buf, count, pos);
447  }
448  EXPORT_SYMBOL(kernel_read);
449  
vfs_read(struct file * file,char __user * buf,size_t count,loff_t * pos)450  ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
451  {
452  	ssize_t ret;
453  
454  	if (!(file->f_mode & FMODE_READ))
455  		return -EBADF;
456  	if (!(file->f_mode & FMODE_CAN_READ))
457  		return -EINVAL;
458  	if (unlikely(!access_ok(buf, count)))
459  		return -EFAULT;
460  
461  	ret = rw_verify_area(READ, file, pos, count);
462  	if (ret)
463  		return ret;
464  	if (count > MAX_RW_COUNT)
465  		count =  MAX_RW_COUNT;
466  
467  	if (file->f_op->read)
468  		ret = file->f_op->read(file, buf, count, pos);
469  	else if (file->f_op->read_iter)
470  		ret = new_sync_read(file, buf, count, pos);
471  	else
472  		ret = -EINVAL;
473  	if (ret > 0) {
474  		fsnotify_access(file);
475  		add_rchar(current, ret);
476  	}
477  	inc_syscr(current);
478  	return ret;
479  }
480  
new_sync_write(struct file * filp,const char __user * buf,size_t len,loff_t * ppos)481  static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
482  {
483  	struct kiocb kiocb;
484  	struct iov_iter iter;
485  	ssize_t ret;
486  
487  	init_sync_kiocb(&kiocb, filp);
488  	kiocb.ki_pos = (ppos ? *ppos : 0);
489  	iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len);
490  
491  	ret = call_write_iter(filp, &kiocb, &iter);
492  	BUG_ON(ret == -EIOCBQUEUED);
493  	if (ret > 0 && ppos)
494  		*ppos = kiocb.ki_pos;
495  	return ret;
496  }
497  
498  /* caller is responsible for file_start_write/file_end_write */
__kernel_write_iter(struct file * file,struct iov_iter * from,loff_t * pos)499  ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos)
500  {
501  	struct kiocb kiocb;
502  	ssize_t ret;
503  
504  	if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE)))
505  		return -EBADF;
506  	if (!(file->f_mode & FMODE_CAN_WRITE))
507  		return -EINVAL;
508  	/*
509  	 * Also fail if ->write_iter and ->write are both wired up as that
510  	 * implies very convoluted semantics.
511  	 */
512  	if (unlikely(!file->f_op->write_iter || file->f_op->write))
513  		return warn_unsupported(file, "write");
514  
515  	init_sync_kiocb(&kiocb, file);
516  	kiocb.ki_pos = pos ? *pos : 0;
517  	ret = file->f_op->write_iter(&kiocb, from);
518  	if (ret > 0) {
519  		if (pos)
520  			*pos = kiocb.ki_pos;
521  		fsnotify_modify(file);
522  		add_wchar(current, ret);
523  	}
524  	inc_syscw(current);
525  	return ret;
526  }
527  
528  /* caller is responsible for file_start_write/file_end_write */
__kernel_write(struct file * file,const void * buf,size_t count,loff_t * pos)529  ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
530  {
531  	struct kvec iov = {
532  		.iov_base	= (void *)buf,
533  		.iov_len	= min_t(size_t, count, MAX_RW_COUNT),
534  	};
535  	struct iov_iter iter;
536  	iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len);
537  	return __kernel_write_iter(file, &iter, pos);
538  }
539  /*
540   * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()",
541   * but autofs is one of the few internal kernel users that actually
542   * wants this _and_ can be built as a module. So we need to export
543   * this symbol for autofs, even though it really isn't appropriate
544   * for any other kernel modules.
545   */
546  EXPORT_SYMBOL_GPL(__kernel_write);
547  
kernel_write(struct file * file,const void * buf,size_t count,loff_t * pos)548  ssize_t kernel_write(struct file *file, const void *buf, size_t count,
549  			    loff_t *pos)
550  {
551  	ssize_t ret;
552  
553  	ret = rw_verify_area(WRITE, file, pos, count);
554  	if (ret)
555  		return ret;
556  
557  	file_start_write(file);
558  	ret =  __kernel_write(file, buf, count, pos);
559  	file_end_write(file);
560  	return ret;
561  }
562  EXPORT_SYMBOL(kernel_write);
563  
vfs_write(struct file * file,const char __user * buf,size_t count,loff_t * pos)564  ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
565  {
566  	ssize_t ret;
567  
568  	if (!(file->f_mode & FMODE_WRITE))
569  		return -EBADF;
570  	if (!(file->f_mode & FMODE_CAN_WRITE))
571  		return -EINVAL;
572  	if (unlikely(!access_ok(buf, count)))
573  		return -EFAULT;
574  
575  	ret = rw_verify_area(WRITE, file, pos, count);
576  	if (ret)
577  		return ret;
578  	if (count > MAX_RW_COUNT)
579  		count =  MAX_RW_COUNT;
580  	file_start_write(file);
581  	if (file->f_op->write)
582  		ret = file->f_op->write(file, buf, count, pos);
583  	else if (file->f_op->write_iter)
584  		ret = new_sync_write(file, buf, count, pos);
585  	else
586  		ret = -EINVAL;
587  	if (ret > 0) {
588  		fsnotify_modify(file);
589  		add_wchar(current, ret);
590  	}
591  	inc_syscw(current);
592  	file_end_write(file);
593  	return ret;
594  }
595  
596  /* file_ppos returns &file->f_pos or NULL if file is stream */
file_ppos(struct file * file)597  static inline loff_t *file_ppos(struct file *file)
598  {
599  	return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos;
600  }
601  
ksys_read(unsigned int fd,char __user * buf,size_t count)602  ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
603  {
604  	struct fd f = fdget_pos(fd);
605  	ssize_t ret = -EBADF;
606  
607  	if (f.file) {
608  		loff_t pos, *ppos = file_ppos(f.file);
609  		if (ppos) {
610  			pos = *ppos;
611  			ppos = &pos;
612  		}
613  		ret = vfs_read(f.file, buf, count, ppos);
614  		if (ret >= 0 && ppos)
615  			f.file->f_pos = pos;
616  		fdput_pos(f);
617  	}
618  	return ret;
619  }
620  
/* read system call entry point; all work is done by ksys_read(). */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	ssize_t nread = ksys_read(fd, buf, count);

	return nread;
}
625  
ksys_write(unsigned int fd,const char __user * buf,size_t count)626  ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
627  {
628  	struct fd f = fdget_pos(fd);
629  	ssize_t ret = -EBADF;
630  
631  	if (f.file) {
632  		loff_t pos, *ppos = file_ppos(f.file);
633  		if (ppos) {
634  			pos = *ppos;
635  			ppos = &pos;
636  		}
637  		ret = vfs_write(f.file, buf, count, ppos);
638  		if (ret >= 0 && ppos)
639  			f.file->f_pos = pos;
640  		fdput_pos(f);
641  	}
642  
643  	return ret;
644  }
645  
/* write system call entry point; all work is done by ksys_write(). */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	ssize_t written = ksys_write(fd, buf, count);

	return written;
}
651  
ksys_pread64(unsigned int fd,char __user * buf,size_t count,loff_t pos)652  ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
653  		     loff_t pos)
654  {
655  	struct fd f;
656  	ssize_t ret = -EBADF;
657  
658  	if (pos < 0)
659  		return -EINVAL;
660  
661  	f = fdget(fd);
662  	if (f.file) {
663  		ret = -ESPIPE;
664  		if (f.file->f_mode & FMODE_PREAD)
665  			ret = vfs_read(f.file, buf, count, &pos);
666  		fdput(f);
667  	}
668  
669  	return ret;
670  }
671  
/* pread64 system call entry point; all work is done by ksys_pread64(). */
SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
			size_t, count, loff_t, pos)
{
	ssize_t nread = ksys_pread64(fd, buf, count, pos);

	return nread;
}
677  
#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64)
/*
 * Compat pread64 entry point: the 64-bit position arrives as a pair of
 * arguments and is glued back together before calling ksys_pread64().
 */
COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf,
		       size_t, count, compat_arg_u64_dual(pos))
{
	loff_t offset = compat_arg_u64_glue(pos);

	return ksys_pread64(fd, buf, count, offset);
}
#endif
685  
ksys_pwrite64(unsigned int fd,const char __user * buf,size_t count,loff_t pos)686  ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
687  		      size_t count, loff_t pos)
688  {
689  	struct fd f;
690  	ssize_t ret = -EBADF;
691  
692  	if (pos < 0)
693  		return -EINVAL;
694  
695  	f = fdget(fd);
696  	if (f.file) {
697  		ret = -ESPIPE;
698  		if (f.file->f_mode & FMODE_PWRITE)
699  			ret = vfs_write(f.file, buf, count, &pos);
700  		fdput(f);
701  	}
702  
703  	return ret;
704  }
705  
/* pwrite64 system call entry point; all work is done by ksys_pwrite64(). */
SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
			 size_t, count, loff_t, pos)
{
	ssize_t written = ksys_pwrite64(fd, buf, count, pos);

	return written;
}
711  
#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64)
/*
 * Compat pwrite64 entry point: the 64-bit position arrives as a pair of
 * arguments and is glued back together before calling ksys_pwrite64().
 */
COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf,
		       size_t, count, compat_arg_u64_dual(pos))
{
	loff_t offset = compat_arg_u64_glue(pos);

	return ksys_pwrite64(fd, buf, count, offset);
}
#endif
719  
do_iter_readv_writev(struct file * filp,struct iov_iter * iter,loff_t * ppos,int type,rwf_t flags)720  static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
721  		loff_t *ppos, int type, rwf_t flags)
722  {
723  	struct kiocb kiocb;
724  	ssize_t ret;
725  
726  	init_sync_kiocb(&kiocb, filp);
727  	ret = kiocb_set_rw_flags(&kiocb, flags);
728  	if (ret)
729  		return ret;
730  	kiocb.ki_pos = (ppos ? *ppos : 0);
731  
732  	if (type == READ)
733  		ret = call_read_iter(filp, &kiocb, iter);
734  	else
735  		ret = call_write_iter(filp, &kiocb, iter);
736  	BUG_ON(ret == -EIOCBQUEUED);
737  	if (ppos)
738  		*ppos = kiocb.ki_pos;
739  	return ret;
740  }
741  
742  /* Do it by hand, with file-ops */
do_loop_readv_writev(struct file * filp,struct iov_iter * iter,loff_t * ppos,int type,rwf_t flags)743  static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
744  		loff_t *ppos, int type, rwf_t flags)
745  {
746  	ssize_t ret = 0;
747  
748  	if (flags & ~RWF_HIPRI)
749  		return -EOPNOTSUPP;
750  
751  	while (iov_iter_count(iter)) {
752  		ssize_t nr;
753  
754  		if (type == READ) {
755  			nr = filp->f_op->read(filp, iter_iov_addr(iter),
756  						iter_iov_len(iter), ppos);
757  		} else {
758  			nr = filp->f_op->write(filp, iter_iov_addr(iter),
759  						iter_iov_len(iter), ppos);
760  		}
761  
762  		if (nr < 0) {
763  			if (!ret)
764  				ret = nr;
765  			break;
766  		}
767  		ret += nr;
768  		if (nr != iter_iov_len(iter))
769  			break;
770  		iov_iter_advance(iter, nr);
771  	}
772  
773  	return ret;
774  }
775  
do_iter_read(struct file * file,struct iov_iter * iter,loff_t * pos,rwf_t flags)776  static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
777  		loff_t *pos, rwf_t flags)
778  {
779  	size_t tot_len;
780  	ssize_t ret = 0;
781  
782  	if (!(file->f_mode & FMODE_READ))
783  		return -EBADF;
784  	if (!(file->f_mode & FMODE_CAN_READ))
785  		return -EINVAL;
786  
787  	tot_len = iov_iter_count(iter);
788  	if (!tot_len)
789  		goto out;
790  	ret = rw_verify_area(READ, file, pos, tot_len);
791  	if (ret < 0)
792  		return ret;
793  
794  	if (file->f_op->read_iter)
795  		ret = do_iter_readv_writev(file, iter, pos, READ, flags);
796  	else
797  		ret = do_loop_readv_writev(file, iter, pos, READ, flags);
798  out:
799  	if (ret >= 0)
800  		fsnotify_access(file);
801  	return ret;
802  }
803  
vfs_iocb_iter_read(struct file * file,struct kiocb * iocb,struct iov_iter * iter)804  ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
805  			   struct iov_iter *iter)
806  {
807  	size_t tot_len;
808  	ssize_t ret = 0;
809  
810  	if (!file->f_op->read_iter)
811  		return -EINVAL;
812  	if (!(file->f_mode & FMODE_READ))
813  		return -EBADF;
814  	if (!(file->f_mode & FMODE_CAN_READ))
815  		return -EINVAL;
816  
817  	tot_len = iov_iter_count(iter);
818  	if (!tot_len)
819  		goto out;
820  	ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len);
821  	if (ret < 0)
822  		return ret;
823  
824  	ret = call_read_iter(file, iocb, iter);
825  out:
826  	if (ret >= 0)
827  		fsnotify_access(file);
828  	return ret;
829  }
830  EXPORT_SYMBOL(vfs_iocb_iter_read);
831  
vfs_iter_read(struct file * file,struct iov_iter * iter,loff_t * ppos,rwf_t flags)832  ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
833  		rwf_t flags)
834  {
835  	if (!file->f_op->read_iter)
836  		return -EINVAL;
837  	return do_iter_read(file, iter, ppos, flags);
838  }
839  EXPORT_SYMBOL(vfs_iter_read);
840  
do_iter_write(struct file * file,struct iov_iter * iter,loff_t * pos,rwf_t flags)841  static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
842  		loff_t *pos, rwf_t flags)
843  {
844  	size_t tot_len;
845  	ssize_t ret = 0;
846  
847  	if (!(file->f_mode & FMODE_WRITE))
848  		return -EBADF;
849  	if (!(file->f_mode & FMODE_CAN_WRITE))
850  		return -EINVAL;
851  
852  	tot_len = iov_iter_count(iter);
853  	if (!tot_len)
854  		return 0;
855  	ret = rw_verify_area(WRITE, file, pos, tot_len);
856  	if (ret < 0)
857  		return ret;
858  
859  	if (file->f_op->write_iter)
860  		ret = do_iter_readv_writev(file, iter, pos, WRITE, flags);
861  	else
862  		ret = do_loop_readv_writev(file, iter, pos, WRITE, flags);
863  	if (ret > 0)
864  		fsnotify_modify(file);
865  	return ret;
866  }
867  
vfs_iocb_iter_write(struct file * file,struct kiocb * iocb,struct iov_iter * iter)868  ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
869  			    struct iov_iter *iter)
870  {
871  	size_t tot_len;
872  	ssize_t ret = 0;
873  
874  	if (!file->f_op->write_iter)
875  		return -EINVAL;
876  	if (!(file->f_mode & FMODE_WRITE))
877  		return -EBADF;
878  	if (!(file->f_mode & FMODE_CAN_WRITE))
879  		return -EINVAL;
880  
881  	tot_len = iov_iter_count(iter);
882  	if (!tot_len)
883  		return 0;
884  	ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len);
885  	if (ret < 0)
886  		return ret;
887  
888  	ret = call_write_iter(file, iocb, iter);
889  	if (ret > 0)
890  		fsnotify_modify(file);
891  
892  	return ret;
893  }
894  EXPORT_SYMBOL(vfs_iocb_iter_write);
895  
vfs_iter_write(struct file * file,struct iov_iter * iter,loff_t * ppos,rwf_t flags)896  ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
897  		rwf_t flags)
898  {
899  	if (!file->f_op->write_iter)
900  		return -EINVAL;
901  	return do_iter_write(file, iter, ppos, flags);
902  }
903  EXPORT_SYMBOL(vfs_iter_write);
904  
vfs_readv(struct file * file,const struct iovec __user * vec,unsigned long vlen,loff_t * pos,rwf_t flags)905  static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
906  		  unsigned long vlen, loff_t *pos, rwf_t flags)
907  {
908  	struct iovec iovstack[UIO_FASTIOV];
909  	struct iovec *iov = iovstack;
910  	struct iov_iter iter;
911  	ssize_t ret;
912  
913  	ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
914  	if (ret >= 0) {
915  		ret = do_iter_read(file, &iter, pos, flags);
916  		kfree(iov);
917  	}
918  
919  	return ret;
920  }
921  
vfs_writev(struct file * file,const struct iovec __user * vec,unsigned long vlen,loff_t * pos,rwf_t flags)922  static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
923  		   unsigned long vlen, loff_t *pos, rwf_t flags)
924  {
925  	struct iovec iovstack[UIO_FASTIOV];
926  	struct iovec *iov = iovstack;
927  	struct iov_iter iter;
928  	ssize_t ret;
929  
930  	ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
931  	if (ret >= 0) {
932  		file_start_write(file);
933  		ret = do_iter_write(file, &iter, pos, flags);
934  		file_end_write(file);
935  		kfree(iov);
936  	}
937  	return ret;
938  }
939  
do_readv(unsigned long fd,const struct iovec __user * vec,unsigned long vlen,rwf_t flags)940  static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
941  			unsigned long vlen, rwf_t flags)
942  {
943  	struct fd f = fdget_pos(fd);
944  	ssize_t ret = -EBADF;
945  
946  	if (f.file) {
947  		loff_t pos, *ppos = file_ppos(f.file);
948  		if (ppos) {
949  			pos = *ppos;
950  			ppos = &pos;
951  		}
952  		ret = vfs_readv(f.file, vec, vlen, ppos, flags);
953  		if (ret >= 0 && ppos)
954  			f.file->f_pos = pos;
955  		fdput_pos(f);
956  	}
957  
958  	if (ret > 0)
959  		add_rchar(current, ret);
960  	inc_syscr(current);
961  	return ret;
962  }
963  
do_writev(unsigned long fd,const struct iovec __user * vec,unsigned long vlen,rwf_t flags)964  static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
965  			 unsigned long vlen, rwf_t flags)
966  {
967  	struct fd f = fdget_pos(fd);
968  	ssize_t ret = -EBADF;
969  
970  	if (f.file) {
971  		loff_t pos, *ppos = file_ppos(f.file);
972  		if (ppos) {
973  			pos = *ppos;
974  			ppos = &pos;
975  		}
976  		ret = vfs_writev(f.file, vec, vlen, ppos, flags);
977  		if (ret >= 0 && ppos)
978  			f.file->f_pos = pos;
979  		fdput_pos(f);
980  	}
981  
982  	if (ret > 0)
983  		add_wchar(current, ret);
984  	inc_syscw(current);
985  	return ret;
986  }
987  
pos_from_hilo(unsigned long high,unsigned long low)988  static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
989  {
990  #define HALF_LONG_BITS (BITS_PER_LONG / 2)
991  	return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
992  }
993  
do_preadv(unsigned long fd,const struct iovec __user * vec,unsigned long vlen,loff_t pos,rwf_t flags)994  static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
995  			 unsigned long vlen, loff_t pos, rwf_t flags)
996  {
997  	struct fd f;
998  	ssize_t ret = -EBADF;
999  
1000  	if (pos < 0)
1001  		return -EINVAL;
1002  
1003  	f = fdget(fd);
1004  	if (f.file) {
1005  		ret = -ESPIPE;
1006  		if (f.file->f_mode & FMODE_PREAD)
1007  			ret = vfs_readv(f.file, vec, vlen, &pos, flags);
1008  		fdput(f);
1009  	}
1010  
1011  	if (ret > 0)
1012  		add_rchar(current, ret);
1013  	inc_syscr(current);
1014  	return ret;
1015  }
1016  
do_pwritev(unsigned long fd,const struct iovec __user * vec,unsigned long vlen,loff_t pos,rwf_t flags)1017  static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
1018  			  unsigned long vlen, loff_t pos, rwf_t flags)
1019  {
1020  	struct fd f;
1021  	ssize_t ret = -EBADF;
1022  
1023  	if (pos < 0)
1024  		return -EINVAL;
1025  
1026  	f = fdget(fd);
1027  	if (f.file) {
1028  		ret = -ESPIPE;
1029  		if (f.file->f_mode & FMODE_PWRITE)
1030  			ret = vfs_writev(f.file, vec, vlen, &pos, flags);
1031  		fdput(f);
1032  	}
1033  
1034  	if (ret > 0)
1035  		add_wchar(current, ret);
1036  	inc_syscw(current);
1037  	return ret;
1038  }
1039  
/* readv: vectored read at the current file position, with no RWF flags. */
SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen)
{
	const rwf_t no_flags = 0;

	return do_readv(fd, vec, vlen, no_flags);
}
1045  
/* writev: vectored write at the current file position, with no RWF flags. */
SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen)
{
	const rwf_t no_flags = 0;

	return do_writev(fd, vec, vlen, no_flags);
}
1051  
/*
 * preadv: vectored read at an explicit offset.  The offset arrives as two
 * unsigned longs (@pos_l, @pos_h) that pos_from_hilo() joins together.
 */
SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
	return do_preadv(fd, vec, vlen, pos_from_hilo(pos_h, pos_l), 0);
}
1059  
/*
 * preadv2: like preadv, but takes RWF flags.  A combined offset of -1
 * selects do_readv(), i.e. the variant that works on the file position.
 */
SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
		rwf_t, flags)
{
	const loff_t pos = pos_from_hilo(pos_h, pos_l);

	return pos == -1 ? do_readv(fd, vec, vlen, flags)
			 : do_preadv(fd, vec, vlen, pos, flags);
}
1071  
/*
 * pwritev: vectored write at an explicit offset.  The offset arrives as two
 * unsigned longs (@pos_l, @pos_h) that pos_from_hilo() joins together.
 */
SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
{
	return do_pwritev(fd, vec, vlen, pos_from_hilo(pos_h, pos_l), 0);
}
1079  
/*
 * pwritev2: like pwritev, but takes RWF flags.  A combined offset of -1
 * selects do_writev(), i.e. the variant that works on the file position.
 */
SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec,
		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h,
		rwf_t, flags)
{
	const loff_t pos = pos_from_hilo(pos_h, pos_l);

	return pos == -1 ? do_writev(fd, vec, vlen, flags)
			 : do_pwritev(fd, vec, vlen, pos, flags);
}
1091  
1092  /*
1093   * Various compat syscalls.  Note that they all pretend to take a native
1094   * iovec - import_iovec will properly treat those as compat_iovecs based on
1095   * in_compat_syscall().
1096   */
1097  #ifdef CONFIG_COMPAT
1098  #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
/* Compat preadv64: the offset is already a single loff_t; no RWF flags. */
COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen, loff_t, pos)
{
	const rwf_t no_flags = 0;

	return do_preadv(fd, vec, vlen, pos, no_flags);
}
1105  #endif
1106  
/* Compat preadv: the 64-bit offset is assembled from two 32-bit halves. */
COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
		const struct iovec __user *, vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
	loff_t pos = (loff_t)pos_high << 32;

	pos |= pos_low;
	return do_preadv(fd, vec, vlen, pos, 0);
}
1115  
1116  #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2
/*
 * Compat preadv64v2: single loff_t offset plus RWF flags.  An offset of -1
 * selects do_readv(), the variant that works on the file position.
 */
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	return pos == -1 ? do_readv(fd, vec, vlen, flags)
			 : do_preadv(fd, vec, vlen, pos, flags);
}
1125  #endif
1126  
/*
 * Compat preadv2: the offset is assembled from two 32-bit halves.  A
 * combined offset of -1 selects do_readv(), the file-position variant.
 */
COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd,
		const struct iovec __user *, vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high,
		rwf_t, flags)
{
	loff_t pos = (loff_t)pos_high << 32;

	pos |= pos_low;

	return pos == -1 ? do_readv(fd, vec, vlen, flags)
			 : do_preadv(fd, vec, vlen, pos, flags);
}
1138  
1139  #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
/* Compat pwritev64: the offset is already a single loff_t; no RWF flags. */
COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen, loff_t, pos)
{
	const rwf_t no_flags = 0;

	return do_pwritev(fd, vec, vlen, pos, no_flags);
}
1146  #endif
1147  
/* Compat pwritev: the 64-bit offset is assembled from two 32-bit halves. */
COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
		const struct iovec __user *,vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
{
	loff_t pos = (loff_t)pos_high << 32;

	pos |= pos_low;
	return do_pwritev(fd, vec, vlen, pos, 0);
}
1156  
1157  #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2
/*
 * Compat pwritev64v2: single loff_t offset plus RWF flags.  An offset of -1
 * selects do_writev(), the variant that works on the file position.
 */
COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd,
		const struct iovec __user *, vec,
		unsigned long, vlen, loff_t, pos, rwf_t, flags)
{
	return pos == -1 ? do_writev(fd, vec, vlen, flags)
			 : do_pwritev(fd, vec, vlen, pos, flags);
}
1166  #endif
1167  
/*
 * Compat pwritev2: the offset is assembled from two 32-bit halves.  A
 * combined offset of -1 selects do_writev(), the file-position variant.
 */
COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
		const struct iovec __user *,vec,
		compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags)
{
	loff_t pos = (loff_t)pos_high << 32;

	pos |= pos_low;

	return pos == -1 ? do_writev(fd, vec, vlen, flags)
			 : do_pwritev(fd, vec, vlen, pos, flags);
}
1178  #endif /* CONFIG_COMPAT */
1179  
do_sendfile(int out_fd,int in_fd,loff_t * ppos,size_t count,loff_t max)1180  static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
1181  		  	   size_t count, loff_t max)
1182  {
1183  	struct fd in, out;
1184  	struct inode *in_inode, *out_inode;
1185  	struct pipe_inode_info *opipe;
1186  	loff_t pos;
1187  	loff_t out_pos;
1188  	ssize_t retval;
1189  	int fl;
1190  
1191  	/*
1192  	 * Get input file, and verify that it is ok..
1193  	 */
1194  	retval = -EBADF;
1195  	in = fdget(in_fd);
1196  	if (!in.file)
1197  		goto out;
1198  	if (!(in.file->f_mode & FMODE_READ))
1199  		goto fput_in;
1200  	retval = -ESPIPE;
1201  	if (!ppos) {
1202  		pos = in.file->f_pos;
1203  	} else {
1204  		pos = *ppos;
1205  		if (!(in.file->f_mode & FMODE_PREAD))
1206  			goto fput_in;
1207  	}
1208  	retval = rw_verify_area(READ, in.file, &pos, count);
1209  	if (retval < 0)
1210  		goto fput_in;
1211  	if (count > MAX_RW_COUNT)
1212  		count =  MAX_RW_COUNT;
1213  
1214  	/*
1215  	 * Get output file, and verify that it is ok..
1216  	 */
1217  	retval = -EBADF;
1218  	out = fdget(out_fd);
1219  	if (!out.file)
1220  		goto fput_in;
1221  	if (!(out.file->f_mode & FMODE_WRITE))
1222  		goto fput_out;
1223  	in_inode = file_inode(in.file);
1224  	out_inode = file_inode(out.file);
1225  	out_pos = out.file->f_pos;
1226  
1227  	if (!max)
1228  		max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
1229  
1230  	if (unlikely(pos + count > max)) {
1231  		retval = -EOVERFLOW;
1232  		if (pos >= max)
1233  			goto fput_out;
1234  		count = max - pos;
1235  	}
1236  
1237  	fl = 0;
1238  #if 0
1239  	/*
1240  	 * We need to debate whether we can enable this or not. The
1241  	 * man page documents EAGAIN return for the output at least,
1242  	 * and the application is arguably buggy if it doesn't expect
1243  	 * EAGAIN on a non-blocking file descriptor.
1244  	 */
1245  	if (in.file->f_flags & O_NONBLOCK)
1246  		fl = SPLICE_F_NONBLOCK;
1247  #endif
1248  	opipe = get_pipe_info(out.file, true);
1249  	if (!opipe) {
1250  		retval = rw_verify_area(WRITE, out.file, &out_pos, count);
1251  		if (retval < 0)
1252  			goto fput_out;
1253  		file_start_write(out.file);
1254  		retval = do_splice_direct(in.file, &pos, out.file, &out_pos,
1255  					  count, fl);
1256  		file_end_write(out.file);
1257  	} else {
1258  		if (out.file->f_flags & O_NONBLOCK)
1259  			fl |= SPLICE_F_NONBLOCK;
1260  
1261  		retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl);
1262  	}
1263  
1264  	if (retval > 0) {
1265  		add_rchar(current, retval);
1266  		add_wchar(current, retval);
1267  		fsnotify_access(in.file);
1268  		fsnotify_modify(out.file);
1269  		out.file->f_pos = out_pos;
1270  		if (ppos)
1271  			*ppos = pos;
1272  		else
1273  			in.file->f_pos = pos;
1274  	}
1275  
1276  	inc_syscr(current);
1277  	inc_syscw(current);
1278  	if (pos > max)
1279  		retval = -EOVERFLOW;
1280  
1281  fput_out:
1282  	fdput(out);
1283  fput_in:
1284  	fdput(in);
1285  out:
1286  	return retval;
1287  }
1288  
/*
 * sendfile with an off_t offset.  When @offset is given, the transfer is
 * bounded by MAX_NON_LFS and the resulting offset is stored back to user
 * space whatever do_sendfile() returned; a failed store yields -EFAULT.
 * Without @offset the input file's own position is used.
 */
SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
{
	loff_t pos;
	off_t start;
	ssize_t copied;

	if (!offset)
		return do_sendfile(out_fd, in_fd, NULL, count, 0);

	if (unlikely(get_user(start, offset)))
		return -EFAULT;

	pos = start;
	copied = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);

	if (unlikely(put_user(pos, offset)))
		return -EFAULT;
	return copied;
}
1307  
/*
 * sendfile with a loff_t offset.  When @offset is given it is read from
 * user space, used as the starting offset, and stored back afterwards
 * whatever do_sendfile() returned; a failed store yields -EFAULT.
 * Without @offset the input file's own position is used.
 */
SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
{
	loff_t pos;
	ssize_t copied;

	if (!offset)
		return do_sendfile(out_fd, in_fd, NULL, count, 0);

	if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
		return -EFAULT;

	copied = do_sendfile(out_fd, in_fd, &pos, count, 0);

	if (unlikely(put_user(pos, offset)))
		return -EFAULT;
	return copied;
}
1324  
1325  #ifdef CONFIG_COMPAT
/*
 * Compat sendfile with a compat_off_t offset.  When @offset is given, the
 * transfer is bounded by MAX_NON_LFS and the resulting offset is stored
 * back to user space whatever do_sendfile() returned; a failed store
 * yields -EFAULT.  Without @offset the input file's own position is used.
 */
COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
		compat_off_t __user *, offset, compat_size_t, count)
{
	loff_t pos;
	off_t start;
	ssize_t copied;

	if (!offset)
		return do_sendfile(out_fd, in_fd, NULL, count, 0);

	if (unlikely(get_user(start, offset)))
		return -EFAULT;

	pos = start;
	copied = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);

	if (unlikely(put_user(pos, offset)))
		return -EFAULT;
	return copied;
}
1345  
/*
 * Compat sendfile with a compat_loff_t offset.  When @offset is given it
 * is read from user space, used as the starting offset, and stored back
 * afterwards whatever do_sendfile() returned; a failed store yields
 * -EFAULT.  Without @offset the input file's own position is used.
 */
COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
		compat_loff_t __user *, offset, compat_size_t, count)
{
	loff_t pos;
	ssize_t copied;

	if (!offset)
		return do_sendfile(out_fd, in_fd, NULL, count, 0);

	if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
		return -EFAULT;

	copied = do_sendfile(out_fd, in_fd, &pos, count, 0);

	if (unlikely(put_user(pos, offset)))
		return -EFAULT;
	return copied;
}
1363  #endif
1364  
1365  /**
1366   * generic_copy_file_range - copy data between two files
1367   * @file_in:	file structure to read from
1368   * @pos_in:	file offset to read from
1369   * @file_out:	file structure to write data to
1370   * @pos_out:	file offset to write data to
1371   * @len:	amount of data to copy
1372   * @flags:	copy flags
1373   *
1374   * This is a generic filesystem helper to copy data from one file to another.
1375   * It has no constraints on the source or destination file owners - the files
1376   * can belong to different superblocks and different filesystem types. Short
1377   * copies are allowed.
1378   *
1379   * This should be called from the @file_out filesystem, as per the
1380   * ->copy_file_range() method.
1381   *
1382   * Returns the number of bytes copied or a negative error indicating the
1383   * failure.
1384   */
1385  
generic_copy_file_range(struct file * file_in,loff_t pos_in,struct file * file_out,loff_t pos_out,size_t len,unsigned int flags)1386  ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
1387  				struct file *file_out, loff_t pos_out,
1388  				size_t len, unsigned int flags)
1389  {
1390  	lockdep_assert(sb_write_started(file_inode(file_out)->i_sb));
1391  
1392  	return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
1393  				len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
1394  }
1395  EXPORT_SYMBOL(generic_copy_file_range);
1396  
1397  /*
1398   * Performs necessary checks before doing a file copy
1399   *
1400   * Can adjust amount of bytes to copy via @req_count argument.
1401   * Returns appropriate error code that caller should return or
1402   * zero in case the copy should be allowed.
1403   */
generic_copy_file_checks(struct file * file_in,loff_t pos_in,struct file * file_out,loff_t pos_out,size_t * req_count,unsigned int flags)1404  static int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
1405  				    struct file *file_out, loff_t pos_out,
1406  				    size_t *req_count, unsigned int flags)
1407  {
1408  	struct inode *inode_in = file_inode(file_in);
1409  	struct inode *inode_out = file_inode(file_out);
1410  	uint64_t count = *req_count;
1411  	loff_t size_in;
1412  	int ret;
1413  
1414  	ret = generic_file_rw_checks(file_in, file_out);
1415  	if (ret)
1416  		return ret;
1417  
1418  	/*
1419  	 * We allow some filesystems to handle cross sb copy, but passing
1420  	 * a file of the wrong filesystem type to filesystem driver can result
1421  	 * in an attempt to dereference the wrong type of ->private_data, so
1422  	 * avoid doing that until we really have a good reason.
1423  	 *
1424  	 * nfs and cifs define several different file_system_type structures
1425  	 * and several different sets of file_operations, but they all end up
1426  	 * using the same ->copy_file_range() function pointer.
1427  	 */
1428  	if (flags & COPY_FILE_SPLICE) {
1429  		/* cross sb splice is allowed */
1430  	} else if (file_out->f_op->copy_file_range) {
1431  		if (file_in->f_op->copy_file_range !=
1432  		    file_out->f_op->copy_file_range)
1433  			return -EXDEV;
1434  	} else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) {
1435  		return -EXDEV;
1436  	}
1437  
1438  	/* Don't touch certain kinds of inodes */
1439  	if (IS_IMMUTABLE(inode_out))
1440  		return -EPERM;
1441  
1442  	if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
1443  		return -ETXTBSY;
1444  
1445  	/* Ensure offsets don't wrap. */
1446  	if (pos_in + count < pos_in || pos_out + count < pos_out)
1447  		return -EOVERFLOW;
1448  
1449  	/* Shorten the copy to EOF */
1450  	size_in = i_size_read(inode_in);
1451  	if (pos_in >= size_in)
1452  		count = 0;
1453  	else
1454  		count = min(count, size_in - (uint64_t)pos_in);
1455  
1456  	ret = generic_write_check_limits(file_out, pos_out, &count);
1457  	if (ret)
1458  		return ret;
1459  
1460  	/* Don't allow overlapped copying within the same file. */
1461  	if (inode_in == inode_out &&
1462  	    pos_out + count > pos_in &&
1463  	    pos_out < pos_in + count)
1464  		return -EINVAL;
1465  
1466  	*req_count = count;
1467  	return 0;
1468  }
1469  
1470  /*
1471   * copy_file_range() differs from regular file read and write in that it
1472   * specifically allows return partial success.  When it does so is up to
1473   * the copy_file_range method.
1474   */
vfs_copy_file_range(struct file * file_in,loff_t pos_in,struct file * file_out,loff_t pos_out,size_t len,unsigned int flags)1475  ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
1476  			    struct file *file_out, loff_t pos_out,
1477  			    size_t len, unsigned int flags)
1478  {
1479  	ssize_t ret;
1480  	bool splice = flags & COPY_FILE_SPLICE;
1481  
1482  	if (flags & ~COPY_FILE_SPLICE)
1483  		return -EINVAL;
1484  
1485  	ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len,
1486  				       flags);
1487  	if (unlikely(ret))
1488  		return ret;
1489  
1490  	ret = rw_verify_area(READ, file_in, &pos_in, len);
1491  	if (unlikely(ret))
1492  		return ret;
1493  
1494  	ret = rw_verify_area(WRITE, file_out, &pos_out, len);
1495  	if (unlikely(ret))
1496  		return ret;
1497  
1498  	if (len == 0)
1499  		return 0;
1500  
1501  	file_start_write(file_out);
1502  
1503  	/*
1504  	 * Cloning is supported by more file systems, so we implement copy on
1505  	 * same sb using clone, but for filesystems where both clone and copy
1506  	 * are supported (e.g. nfs,cifs), we only call the copy method.
1507  	 */
1508  	if (!splice && file_out->f_op->copy_file_range) {
1509  		ret = file_out->f_op->copy_file_range(file_in, pos_in,
1510  						      file_out, pos_out,
1511  						      len, flags);
1512  		goto done;
1513  	}
1514  
1515  	if (!splice && file_in->f_op->remap_file_range &&
1516  	    file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
1517  		ret = file_in->f_op->remap_file_range(file_in, pos_in,
1518  				file_out, pos_out,
1519  				min_t(loff_t, MAX_RW_COUNT, len),
1520  				REMAP_FILE_CAN_SHORTEN);
1521  		if (ret > 0)
1522  			goto done;
1523  	}
1524  
1525  	/*
1526  	 * We can get here for same sb copy of filesystems that do not implement
1527  	 * ->copy_file_range() in case filesystem does not support clone or in
1528  	 * case filesystem supports clone but rejected the clone request (e.g.
1529  	 * because it was not block aligned).
1530  	 *
1531  	 * In both cases, fall back to kernel copy so we are able to maintain a
1532  	 * consistent story about which filesystems support copy_file_range()
1533  	 * and which filesystems do not, that will allow userspace tools to
1534  	 * make consistent desicions w.r.t using copy_file_range().
1535  	 *
1536  	 * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE.
1537  	 */
1538  	ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
1539  				      flags);
1540  
1541  done:
1542  	if (ret > 0) {
1543  		fsnotify_access(file_in);
1544  		add_rchar(current, ret);
1545  		fsnotify_modify(file_out);
1546  		add_wchar(current, ret);
1547  	}
1548  
1549  	inc_syscr(current);
1550  	inc_syscw(current);
1551  
1552  	file_end_write(file_out);
1553  
1554  	return ret;
1555  }
1556  EXPORT_SYMBOL(vfs_copy_file_range);
1557  
/*
 * copy_file_range system call.
 *
 * Each of @off_in / @off_out is optional: when given, the offset is read
 * from user space and the advanced value is stored back after a successful
 * copy; when NULL, the corresponding file's f_pos is used and advanced.
 * Any non-zero @flags is rejected with -EINVAL.  A failure to store an
 * updated offset turns the result into -EFAULT.
 */
SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
		int, fd_out, loff_t __user *, off_out,
		size_t, len, unsigned int, flags)
{
	struct fd f_in, f_out;
	loff_t pos_in, pos_out;
	ssize_t ret;

	f_in = fdget(fd_in);
	if (!f_in.file)
		return -EBADF;

	f_out = fdget(fd_out);
	if (!f_out.file) {
		ret = -EBADF;
		goto put_in;
	}

	/* Work out the starting offset on each side. */
	if (!off_in) {
		pos_in = f_in.file->f_pos;
	} else if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) {
		ret = -EFAULT;
		goto put_out;
	}

	if (!off_out) {
		pos_out = f_out.file->f_pos;
	} else if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) {
		ret = -EFAULT;
		goto put_out;
	}

	if (flags) {
		ret = -EINVAL;
		goto put_out;
	}

	ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
				  flags);
	if (ret <= 0)
		goto put_out;

	/* Some bytes were copied: advance and publish both offsets. */
	pos_in += ret;
	pos_out += ret;

	if (!off_in)
		f_in.file->f_pos = pos_in;
	else if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
		ret = -EFAULT;

	if (!off_out)
		f_out.file->f_pos = pos_out;
	else if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
		ret = -EFAULT;

put_out:
	fdput(f_out);
put_in:
	fdput(f_in);
	return ret;
}
1623  
1624  /*
1625   * Don't operate on ranges the page cache doesn't support, and don't exceed the
1626   * LFS limits.  If pos is under the limit it becomes a short access.  If it
1627   * exceeds the limit we return -EFBIG.
1628   */
generic_write_check_limits(struct file * file,loff_t pos,loff_t * count)1629  int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count)
1630  {
1631  	struct inode *inode = file->f_mapping->host;
1632  	loff_t max_size = inode->i_sb->s_maxbytes;
1633  	loff_t limit = rlimit(RLIMIT_FSIZE);
1634  
1635  	if (limit != RLIM_INFINITY) {
1636  		if (pos >= limit) {
1637  			send_sig(SIGXFSZ, current, 0);
1638  			return -EFBIG;
1639  		}
1640  		*count = min(*count, limit - pos);
1641  	}
1642  
1643  	if (!(file->f_flags & O_LARGEFILE))
1644  		max_size = MAX_NON_LFS;
1645  
1646  	if (unlikely(pos >= max_size))
1647  		return -EFBIG;
1648  
1649  	*count = min(*count, max_size - pos);
1650  
1651  	return 0;
1652  }
1653  
1654  /* Like generic_write_checks(), but takes size of write instead of iter. */
generic_write_checks_count(struct kiocb * iocb,loff_t * count)1655  int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
1656  {
1657  	struct file *file = iocb->ki_filp;
1658  	struct inode *inode = file->f_mapping->host;
1659  
1660  	if (IS_SWAPFILE(inode))
1661  		return -ETXTBSY;
1662  
1663  	if (!*count)
1664  		return 0;
1665  
1666  	if (iocb->ki_flags & IOCB_APPEND)
1667  		iocb->ki_pos = i_size_read(inode);
1668  
1669  	if ((iocb->ki_flags & IOCB_NOWAIT) &&
1670  	    !((iocb->ki_flags & IOCB_DIRECT) ||
1671  	      (file->f_mode & FMODE_BUF_WASYNC)))
1672  		return -EINVAL;
1673  
1674  	return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
1675  }
1676  EXPORT_SYMBOL(generic_write_checks_count);
1677  
1678  /*
1679   * Performs necessary checks before doing a write
1680   *
1681   * Can adjust writing position or amount of bytes to write.
1682   * Returns appropriate error code that caller should return or
1683   * zero in case that write should be allowed.
1684   */
generic_write_checks(struct kiocb * iocb,struct iov_iter * from)1685  ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from)
1686  {
1687  	loff_t count = iov_iter_count(from);
1688  	int ret;
1689  
1690  	ret = generic_write_checks_count(iocb, &count);
1691  	if (ret)
1692  		return ret;
1693  
1694  	iov_iter_truncate(from, count);
1695  	return iov_iter_count(from);
1696  }
1697  EXPORT_SYMBOL(generic_write_checks);
1698  
1699  /*
1700   * Performs common checks before doing a file copy/clone
1701   * from @file_in to @file_out.
1702   */
generic_file_rw_checks(struct file * file_in,struct file * file_out)1703  int generic_file_rw_checks(struct file *file_in, struct file *file_out)
1704  {
1705  	struct inode *inode_in = file_inode(file_in);
1706  	struct inode *inode_out = file_inode(file_out);
1707  
1708  	/* Don't copy dirs, pipes, sockets... */
1709  	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
1710  		return -EISDIR;
1711  	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
1712  		return -EINVAL;
1713  
1714  	if (!(file_in->f_mode & FMODE_READ) ||
1715  	    !(file_out->f_mode & FMODE_WRITE) ||
1716  	    (file_out->f_flags & O_APPEND))
1717  		return -EBADF;
1718  
1719  	return 0;
1720  }
1721