1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * linux/fs/read_write.c 4 * 5 * Copyright (C) 1991, 1992 Linus Torvalds 6 */ 7 8 #include <linux/slab.h> 9 #include <linux/stat.h> 10 #include <linux/sched/xacct.h> 11 #include <linux/fcntl.h> 12 #include <linux/file.h> 13 #include <linux/uio.h> 14 #include <linux/fsnotify.h> 15 #include <linux/security.h> 16 #include <linux/export.h> 17 #include <linux/syscalls.h> 18 #include <linux/pagemap.h> 19 #include <linux/splice.h> 20 #include <linux/compat.h> 21 #include <linux/mount.h> 22 #include <linux/fs.h> 23 #include "internal.h" 24 25 #include <linux/uaccess.h> 26 #include <asm/unistd.h> 27 28 const struct file_operations generic_ro_fops = { 29 .llseek = generic_file_llseek, 30 .read_iter = generic_file_read_iter, 31 .mmap = generic_file_readonly_mmap, 32 .splice_read = generic_file_splice_read, 33 }; 34 35 EXPORT_SYMBOL(generic_ro_fops); 36 37 static inline bool unsigned_offsets(struct file *file) 38 { 39 return file->f_mode & FMODE_UNSIGNED_OFFSET; 40 } 41 42 /** 43 * vfs_setpos - update the file offset for lseek 44 * @file: file structure in question 45 * @offset: file offset to seek to 46 * @maxsize: maximum file size 47 * 48 * This is a low-level filesystem helper for updating the file offset to 49 * the value specified by @offset if the given offset is valid and it is 50 * not equal to the current file offset. 51 * 52 * Return the specified offset on success and -EINVAL on invalid offset. 53 */ 54 loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) 55 { 56 if (offset < 0 && !unsigned_offsets(file)) 57 return -EINVAL; 58 if (offset > maxsize) 59 return -EINVAL; 60 61 if (offset != file->f_pos) { 62 file->f_pos = offset; 63 file->f_version = 0; 64 } 65 return offset; 66 } 67 EXPORT_SYMBOL(vfs_setpos); 68 69 /** 70 * generic_file_llseek_size - generic llseek implementation for regular files 71 * @file: file structure to seek on 72 * @offset: file offset to seek to 73 * @whence: type of seek 74 * @size: max size of this file in file system 75 * @eof: offset used for SEEK_END position 76 * 77 * This is a variant of generic_file_llseek that allows passing in a custom 78 * maximum file size and a custom EOF position, for e.g. hashed directories 79 * 80 * Synchronization: 81 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) 82 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. 83 * read/writes behave like SEEK_SET against seeks. 84 */ 85 loff_t 86 generic_file_llseek_size(struct file *file, loff_t offset, int whence, 87 loff_t maxsize, loff_t eof) 88 { 89 switch (whence) { 90 case SEEK_END: 91 offset += eof; 92 break; 93 case SEEK_CUR: 94 /* 95 * Here we special-case the lseek(fd, 0, SEEK_CUR) 96 * position-querying operation. Avoid rewriting the "same" 97 * f_pos value back to the file because a concurrent read(), 98 * write() or lseek() might have altered it 99 */ 100 if (offset == 0) 101 return file->f_pos; 102 /* 103 * f_lock protects against read/modify/write race with other 104 * SEEK_CURs. Note that parallel writes and reads behave 105 * like SEEK_SET. 106 */ 107 spin_lock(&file->f_lock); 108 offset = vfs_setpos(file, file->f_pos + offset, maxsize); 109 spin_unlock(&file->f_lock); 110 return offset; 111 case SEEK_DATA: 112 /* 113 * In the generic case the entire file is data, so as long as 114 * offset isn't at the end of the file then the offset is data. 115 */ 116 if ((unsigned long long)offset >= eof) 117 return -ENXIO; 118 break; 119 case SEEK_HOLE: 120 /* 121 * There is a virtual hole at the end of the file, so as long as 122 * offset isn't i_size or larger, return i_size. 123 */ 124 if ((unsigned long long)offset >= eof) 125 return -ENXIO; 126 offset = eof; 127 break; 128 } 129 130 return vfs_setpos(file, offset, maxsize); 131 } 132 EXPORT_SYMBOL(generic_file_llseek_size); 133 134 /** 135 * generic_file_llseek - generic llseek implementation for regular files 136 * @file: file structure to seek on 137 * @offset: file offset to seek to 138 * @whence: type of seek 139 * 140 * This is a generic implemenation of ->llseek useable for all normal local 141 * filesystems. It just updates the file offset to the value specified by 142 * @offset and @whence. 143 */ 144 loff_t generic_file_llseek(struct file *file, loff_t offset, int whence) 145 { 146 struct inode *inode = file->f_mapping->host; 147 148 return generic_file_llseek_size(file, offset, whence, 149 inode->i_sb->s_maxbytes, 150 i_size_read(inode)); 151 } 152 EXPORT_SYMBOL(generic_file_llseek); 153 154 /** 155 * fixed_size_llseek - llseek implementation for fixed-sized devices 156 * @file: file structure to seek on 157 * @offset: file offset to seek to 158 * @whence: type of seek 159 * @size: size of the file 160 * 161 */ 162 loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size) 163 { 164 switch (whence) { 165 case SEEK_SET: case SEEK_CUR: case SEEK_END: 166 return generic_file_llseek_size(file, offset, whence, 167 size, size); 168 default: 169 return -EINVAL; 170 } 171 } 172 EXPORT_SYMBOL(fixed_size_llseek); 173 174 /** 175 * no_seek_end_llseek - llseek implementation for fixed-sized devices 176 * @file: file structure to seek on 177 * @offset: file offset to seek to 178 * @whence: type of seek 179 * 180 */ 181 loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence) 182 { 183 switch (whence) { 184 case SEEK_SET: case SEEK_CUR: 185 return generic_file_llseek_size(file, offset, whence, 186 OFFSET_MAX, 0); 187 default: 188 return -EINVAL; 189 } 190 } 191 EXPORT_SYMBOL(no_seek_end_llseek); 192 193 /** 194 * no_seek_end_llseek_size - llseek implementation for fixed-sized devices 195 * @file: file structure to seek on 196 * @offset: file offset to seek to 197 * @whence: type of seek 198 * @size: maximal offset allowed 199 * 200 */ 201 loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size) 202 { 203 switch (whence) { 204 case SEEK_SET: case SEEK_CUR: 205 return generic_file_llseek_size(file, offset, whence, 206 size, 0); 207 default: 208 return -EINVAL; 209 } 210 } 211 EXPORT_SYMBOL(no_seek_end_llseek_size); 212 213 /** 214 * noop_llseek - No Operation Performed llseek implementation 215 * @file: file structure to seek on 216 * @offset: file offset to seek to 217 * @whence: type of seek 218 * 219 * This is an implementation of ->llseek useable for the rare special case when 220 * userspace expects the seek to succeed but the (device) file is actually not 221 * able to perform the seek. In this case you use noop_llseek() instead of 222 * falling back to the default implementation of ->llseek. 223 */ 224 loff_t noop_llseek(struct file *file, loff_t offset, int whence) 225 { 226 return file->f_pos; 227 } 228 EXPORT_SYMBOL(noop_llseek); 229 230 loff_t default_llseek(struct file *file, loff_t offset, int whence) 231 { 232 struct inode *inode = file_inode(file); 233 loff_t retval; 234 235 inode_lock(inode); 236 switch (whence) { 237 case SEEK_END: 238 offset += i_size_read(inode); 239 break; 240 case SEEK_CUR: 241 if (offset == 0) { 242 retval = file->f_pos; 243 goto out; 244 } 245 offset += file->f_pos; 246 break; 247 case SEEK_DATA: 248 /* 249 * In the generic case the entire file is data, so as 250 * long as offset isn't at the end of the file then the 251 * offset is data. 252 */ 253 if (offset >= inode->i_size) { 254 retval = -ENXIO; 255 goto out; 256 } 257 break; 258 case SEEK_HOLE: 259 /* 260 * There is a virtual hole at the end of the file, so 261 * as long as offset isn't i_size or larger, return 262 * i_size. 263 */ 264 if (offset >= inode->i_size) { 265 retval = -ENXIO; 266 goto out; 267 } 268 offset = inode->i_size; 269 break; 270 } 271 retval = -EINVAL; 272 if (offset >= 0 || unsigned_offsets(file)) { 273 if (offset != file->f_pos) { 274 file->f_pos = offset; 275 file->f_version = 0; 276 } 277 retval = offset; 278 } 279 out: 280 inode_unlock(inode); 281 return retval; 282 } 283 EXPORT_SYMBOL(default_llseek); 284 285 loff_t vfs_llseek(struct file *file, loff_t offset, int whence) 286 { 287 if (!(file->f_mode & FMODE_LSEEK)) 288 return -ESPIPE; 289 return file->f_op->llseek(file, offset, whence); 290 } 291 EXPORT_SYMBOL(vfs_llseek); 292 293 static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence) 294 { 295 off_t retval; 296 struct fd f = fdget_pos(fd); 297 if (!f.file) 298 return -EBADF; 299 300 retval = -EINVAL; 301 if (whence <= SEEK_MAX) { 302 loff_t res = vfs_llseek(f.file, offset, whence); 303 retval = res; 304 if (res != (loff_t)retval) 305 retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ 306 } 307 fdput_pos(f); 308 return retval; 309 } 310 311 SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) 312 { 313 return ksys_lseek(fd, offset, whence); 314 } 315 316 #ifdef CONFIG_COMPAT 317 COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence) 318 { 319 return ksys_lseek(fd, offset, whence); 320 } 321 #endif 322 323 #if !defined(CONFIG_64BIT) || defined(CONFIG_COMPAT) || \ 324 defined(__ARCH_WANT_SYS_LLSEEK) 325 SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, 326 unsigned long, offset_low, loff_t __user *, result, 327 unsigned int, whence) 328 { 329 int retval; 330 struct fd f = fdget_pos(fd); 331 loff_t offset; 332 333 if (!f.file) 334 return -EBADF; 335 336 retval = -EINVAL; 337 if (whence > SEEK_MAX) 338 goto out_putf; 339 340 offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low, 341 whence); 342 343 retval = (int)offset; 344 if (offset >= 0) { 345 retval = -EFAULT; 346 if (!copy_to_user(result, &offset, sizeof(offset))) 347 retval = 0; 348 } 349 out_putf: 350 fdput_pos(f); 351 return retval; 352 } 353 #endif 354 355 int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) 356 { 357 if (unlikely((ssize_t) count < 0)) 358 return -EINVAL; 359 360 if (ppos) { 361 loff_t pos = *ppos; 362 363 if (unlikely(pos < 0)) { 364 if (!unsigned_offsets(file)) 365 return -EINVAL; 366 if (count >= -pos) /* both values are in 0..LLONG_MAX */ 367 return -EOVERFLOW; 368 } else if (unlikely((loff_t) (pos + count) < 0)) { 369 if (!unsigned_offsets(file)) 370 return -EINVAL; 371 } 372 } 373 374 return security_file_permission(file, 375 read_write == READ ? MAY_READ : MAY_WRITE); 376 } 377 EXPORT_SYMBOL(rw_verify_area); 378 379 static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) 380 { 381 struct kiocb kiocb; 382 struct iov_iter iter; 383 ssize_t ret; 384 385 init_sync_kiocb(&kiocb, filp); 386 kiocb.ki_pos = (ppos ? *ppos : 0); 387 iov_iter_ubuf(&iter, ITER_DEST, buf, len); 388 389 ret = call_read_iter(filp, &kiocb, &iter); 390 BUG_ON(ret == -EIOCBQUEUED); 391 if (ppos) 392 *ppos = kiocb.ki_pos; 393 return ret; 394 } 395 396 static int warn_unsupported(struct file *file, const char *op) 397 { 398 pr_warn_ratelimited( 399 "kernel %s not supported for file %pD4 (pid: %d comm: %.20s)\n", 400 op, file, current->pid, current->comm); 401 return -EINVAL; 402 } 403 404 ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) 405 { 406 struct kvec iov = { 407 .iov_base = buf, 408 .iov_len = min_t(size_t, count, MAX_RW_COUNT), 409 }; 410 struct kiocb kiocb; 411 struct iov_iter iter; 412 ssize_t ret; 413 414 if (WARN_ON_ONCE(!(file->f_mode & FMODE_READ))) 415 return -EINVAL; 416 if (!(file->f_mode & FMODE_CAN_READ)) 417 return -EINVAL; 418 /* 419 * Also fail if ->read_iter and ->read are both wired up as that 420 * implies very convoluted semantics. 421 */ 422 if (unlikely(!file->f_op->read_iter || file->f_op->read)) 423 return warn_unsupported(file, "read"); 424 425 init_sync_kiocb(&kiocb, file); 426 kiocb.ki_pos = pos ? *pos : 0; 427 iov_iter_kvec(&iter, ITER_DEST, &iov, 1, iov.iov_len); 428 ret = file->f_op->read_iter(&kiocb, &iter); 429 if (ret > 0) { 430 if (pos) 431 *pos = kiocb.ki_pos; 432 fsnotify_access(file); 433 add_rchar(current, ret); 434 } 435 inc_syscr(current); 436 return ret; 437 } 438 439 ssize_t kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) 440 { 441 ssize_t ret; 442 443 ret = rw_verify_area(READ, file, pos, count); 444 if (ret) 445 return ret; 446 return __kernel_read(file, buf, count, pos); 447 } 448 EXPORT_SYMBOL(kernel_read); 449 450 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) 451 { 452 ssize_t ret; 453 454 if (!(file->f_mode & FMODE_READ)) 455 return -EBADF; 456 if (!(file->f_mode & FMODE_CAN_READ)) 457 return -EINVAL; 458 if (unlikely(!access_ok(buf, count))) 459 return -EFAULT; 460 461 ret = rw_verify_area(READ, file, pos, count); 462 if (ret) 463 return ret; 464 if (count > MAX_RW_COUNT) 465 count = MAX_RW_COUNT; 466 467 if (file->f_op->read) 468 ret = file->f_op->read(file, buf, count, pos); 469 else if (file->f_op->read_iter) 470 ret = new_sync_read(file, buf, count, pos); 471 else 472 ret = -EINVAL; 473 if (ret > 0) { 474 fsnotify_access(file); 475 add_rchar(current, ret); 476 } 477 inc_syscr(current); 478 return ret; 479 } 480 481 static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) 482 { 483 struct kiocb kiocb; 484 struct iov_iter iter; 485 ssize_t ret; 486 487 init_sync_kiocb(&kiocb, filp); 488 kiocb.ki_pos = (ppos ? *ppos : 0); 489 iov_iter_ubuf(&iter, ITER_SOURCE, (void __user *)buf, len); 490 491 ret = call_write_iter(filp, &kiocb, &iter); 492 BUG_ON(ret == -EIOCBQUEUED); 493 if (ret > 0 && ppos) 494 *ppos = kiocb.ki_pos; 495 return ret; 496 } 497 498 /* caller is responsible for file_start_write/file_end_write */ 499 ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos) 500 { 501 struct kiocb kiocb; 502 ssize_t ret; 503 504 if (WARN_ON_ONCE(!(file->f_mode & FMODE_WRITE))) 505 return -EBADF; 506 if (!(file->f_mode & FMODE_CAN_WRITE)) 507 return -EINVAL; 508 /* 509 * Also fail if ->write_iter and ->write are both wired up as that 510 * implies very convoluted semantics. 511 */ 512 if (unlikely(!file->f_op->write_iter || file->f_op->write)) 513 return warn_unsupported(file, "write"); 514 515 init_sync_kiocb(&kiocb, file); 516 kiocb.ki_pos = pos ? *pos : 0; 517 ret = file->f_op->write_iter(&kiocb, from); 518 if (ret > 0) { 519 if (pos) 520 *pos = kiocb.ki_pos; 521 fsnotify_modify(file); 522 add_wchar(current, ret); 523 } 524 inc_syscw(current); 525 return ret; 526 } 527 528 /* caller is responsible for file_start_write/file_end_write */ 529 ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos) 530 { 531 struct kvec iov = { 532 .iov_base = (void *)buf, 533 .iov_len = min_t(size_t, count, MAX_RW_COUNT), 534 }; 535 struct iov_iter iter; 536 iov_iter_kvec(&iter, ITER_SOURCE, &iov, 1, iov.iov_len); 537 return __kernel_write_iter(file, &iter, pos); 538 } 539 /* 540 * This "EXPORT_SYMBOL_GPL()" is more of a "EXPORT_SYMBOL_DONTUSE()", 541 * but autofs is one of the few internal kernel users that actually 542 * wants this _and_ can be built as a module. So we need to export 543 * this symbol for autofs, even though it really isn't appropriate 544 * for any other kernel modules. 545 */ 546 EXPORT_SYMBOL_GPL(__kernel_write); 547 548 ssize_t kernel_write(struct file *file, const void *buf, size_t count, 549 loff_t *pos) 550 { 551 ssize_t ret; 552 553 ret = rw_verify_area(WRITE, file, pos, count); 554 if (ret) 555 return ret; 556 557 file_start_write(file); 558 ret = __kernel_write(file, buf, count, pos); 559 file_end_write(file); 560 return ret; 561 } 562 EXPORT_SYMBOL(kernel_write); 563 564 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) 565 { 566 ssize_t ret; 567 568 if (!(file->f_mode & FMODE_WRITE)) 569 return -EBADF; 570 if (!(file->f_mode & FMODE_CAN_WRITE)) 571 return -EINVAL; 572 if (unlikely(!access_ok(buf, count))) 573 return -EFAULT; 574 575 ret = rw_verify_area(WRITE, file, pos, count); 576 if (ret) 577 return ret; 578 if (count > MAX_RW_COUNT) 579 count = MAX_RW_COUNT; 580 file_start_write(file); 581 if (file->f_op->write) 582 ret = file->f_op->write(file, buf, count, pos); 583 else if (file->f_op->write_iter) 584 ret = new_sync_write(file, buf, count, pos); 585 else 586 ret = -EINVAL; 587 if (ret > 0) { 588 fsnotify_modify(file); 589 add_wchar(current, ret); 590 } 591 inc_syscw(current); 592 file_end_write(file); 593 return ret; 594 } 595 596 /* file_ppos returns &file->f_pos or NULL if file is stream */ 597 static inline loff_t *file_ppos(struct file *file) 598 { 599 return file->f_mode & FMODE_STREAM ? NULL : &file->f_pos; 600 } 601 602 ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) 603 { 604 struct fd f = fdget_pos(fd); 605 ssize_t ret = -EBADF; 606 607 if (f.file) { 608 loff_t pos, *ppos = file_ppos(f.file); 609 if (ppos) { 610 pos = *ppos; 611 ppos = &pos; 612 } 613 ret = vfs_read(f.file, buf, count, ppos); 614 if (ret >= 0 && ppos) 615 f.file->f_pos = pos; 616 fdput_pos(f); 617 } 618 return ret; 619 } 620 621 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) 622 { 623 return ksys_read(fd, buf, count); 624 } 625 626 ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) 627 { 628 struct fd f = fdget_pos(fd); 629 ssize_t ret = -EBADF; 630 631 if (f.file) { 632 loff_t pos, *ppos = file_ppos(f.file); 633 if (ppos) { 634 pos = *ppos; 635 ppos = &pos; 636 } 637 ret = vfs_write(f.file, buf, count, ppos); 638 if (ret >= 0 && ppos) 639 f.file->f_pos = pos; 640 fdput_pos(f); 641 } 642 643 return ret; 644 } 645 646 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, 647 size_t, count) 648 { 649 return ksys_write(fd, buf, count); 650 } 651 652 ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count, 653 loff_t pos) 654 { 655 struct fd f; 656 ssize_t ret = -EBADF; 657 658 if (pos < 0) 659 return -EINVAL; 660 661 f = fdget(fd); 662 if (f.file) { 663 ret = -ESPIPE; 664 if (f.file->f_mode & FMODE_PREAD) 665 ret = vfs_read(f.file, buf, count, &pos); 666 fdput(f); 667 } 668 669 return ret; 670 } 671 672 SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, 673 size_t, count, loff_t, pos) 674 { 675 return ksys_pread64(fd, buf, count, pos); 676 } 677 678 #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PREAD64) 679 COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf, 680 size_t, count, compat_arg_u64_dual(pos)) 681 { 682 return ksys_pread64(fd, buf, count, compat_arg_u64_glue(pos)); 683 } 684 #endif 685 686 ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf, 687 size_t count, loff_t pos) 688 { 689 struct fd f; 690 ssize_t ret = -EBADF; 691 692 if (pos < 0) 693 return -EINVAL; 694 695 f = fdget(fd); 696 if (f.file) { 697 ret = -ESPIPE; 698 if (f.file->f_mode & FMODE_PWRITE) 699 ret = vfs_write(f.file, buf, count, &pos); 700 fdput(f); 701 } 702 703 return ret; 704 } 705 706 SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, 707 size_t, count, loff_t, pos) 708 { 709 return ksys_pwrite64(fd, buf, count, pos); 710 } 711 712 #if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_PWRITE64) 713 COMPAT_SYSCALL_DEFINE5(pwrite64, unsigned int, fd, const char __user *, buf, 714 size_t, count, compat_arg_u64_dual(pos)) 715 { 716 return ksys_pwrite64(fd, buf, count, compat_arg_u64_glue(pos)); 717 } 718 #endif 719 720 static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, 721 loff_t *ppos, int type, rwf_t flags) 722 { 723 struct kiocb kiocb; 724 ssize_t ret; 725 726 init_sync_kiocb(&kiocb, filp); 727 ret = kiocb_set_rw_flags(&kiocb, flags); 728 if (ret) 729 return ret; 730 kiocb.ki_pos = (ppos ? *ppos : 0); 731 732 if (type == READ) 733 ret = call_read_iter(filp, &kiocb, iter); 734 else 735 ret = call_write_iter(filp, &kiocb, iter); 736 BUG_ON(ret == -EIOCBQUEUED); 737 if (ppos) 738 *ppos = kiocb.ki_pos; 739 return ret; 740 } 741 742 /* Do it by hand, with file-ops */ 743 static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, 744 loff_t *ppos, int type, rwf_t flags) 745 { 746 ssize_t ret = 0; 747 748 if (flags & ~RWF_HIPRI) 749 return -EOPNOTSUPP; 750 751 while (iov_iter_count(iter)) { 752 struct iovec iovec = iov_iter_iovec(iter); 753 ssize_t nr; 754 755 if (type == READ) { 756 nr = filp->f_op->read(filp, iovec.iov_base, 757 iovec.iov_len, ppos); 758 } else { 759 nr = filp->f_op->write(filp, iovec.iov_base, 760 iovec.iov_len, ppos); 761 } 762 763 if (nr < 0) { 764 if (!ret) 765 ret = nr; 766 break; 767 } 768 ret += nr; 769 if (nr != iovec.iov_len) 770 break; 771 iov_iter_advance(iter, nr); 772 } 773 774 return ret; 775 } 776 777 static ssize_t do_iter_read(struct file *file, struct iov_iter *iter, 778 loff_t *pos, rwf_t flags) 779 { 780 size_t tot_len; 781 ssize_t ret = 0; 782 783 if (!(file->f_mode & FMODE_READ)) 784 return -EBADF; 785 if (!(file->f_mode & FMODE_CAN_READ)) 786 return -EINVAL; 787 788 tot_len = iov_iter_count(iter); 789 if (!tot_len) 790 goto out; 791 ret = rw_verify_area(READ, file, pos, tot_len); 792 if (ret < 0) 793 return ret; 794 795 if (file->f_op->read_iter) 796 ret = do_iter_readv_writev(file, iter, pos, READ, flags); 797 else 798 ret = do_loop_readv_writev(file, iter, pos, READ, flags); 799 out: 800 if (ret >= 0) 801 fsnotify_access(file); 802 return ret; 803 } 804 805 ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, 806 struct iov_iter *iter) 807 { 808 size_t tot_len; 809 ssize_t ret = 0; 810 811 if (!file->f_op->read_iter) 812 return -EINVAL; 813 if (!(file->f_mode & FMODE_READ)) 814 return -EBADF; 815 if (!(file->f_mode & FMODE_CAN_READ)) 816 return -EINVAL; 817 818 tot_len = iov_iter_count(iter); 819 if (!tot_len) 820 goto out; 821 ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len); 822 if (ret < 0) 823 return ret; 824 825 ret = call_read_iter(file, iocb, iter); 826 out: 827 if (ret >= 0) 828 fsnotify_access(file); 829 return ret; 830 } 831 EXPORT_SYMBOL(vfs_iocb_iter_read); 832 833 ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, 834 rwf_t flags) 835 { 836 if (!file->f_op->read_iter) 837 return -EINVAL; 838 return do_iter_read(file, iter, ppos, flags); 839 } 840 EXPORT_SYMBOL(vfs_iter_read); 841 842 static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, 843 loff_t *pos, rwf_t flags) 844 { 845 size_t tot_len; 846 ssize_t ret = 0; 847 848 if (!(file->f_mode & FMODE_WRITE)) 849 return -EBADF; 850 if (!(file->f_mode & FMODE_CAN_WRITE)) 851 return -EINVAL; 852 853 tot_len = iov_iter_count(iter); 854 if (!tot_len) 855 return 0; 856 ret = rw_verify_area(WRITE, file, pos, tot_len); 857 if (ret < 0) 858 return ret; 859 860 if (file->f_op->write_iter) 861 ret = do_iter_readv_writev(file, iter, pos, WRITE, flags); 862 else 863 ret = do_loop_readv_writev(file, iter, pos, WRITE, flags); 864 if (ret > 0) 865 fsnotify_modify(file); 866 return ret; 867 } 868 869 ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, 870 struct iov_iter *iter) 871 { 872 size_t tot_len; 873 ssize_t ret = 0; 874 875 if (!file->f_op->write_iter) 876 return -EINVAL; 877 if (!(file->f_mode & FMODE_WRITE)) 878 return -EBADF; 879 if (!(file->f_mode & FMODE_CAN_WRITE)) 880 return -EINVAL; 881 882 tot_len = iov_iter_count(iter); 883 if (!tot_len) 884 return 0; 885 ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len); 886 if (ret < 0) 887 return ret; 888 889 ret = call_write_iter(file, iocb, iter); 890 if (ret > 0) 891 fsnotify_modify(file); 892 893 return ret; 894 } 895 EXPORT_SYMBOL(vfs_iocb_iter_write); 896 897 ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, 898 rwf_t flags) 899 { 900 if (!file->f_op->write_iter) 901 return -EINVAL; 902 return do_iter_write(file, iter, ppos, flags); 903 } 904 EXPORT_SYMBOL(vfs_iter_write); 905 906 static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, 907 unsigned long vlen, loff_t *pos, rwf_t flags) 908 { 909 struct iovec iovstack[UIO_FASTIOV]; 910 struct iovec *iov = iovstack; 911 struct iov_iter iter; 912 ssize_t ret; 913 914 ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); 915 if (ret >= 0) { 916 ret = do_iter_read(file, &iter, pos, flags); 917 kfree(iov); 918 } 919 920 return ret; 921 } 922 923 static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, 924 unsigned long vlen, loff_t *pos, rwf_t flags) 925 { 926 struct iovec iovstack[UIO_FASTIOV]; 927 struct iovec *iov = iovstack; 928 struct iov_iter iter; 929 ssize_t ret; 930 931 ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); 932 if (ret >= 0) { 933 file_start_write(file); 934 ret = do_iter_write(file, &iter, pos, flags); 935 file_end_write(file); 936 kfree(iov); 937 } 938 return ret; 939 } 940 941 static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, 942 unsigned long vlen, rwf_t flags) 943 { 944 struct fd f = fdget_pos(fd); 945 ssize_t ret = -EBADF; 946 947 if (f.file) { 948 loff_t pos, *ppos = file_ppos(f.file); 949 if (ppos) { 950 pos = *ppos; 951 ppos = &pos; 952 } 953 ret = vfs_readv(f.file, vec, vlen, ppos, flags); 954 if (ret >= 0 && ppos) 955 f.file->f_pos = pos; 956 fdput_pos(f); 957 } 958 959 if (ret > 0) 960 add_rchar(current, ret); 961 inc_syscr(current); 962 return ret; 963 } 964 965 static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, 966 unsigned long vlen, rwf_t flags) 967 { 968 struct fd f = fdget_pos(fd); 969 ssize_t ret = -EBADF; 970 971 if (f.file) { 972 loff_t pos, *ppos = file_ppos(f.file); 973 if (ppos) { 974 pos = *ppos; 975 ppos = &pos; 976 } 977 ret = vfs_writev(f.file, vec, vlen, ppos, flags); 978 if (ret >= 0 && ppos) 979 f.file->f_pos = pos; 980 fdput_pos(f); 981 } 982 983 if (ret > 0) 984 add_wchar(current, ret); 985 inc_syscw(current); 986 return ret; 987 } 988 989 static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) 990 { 991 #define HALF_LONG_BITS (BITS_PER_LONG / 2) 992 return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; 993 } 994 995 static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, 996 unsigned long vlen, loff_t pos, rwf_t flags) 997 { 998 struct fd f; 999 ssize_t ret = -EBADF; 1000 1001 if (pos < 0) 1002 return -EINVAL; 1003 1004 f = fdget(fd); 1005 if (f.file) { 1006 ret = -ESPIPE; 1007 if (f.file->f_mode & FMODE_PREAD) 1008 ret = vfs_readv(f.file, vec, vlen, &pos, flags); 1009 fdput(f); 1010 } 1011 1012 if (ret > 0) 1013 add_rchar(current, ret); 1014 inc_syscr(current); 1015 return ret; 1016 } 1017 1018 static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec, 1019 unsigned long vlen, loff_t pos, rwf_t flags) 1020 { 1021 struct fd f; 1022 ssize_t ret = -EBADF; 1023 1024 if (pos < 0) 1025 return -EINVAL; 1026 1027 f = fdget(fd); 1028 if (f.file) { 1029 ret = -ESPIPE; 1030 if (f.file->f_mode & FMODE_PWRITE) 1031 ret = vfs_writev(f.file, vec, vlen, &pos, flags); 1032 fdput(f); 1033 } 1034 1035 if (ret > 0) 1036 add_wchar(current, ret); 1037 inc_syscw(current); 1038 return ret; 1039 } 1040 1041 SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, 1042 unsigned long, vlen) 1043 { 1044 return do_readv(fd, vec, vlen, 0); 1045 } 1046 1047 SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, 1048 unsigned long, vlen) 1049 { 1050 return do_writev(fd, vec, vlen, 0); 1051 } 1052 1053 SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, 1054 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) 1055 { 1056 loff_t pos = pos_from_hilo(pos_h, pos_l); 1057 1058 return do_preadv(fd, vec, vlen, pos, 0); 1059 } 1060 1061 SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec, 1062 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, 1063 rwf_t, flags) 1064 { 1065 loff_t pos = pos_from_hilo(pos_h, pos_l); 1066 1067 if (pos == -1) 1068 return do_readv(fd, vec, vlen, flags); 1069 1070 return do_preadv(fd, vec, vlen, pos, flags); 1071 } 1072 1073 SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, 1074 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) 1075 { 1076 loff_t pos = pos_from_hilo(pos_h, pos_l); 1077 1078 return do_pwritev(fd, vec, vlen, pos, 0); 1079 } 1080 1081 SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, 1082 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, 1083 rwf_t, flags) 1084 { 1085 loff_t pos = pos_from_hilo(pos_h, pos_l); 1086 1087 if (pos == -1) 1088 return do_writev(fd, vec, vlen, flags); 1089 1090 return do_pwritev(fd, vec, vlen, pos, flags); 1091 } 1092 1093 /* 1094 * Various compat syscalls. Note that they all pretend to take a native 1095 * iovec - import_iovec will properly treat those as compat_iovecs based on 1096 * in_compat_syscall(). 1097 */ 1098 #ifdef CONFIG_COMPAT 1099 #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 1100 COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, 1101 const struct iovec __user *, vec, 1102 unsigned long, vlen, loff_t, pos) 1103 { 1104 return do_preadv(fd, vec, vlen, pos, 0); 1105 } 1106 #endif 1107 1108 COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, 1109 const struct iovec __user *, vec, 1110 compat_ulong_t, vlen, u32, pos_low, u32, pos_high) 1111 { 1112 loff_t pos = ((loff_t)pos_high << 32) | pos_low; 1113 1114 return do_preadv(fd, vec, vlen, pos, 0); 1115 } 1116 1117 #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 1118 COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, 1119 const struct iovec __user *, vec, 1120 unsigned long, vlen, loff_t, pos, rwf_t, flags) 1121 { 1122 if (pos == -1) 1123 return do_readv(fd, vec, vlen, flags); 1124 return do_preadv(fd, vec, vlen, pos, flags); 1125 } 1126 #endif 1127 1128 COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, 1129 const struct iovec __user *, vec, 1130 compat_ulong_t, vlen, u32, pos_low, u32, pos_high, 1131 rwf_t, flags) 1132 { 1133 loff_t pos = ((loff_t)pos_high << 32) | pos_low; 1134 1135 if (pos == -1) 1136 return do_readv(fd, vec, vlen, flags); 1137 return do_preadv(fd, vec, vlen, pos, flags); 1138 } 1139 1140 #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 1141 COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, 1142 const struct iovec __user *, vec, 1143 unsigned long, vlen, loff_t, pos) 1144 { 1145 return do_pwritev(fd, vec, vlen, pos, 0); 1146 } 1147 #endif 1148 1149 COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, 1150 const struct iovec __user *,vec, 1151 compat_ulong_t, vlen, u32, pos_low, u32, pos_high) 1152 { 1153 loff_t pos = ((loff_t)pos_high << 32) | pos_low; 1154 1155 return do_pwritev(fd, vec, vlen, pos, 0); 1156 } 1157 1158 #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 1159 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, 1160 const struct iovec __user *, vec, 1161 unsigned long, vlen, loff_t, pos, rwf_t, flags) 1162 { 1163 if (pos == -1) 1164 return do_writev(fd, vec, vlen, flags); 1165 return do_pwritev(fd, vec, vlen, pos, flags); 1166 } 1167 #endif 1168 1169 COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, 1170 const struct iovec __user *,vec, 1171 compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) 1172 { 1173 loff_t pos = ((loff_t)pos_high << 32) | pos_low; 1174 1175 if (pos == -1) 1176 return do_writev(fd, vec, vlen, flags); 1177 return do_pwritev(fd, vec, vlen, pos, flags); 1178 } 1179 #endif /* CONFIG_COMPAT */ 1180 1181 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, 1182 size_t count, loff_t max) 1183 { 1184 struct fd in, out; 1185 struct inode *in_inode, *out_inode; 1186 struct pipe_inode_info *opipe; 1187 loff_t pos; 1188 loff_t out_pos; 1189 ssize_t retval; 1190 int fl; 1191 1192 /* 1193 * Get input file, and verify that it is ok.. 1194 */ 1195 retval = -EBADF; 1196 in = fdget(in_fd); 1197 if (!in.file) 1198 goto out; 1199 if (!(in.file->f_mode & FMODE_READ)) 1200 goto fput_in; 1201 retval = -ESPIPE; 1202 if (!ppos) { 1203 pos = in.file->f_pos; 1204 } else { 1205 pos = *ppos; 1206 if (!(in.file->f_mode & FMODE_PREAD)) 1207 goto fput_in; 1208 } 1209 retval = rw_verify_area(READ, in.file, &pos, count); 1210 if (retval < 0) 1211 goto fput_in; 1212 if (count > MAX_RW_COUNT) 1213 count = MAX_RW_COUNT; 1214 1215 /* 1216 * Get output file, and verify that it is ok.. 1217 */ 1218 retval = -EBADF; 1219 out = fdget(out_fd); 1220 if (!out.file) 1221 goto fput_in; 1222 if (!(out.file->f_mode & FMODE_WRITE)) 1223 goto fput_out; 1224 in_inode = file_inode(in.file); 1225 out_inode = file_inode(out.file); 1226 out_pos = out.file->f_pos; 1227 1228 if (!max) 1229 max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); 1230 1231 if (unlikely(pos + count > max)) { 1232 retval = -EOVERFLOW; 1233 if (pos >= max) 1234 goto fput_out; 1235 count = max - pos; 1236 } 1237 1238 fl = 0; 1239 #if 0 1240 /* 1241 * We need to debate whether we can enable this or not. The 1242 * man page documents EAGAIN return for the output at least, 1243 * and the application is arguably buggy if it doesn't expect 1244 * EAGAIN on a non-blocking file descriptor. 1245 */ 1246 if (in.file->f_flags & O_NONBLOCK) 1247 fl = SPLICE_F_NONBLOCK; 1248 #endif 1249 opipe = get_pipe_info(out.file, true); 1250 if (!opipe) { 1251 retval = rw_verify_area(WRITE, out.file, &out_pos, count); 1252 if (retval < 0) 1253 goto fput_out; 1254 file_start_write(out.file); 1255 retval = do_splice_direct(in.file, &pos, out.file, &out_pos, 1256 count, fl); 1257 file_end_write(out.file); 1258 } else { 1259 if (out.file->f_flags & O_NONBLOCK) 1260 fl |= SPLICE_F_NONBLOCK; 1261 1262 retval = splice_file_to_pipe(in.file, opipe, &pos, count, fl); 1263 } 1264 1265 if (retval > 0) { 1266 add_rchar(current, retval); 1267 add_wchar(current, retval); 1268 fsnotify_access(in.file); 1269 fsnotify_modify(out.file); 1270 out.file->f_pos = out_pos; 1271 if (ppos) 1272 *ppos = pos; 1273 else 1274 in.file->f_pos = pos; 1275 } 1276 1277 inc_syscr(current); 1278 inc_syscw(current); 1279 if (pos > max) 1280 retval = -EOVERFLOW; 1281 1282 fput_out: 1283 fdput(out); 1284 fput_in: 1285 fdput(in); 1286 out: 1287 return retval; 1288 } 1289 1290 SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count) 1291 { 1292 loff_t pos; 1293 off_t off; 1294 ssize_t ret; 1295 1296 if (offset) { 1297 if (unlikely(get_user(off, offset))) 1298 return -EFAULT; 1299 pos = off; 1300 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); 1301 if (unlikely(put_user(pos, offset))) 1302 return -EFAULT; 1303 return ret; 1304 } 1305 1306 return do_sendfile(out_fd, in_fd, NULL, count, 0); 1307 } 1308 1309 SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count) 1310 { 1311 loff_t pos; 1312 ssize_t ret; 1313 1314 if (offset) { 1315 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) 1316 return -EFAULT; 1317 ret = do_sendfile(out_fd, in_fd, &pos, count, 0); 1318 if (unlikely(put_user(pos, offset))) 1319 return -EFAULT; 1320 return ret; 1321 } 1322 1323 return do_sendfile(out_fd, in_fd, NULL, count, 0); 1324 } 1325 1326 #ifdef CONFIG_COMPAT 1327 COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, 1328 compat_off_t __user *, offset, compat_size_t, count) 1329 { 1330 loff_t pos; 1331 off_t off; 1332 ssize_t ret; 1333 1334 if (offset) { 1335 if (unlikely(get_user(off, offset))) 1336 return -EFAULT; 1337 pos = off; 1338 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); 1339 if (unlikely(put_user(pos, offset))) 1340 return -EFAULT; 1341 return ret; 1342 } 1343 1344 return do_sendfile(out_fd, in_fd, NULL, count, 0); 1345 } 1346 1347 COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, 1348 compat_loff_t __user *, offset, compat_size_t, count) 1349 { 1350 loff_t pos; 1351 ssize_t ret; 1352 1353 if (offset) { 1354 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) 1355 return -EFAULT; 1356 ret = do_sendfile(out_fd, in_fd, &pos, count, 0); 1357 if (unlikely(put_user(pos, offset))) 1358 return -EFAULT; 1359 return ret; 1360 } 1361 1362 return do_sendfile(out_fd, in_fd, NULL, count, 0); 1363 } 1364 #endif 1365 1366 /** 1367 * generic_copy_file_range - copy data between two files 1368 * @file_in: file structure to read from 1369 * @pos_in: file offset to read from 1370 * @file_out: file structure to write data to 1371 * @pos_out: file offset to write data to 1372 * @len: amount of data to copy 1373 * @flags: copy flags 1374 * 1375 * This is a generic filesystem helper to copy data from one file to another. 1376 * It has no constraints on the source or destination file owners - the files 1377 * can belong to different superblocks and different filesystem types. Short 1378 * copies are allowed. 1379 * 1380 * This should be called from the @file_out filesystem, as per the 1381 * ->copy_file_range() method. 1382 * 1383 * Returns the number of bytes copied or a negative error indicating the 1384 * failure. 1385 */ 1386 1387 ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in, 1388 struct file *file_out, loff_t pos_out, 1389 size_t len, unsigned int flags) 1390 { 1391 lockdep_assert(sb_write_started(file_inode(file_out)->i_sb)); 1392 1393 return do_splice_direct(file_in, &pos_in, file_out, &pos_out, 1394 len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); 1395 } 1396 EXPORT_SYMBOL(generic_copy_file_range); 1397 1398 /* 1399 * Performs necessary checks before doing a file copy 1400 * 1401 * Can adjust amount of bytes to copy via @req_count argument. 1402 * Returns appropriate error code that caller should return or 1403 * zero in case the copy should be allowed. 1404 */ 1405 static int generic_copy_file_checks(struct file *file_in, loff_t pos_in, 1406 struct file *file_out, loff_t pos_out, 1407 size_t *req_count, unsigned int flags) 1408 { 1409 struct inode *inode_in = file_inode(file_in); 1410 struct inode *inode_out = file_inode(file_out); 1411 uint64_t count = *req_count; 1412 loff_t size_in; 1413 int ret; 1414 1415 ret = generic_file_rw_checks(file_in, file_out); 1416 if (ret) 1417 return ret; 1418 1419 /* 1420 * We allow some filesystems to handle cross sb copy, but passing 1421 * a file of the wrong filesystem type to filesystem driver can result 1422 * in an attempt to dereference the wrong type of ->private_data, so 1423 * avoid doing that until we really have a good reason. 1424 * 1425 * nfs and cifs define several different file_system_type structures 1426 * and several different sets of file_operations, but they all end up 1427 * using the same ->copy_file_range() function pointer. 1428 */ 1429 if (flags & COPY_FILE_SPLICE) { 1430 /* cross sb splice is allowed */ 1431 } else if (file_out->f_op->copy_file_range) { 1432 if (file_in->f_op->copy_file_range != 1433 file_out->f_op->copy_file_range) 1434 return -EXDEV; 1435 } else if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) { 1436 return -EXDEV; 1437 } 1438 1439 /* Don't touch certain kinds of inodes */ 1440 if (IS_IMMUTABLE(inode_out)) 1441 return -EPERM; 1442 1443 if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) 1444 return -ETXTBSY; 1445 1446 /* Ensure offsets don't wrap. */ 1447 if (pos_in + count < pos_in || pos_out + count < pos_out) 1448 return -EOVERFLOW; 1449 1450 /* Shorten the copy to EOF */ 1451 size_in = i_size_read(inode_in); 1452 if (pos_in >= size_in) 1453 count = 0; 1454 else 1455 count = min(count, size_in - (uint64_t)pos_in); 1456 1457 ret = generic_write_check_limits(file_out, pos_out, &count); 1458 if (ret) 1459 return ret; 1460 1461 /* Don't allow overlapped copying within the same file. */ 1462 if (inode_in == inode_out && 1463 pos_out + count > pos_in && 1464 pos_out < pos_in + count) 1465 return -EINVAL; 1466 1467 *req_count = count; 1468 return 0; 1469 } 1470 1471 /* 1472 * copy_file_range() differs from regular file read and write in that it 1473 * specifically allows return partial success. When it does so is up to 1474 * the copy_file_range method. 1475 */ 1476 ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, 1477 struct file *file_out, loff_t pos_out, 1478 size_t len, unsigned int flags) 1479 { 1480 ssize_t ret; 1481 bool splice = flags & COPY_FILE_SPLICE; 1482 1483 if (flags & ~COPY_FILE_SPLICE) 1484 return -EINVAL; 1485 1486 ret = generic_copy_file_checks(file_in, pos_in, file_out, pos_out, &len, 1487 flags); 1488 if (unlikely(ret)) 1489 return ret; 1490 1491 ret = rw_verify_area(READ, file_in, &pos_in, len); 1492 if (unlikely(ret)) 1493 return ret; 1494 1495 ret = rw_verify_area(WRITE, file_out, &pos_out, len); 1496 if (unlikely(ret)) 1497 return ret; 1498 1499 if (len == 0) 1500 return 0; 1501 1502 file_start_write(file_out); 1503 1504 /* 1505 * Cloning is supported by more file systems, so we implement copy on 1506 * same sb using clone, but for filesystems where both clone and copy 1507 * are supported (e.g. nfs,cifs), we only call the copy method. 1508 */ 1509 if (!splice && file_out->f_op->copy_file_range) { 1510 ret = file_out->f_op->copy_file_range(file_in, pos_in, 1511 file_out, pos_out, 1512 len, flags); 1513 goto done; 1514 } 1515 1516 if (!splice && file_in->f_op->remap_file_range && 1517 file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) { 1518 ret = file_in->f_op->remap_file_range(file_in, pos_in, 1519 file_out, pos_out, 1520 min_t(loff_t, MAX_RW_COUNT, len), 1521 REMAP_FILE_CAN_SHORTEN); 1522 if (ret > 0) 1523 goto done; 1524 } 1525 1526 /* 1527 * We can get here for same sb copy of filesystems that do not implement 1528 * ->copy_file_range() in case filesystem does not support clone or in 1529 * case filesystem supports clone but rejected the clone request (e.g. 1530 * because it was not block aligned). 1531 * 1532 * In both cases, fall back to kernel copy so we are able to maintain a 1533 * consistent story about which filesystems support copy_file_range() 1534 * and which filesystems do not, that will allow userspace tools to 1535 * make consistent desicions w.r.t using copy_file_range(). 1536 * 1537 * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE. 1538 */ 1539 ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len, 1540 flags); 1541 1542 done: 1543 if (ret > 0) { 1544 fsnotify_access(file_in); 1545 add_rchar(current, ret); 1546 fsnotify_modify(file_out); 1547 add_wchar(current, ret); 1548 } 1549 1550 inc_syscr(current); 1551 inc_syscw(current); 1552 1553 file_end_write(file_out); 1554 1555 return ret; 1556 } 1557 EXPORT_SYMBOL(vfs_copy_file_range); 1558 1559 SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, 1560 int, fd_out, loff_t __user *, off_out, 1561 size_t, len, unsigned int, flags) 1562 { 1563 loff_t pos_in; 1564 loff_t pos_out; 1565 struct fd f_in; 1566 struct fd f_out; 1567 ssize_t ret = -EBADF; 1568 1569 f_in = fdget(fd_in); 1570 if (!f_in.file) 1571 goto out2; 1572 1573 f_out = fdget(fd_out); 1574 if (!f_out.file) 1575 goto out1; 1576 1577 ret = -EFAULT; 1578 if (off_in) { 1579 if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) 1580 goto out; 1581 } else { 1582 pos_in = f_in.file->f_pos; 1583 } 1584 1585 if (off_out) { 1586 if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) 1587 goto out; 1588 } else { 1589 pos_out = f_out.file->f_pos; 1590 } 1591 1592 ret = -EINVAL; 1593 if (flags != 0) 1594 goto out; 1595 1596 ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len, 1597 flags); 1598 if (ret > 0) { 1599 pos_in += ret; 1600 pos_out += ret; 1601 1602 if (off_in) { 1603 if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) 1604 ret = -EFAULT; 1605 } else { 1606 f_in.file->f_pos = pos_in; 1607 } 1608 1609 if (off_out) { 1610 if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) 1611 ret = -EFAULT; 1612 } else { 1613 f_out.file->f_pos = pos_out; 1614 } 1615 } 1616 1617 out: 1618 fdput(f_out); 1619 out1: 1620 fdput(f_in); 1621 out2: 1622 return ret; 1623 } 1624 1625 /* 1626 * Don't operate on ranges the page cache doesn't support, and don't exceed the 1627 * LFS limits. If pos is under the limit it becomes a short access. If it 1628 * exceeds the limit we return -EFBIG. 1629 */ 1630 int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count) 1631 { 1632 struct inode *inode = file->f_mapping->host; 1633 loff_t max_size = inode->i_sb->s_maxbytes; 1634 loff_t limit = rlimit(RLIMIT_FSIZE); 1635 1636 if (limit != RLIM_INFINITY) { 1637 if (pos >= limit) { 1638 send_sig(SIGXFSZ, current, 0); 1639 return -EFBIG; 1640 } 1641 *count = min(*count, limit - pos); 1642 } 1643 1644 if (!(file->f_flags & O_LARGEFILE)) 1645 max_size = MAX_NON_LFS; 1646 1647 if (unlikely(pos >= max_size)) 1648 return -EFBIG; 1649 1650 *count = min(*count, max_size - pos); 1651 1652 return 0; 1653 } 1654 1655 /* Like generic_write_checks(), but takes size of write instead of iter. */ 1656 int generic_write_checks_count(struct kiocb *iocb, loff_t *count) 1657 { 1658 struct file *file = iocb->ki_filp; 1659 struct inode *inode = file->f_mapping->host; 1660 1661 if (IS_SWAPFILE(inode)) 1662 return -ETXTBSY; 1663 1664 if (!*count) 1665 return 0; 1666 1667 if (iocb->ki_flags & IOCB_APPEND) 1668 iocb->ki_pos = i_size_read(inode); 1669 1670 if ((iocb->ki_flags & IOCB_NOWAIT) && 1671 !((iocb->ki_flags & IOCB_DIRECT) || 1672 (file->f_mode & FMODE_BUF_WASYNC))) 1673 return -EINVAL; 1674 1675 return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); 1676 } 1677 EXPORT_SYMBOL(generic_write_checks_count); 1678 1679 /* 1680 * Performs necessary checks before doing a write 1681 * 1682 * Can adjust writing position or amount of bytes to write. 1683 * Returns appropriate error code that caller should return or 1684 * zero in case that write should be allowed. 1685 */ 1686 ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) 1687 { 1688 loff_t count = iov_iter_count(from); 1689 int ret; 1690 1691 ret = generic_write_checks_count(iocb, &count); 1692 if (ret) 1693 return ret; 1694 1695 iov_iter_truncate(from, count); 1696 return iov_iter_count(from); 1697 } 1698 EXPORT_SYMBOL(generic_write_checks); 1699 1700 /* 1701 * Performs common checks before doing a file copy/clone 1702 * from @file_in to @file_out. 1703 */ 1704 int generic_file_rw_checks(struct file *file_in, struct file *file_out) 1705 { 1706 struct inode *inode_in = file_inode(file_in); 1707 struct inode *inode_out = file_inode(file_out); 1708 1709 /* Don't copy dirs, pipes, sockets... */ 1710 if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) 1711 return -EISDIR; 1712 if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) 1713 return -EINVAL; 1714 1715 if (!(file_in->f_mode & FMODE_READ) || 1716 !(file_out->f_mode & FMODE_WRITE) || 1717 (file_out->f_flags & O_APPEND)) 1718 return -EBADF; 1719 1720 return 0; 1721 } 1722