/*
 * linux/fs/read_write.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 */

#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/sched/xacct.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <linux/mount.h>
#include <linux/fs.h>
#include "internal.h"

#include <linux/uaccess.h>
#include <asm/unistd.h>

const struct file_operations generic_ro_fops = {
	.llseek		= generic_file_llseek,
	.read_iter	= generic_file_read_iter,
	.mmap		= generic_file_readonly_mmap,
	.splice_read	= generic_file_splice_read,
};

EXPORT_SYMBOL(generic_ro_fops);

static inline int unsigned_offsets(struct file *file)
{
	return file->f_mode & FMODE_UNSIGNED_OFFSET;
}

/**
 * vfs_setpos - update the file offset for lseek
 * @file: file structure in question
 * @offset: file offset to seek to
 * @maxsize: maximum file size
 *
 * This is a low-level filesystem helper for updating the file offset to
 * the value specified by @offset if the given offset is valid and it is
 * not equal to the current file offset.
 *
 * Return the specified offset on success and -EINVAL on invalid offset.
 */
loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
{
	if (offset < 0 && !unsigned_offsets(file))
		return -EINVAL;
	if (offset > maxsize)
		return -EINVAL;

	if (offset != file->f_pos) {
		file->f_pos = offset;
		file->f_version = 0;
	}
	return offset;
}
EXPORT_SYMBOL(vfs_setpos);
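
/*
 * Illustrative sketch (hypothetical, not from this file): a driver or
 * filesystem ->llseek that only supports SEEK_SET within a known limit
 * can validate and commit the new offset with vfs_setpos().  The name
 * "example_llseek" and the 1 MiB limit are assumptions for illustration.
 *
 *	static loff_t example_llseek(struct file *file, loff_t offset,
 *				     int whence)
 *	{
 *		if (whence != SEEK_SET)
 *			return -EINVAL;
 *		return vfs_setpos(file, offset, 1024 * 1024);
 *	}
 */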

/**
 * generic_file_llseek_size - generic llseek implementation for regular files
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 * @maxsize: max size of this file in file system
 * @eof: offset used for SEEK_END position
 *
 * This is a variant of generic_file_llseek that allows passing in a custom
 * maximum file size and a custom EOF position, for e.g. hashed directories
 *
 * Synchronization:
 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
 * read/writes behave like SEEK_SET against seeks.
 */
loff_t
generic_file_llseek_size(struct file *file, loff_t offset, int whence,
		loff_t maxsize, loff_t eof)
{
	switch (whence) {
	case SEEK_END:
		offset += eof;
		break;
	case SEEK_CUR:
		/*
		 * Here we special-case the lseek(fd, 0, SEEK_CUR)
		 * position-querying operation. Avoid rewriting the "same"
		 * f_pos value back to the file because a concurrent read(),
		 * write() or lseek() might have altered it
		 */
		if (offset == 0)
			return file->f_pos;
		/*
		 * f_lock protects against read/modify/write race with other
		 * SEEK_CURs. Note that parallel writes and reads behave
		 * like SEEK_SET.
		 */
		spin_lock(&file->f_lock);
		offset = vfs_setpos(file, file->f_pos + offset, maxsize);
		spin_unlock(&file->f_lock);
		return offset;
	case SEEK_DATA:
		/*
		 * In the generic case the entire file is data, so as long as
		 * offset isn't at the end of the file then the offset is data.
		 */
		if (offset >= eof)
			return -ENXIO;
		break;
	case SEEK_HOLE:
		/*
		 * There is a virtual hole at the end of the file, so as long as
		 * offset isn't i_size or larger, return i_size.
		 */
		if (offset >= eof)
			return -ENXIO;
		offset = eof;
		break;
	}

	return vfs_setpos(file, offset, maxsize);
}
EXPORT_SYMBOL(generic_file_llseek_size);

/**
 * generic_file_llseek - generic llseek implementation for regular files
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 *
 * This is a generic implementation of ->llseek usable for all normal local
 * filesystems. It just updates the file offset to the value specified by
 * @offset and @whence.
 */
loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;

	return generic_file_llseek_size(file, offset, whence,
					inode->i_sb->s_maxbytes,
					i_size_read(inode));
}
EXPORT_SYMBOL(generic_file_llseek);
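
/*
 * Illustrative sketch (hypothetical, not from this file): ordinary local
 * filesystems usually reuse generic_file_llseek by pointing their
 * file_operations at it, just as generic_ro_fops does above.  The table
 * name "example_file_operations" is an assumption for illustration.
 *
 *	static const struct file_operations example_file_operations = {
 *		.llseek		= generic_file_llseek,
 *		.read_iter	= generic_file_read_iter,
 *		.write_iter	= generic_file_write_iter,
 *		.mmap		= generic_file_mmap,
 *	};
 */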

/**
 * fixed_size_llseek - llseek implementation for fixed-sized devices
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 * @size: size of the file
 *
 */
loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
{
	switch (whence) {
	case SEEK_SET: case SEEK_CUR: case SEEK_END:
		return generic_file_llseek_size(file, offset, whence,
						size, size);
	default:
		return -EINVAL;
	}
}
EXPORT_SYMBOL(fixed_size_llseek);

/**
 * no_seek_end_llseek - llseek implementation for fixed-sized devices
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 *
 */
loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
{
	switch (whence) {
	case SEEK_SET: case SEEK_CUR:
		return generic_file_llseek_size(file, offset, whence,
						OFFSET_MAX, 0);
	default:
		return -EINVAL;
	}
}
EXPORT_SYMBOL(no_seek_end_llseek);

/**
 * no_seek_end_llseek_size - llseek implementation for fixed-sized devices
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 * @size: maximal offset allowed
 *
 */
loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
{
	switch (whence) {
	case SEEK_SET: case SEEK_CUR:
		return generic_file_llseek_size(file, offset, whence,
						size, 0);
	default:
		return -EINVAL;
	}
}
EXPORT_SYMBOL(no_seek_end_llseek_size);

/**
 * noop_llseek - No Operation Performed llseek implementation
 * @file: file structure to seek on
 * @offset: file offset to seek to
 * @whence: type of seek
 *
 * This is an implementation of ->llseek usable for the rare special case when
 * userspace expects the seek to succeed but the (device) file is actually not
 * able to perform the seek. In this case you use noop_llseek() instead of
 * falling back to the default implementation of ->llseek.
 */
loff_t noop_llseek(struct file *file, loff_t offset, int whence)
{
	return file->f_pos;
}
EXPORT_SYMBOL(noop_llseek);

loff_t no_llseek(struct file *file, loff_t offset, int whence)
{
	return -ESPIPE;
}
EXPORT_SYMBOL(no_llseek);

loff_t default_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file_inode(file);
	loff_t retval;

	inode_lock(inode);
	switch (whence) {
	case SEEK_END:
		offset += i_size_read(inode);
		break;
	case SEEK_CUR:
		if (offset == 0) {
			retval = file->f_pos;
			goto out;
		}
		offset += file->f_pos;
		break;
	case SEEK_DATA:
		/*
		 * In the generic case the entire file is data, so as
		 * long as offset isn't at the end of the file then the
		 * offset is data.
		 */
		if (offset >= inode->i_size) {
			retval = -ENXIO;
			goto out;
		}
		break;
	case SEEK_HOLE:
		/*
		 * There is a virtual hole at the end of the file, so
		 * as long as offset isn't i_size or larger, return
		 * i_size.
		 */
		if (offset >= inode->i_size) {
			retval = -ENXIO;
			goto out;
		}
		offset = inode->i_size;
		break;
	}
	retval = -EINVAL;
	if (offset >= 0 || unsigned_offsets(file)) {
		if (offset != file->f_pos) {
			file->f_pos = offset;
			file->f_version = 0;
		}
		retval = offset;
	}
out:
	inode_unlock(inode);
	return retval;
}
EXPORT_SYMBOL(default_llseek);

loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
{
	loff_t (*fn)(struct file *, loff_t, int);

	fn = no_llseek;
	if (file->f_mode & FMODE_LSEEK) {
		if (file->f_op->llseek)
			fn = file->f_op->llseek;
	}
	return fn(file, offset, whence);
}
EXPORT_SYMBOL(vfs_llseek);
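
/*
 * Illustrative sketch (hypothetical, not from this file): a char device
 * exposing a fixed 4 KiB register window could implement ->llseek with
 * fixed_size_llseek(), while a purely stream-like device would typically
 * use no_llseek (or call nonseekable_open() at open time).  The name
 * "example_window_llseek" is an assumption for illustration.
 *
 *	static loff_t example_window_llseek(struct file *file, loff_t offset,
 *					    int whence)
 *	{
 *		return fixed_size_llseek(file, offset, whence, 4096);
 *	}
 */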

SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
{
	off_t retval;
	struct fd f = fdget_pos(fd);
	if (!f.file)
		return -EBADF;

	retval = -EINVAL;
	if (whence <= SEEK_MAX) {
		loff_t res = vfs_llseek(f.file, offset, whence);
		retval = res;
		if (res != (loff_t)retval)
			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
	}
	fdput_pos(f);
	return retval;
}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence)
{
	return sys_lseek(fd, offset, whence);
}
#endif

#ifdef __ARCH_WANT_SYS_LLSEEK
SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
		unsigned long, offset_low, loff_t __user *, result,
		unsigned int, whence)
{
	int retval;
	struct fd f = fdget_pos(fd);
	loff_t offset;

	if (!f.file)
		return -EBADF;

	retval = -EINVAL;
	if (whence > SEEK_MAX)
		goto out_putf;

	offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
			whence);

	retval = (int)offset;
	if (offset >= 0) {
		retval = -EFAULT;
		if (!copy_to_user(result, &offset, sizeof(offset)))
			retval = 0;
	}
out_putf:
	fdput_pos(f);
	return retval;
}
#endif

int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
	struct inode *inode;
	loff_t pos;
	int retval = -EINVAL;

	inode = file_inode(file);
	if (unlikely((ssize_t) count < 0))
		return retval;
	pos = *ppos;
	if (unlikely(pos < 0)) {
		if (!unsigned_offsets(file))
			return retval;
		if (count >= -pos) /* both values are in 0..LLONG_MAX */
			return -EOVERFLOW;
	} else if (unlikely((loff_t) (pos + count) < 0)) {
		if (!unsigned_offsets(file))
			return retval;
	}

	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
		retval = locks_mandatory_area(inode, file, pos, pos + count - 1,
				read_write == READ ? F_RDLCK : F_WRLCK);
		if (retval < 0)
			return retval;
	}
	return security_file_permission(file,
				read_write == READ ? MAY_READ : MAY_WRITE);
}

static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;
	iov_iter_init(&iter, READ, &iov, 1, len);

	ret = call_read_iter(filp, &kiocb, &iter);
	BUG_ON(ret == -EIOCBQUEUED);
	*ppos = kiocb.ki_pos;
	return ret;
}

ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
		   loff_t *pos)
{
	if (file->f_op->read)
		return file->f_op->read(file, buf, count, pos);
	else if (file->f_op->read_iter)
		return new_sync_read(file, buf, count, pos);
	else
		return -EINVAL;
}
EXPORT_SYMBOL(__vfs_read);

ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;
	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
		return -EFAULT;

	ret = rw_verify_area(READ, file, pos, count);
	if (!ret) {
		if (count > MAX_RW_COUNT)
			count = MAX_RW_COUNT;
		ret = __vfs_read(file, buf, count, pos);
		if (ret > 0) {
			fsnotify_access(file);
			add_rchar(current, ret);
		}
		inc_syscr(current);
	}

	return ret;
}

EXPORT_SYMBOL(vfs_read);
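
/*
 * Illustrative sketch (hypothetical, not from this file): an in-kernel
 * caller that already holds a struct file and a __user buffer drives a
 * read through vfs_read() with an explicit position, exactly as the
 * read(2)/pread64(2) paths below do.  The helper name "example_read_at"
 * is an assumption for illustration.
 *
 *	static ssize_t example_read_at(struct file *filp, char __user *ubuf,
 *				       size_t len, loff_t offset)
 *	{
 *		loff_t pos = offset;
 *
 *		return vfs_read(filp, ubuf, len, &pos);
 *	}
 */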

static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;
	iov_iter_init(&iter, WRITE, &iov, 1, len);

	ret = call_write_iter(filp, &kiocb, &iter);
	BUG_ON(ret == -EIOCBQUEUED);
	if (ret > 0)
		*ppos = kiocb.ki_pos;
	return ret;
}

ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
		    loff_t *pos)
{
	if (file->f_op->write)
		return file->f_op->write(file, p, count, pos);
	else if (file->f_op->write_iter)
		return new_sync_write(file, p, count, pos);
	else
		return -EINVAL;
}
EXPORT_SYMBOL(__vfs_write);

ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
{
	mm_segment_t old_fs;
	const char __user *p;
	ssize_t ret;

	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;

	old_fs = get_fs();
	set_fs(get_ds());
	p = (__force const char __user *)buf;
	if (count > MAX_RW_COUNT)
		count = MAX_RW_COUNT;
	ret = __vfs_write(file, p, count, pos);
	set_fs(old_fs);
	if (ret > 0) {
		fsnotify_modify(file);
		add_wchar(current, ret);
	}
	inc_syscw(current);
	return ret;
}

EXPORT_SYMBOL(__kernel_write);

ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;
	if (unlikely(!access_ok(VERIFY_READ, buf, count)))
		return -EFAULT;

	ret = rw_verify_area(WRITE, file, pos, count);
	if (!ret) {
		if (count > MAX_RW_COUNT)
			count = MAX_RW_COUNT;
		file_start_write(file);
		ret = __vfs_write(file, buf, count, pos);
		if (ret > 0) {
			fsnotify_modify(file);
			add_wchar(current, ret);
		}
		inc_syscw(current);
		file_end_write(file);
	}

	return ret;
}

EXPORT_SYMBOL(vfs_write);

static inline loff_t file_pos_read(struct file *file)
{
	return file->f_pos;
}

static inline void file_pos_write(struct file *file, loff_t pos)
{
	file->f_pos = pos;
}

SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	struct fd f = fdget_pos(fd);
	ssize_t ret = -EBADF;

	if (f.file) {
		loff_t pos = file_pos_read(f.file);
		ret = vfs_read(f.file, buf, count, &pos);
		if (ret >= 0)
			file_pos_write(f.file, pos);
		fdput_pos(f);
	}
	return ret;
}

SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	struct fd f = fdget_pos(fd);
	ssize_t ret = -EBADF;

	if (f.file) {
		loff_t pos = file_pos_read(f.file);
		ret = vfs_write(f.file, buf, count, &pos);
		if (ret >= 0)
			file_pos_write(f.file, pos);
		fdput_pos(f);
	}

	return ret;
}

SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
			size_t, count, loff_t, pos)
{
	struct fd f;
	ssize_t ret = -EBADF;

	if (pos < 0)
		return -EINVAL;

	f = fdget(fd);
	if (f.file) {
		ret = -ESPIPE;
		if (f.file->f_mode & FMODE_PREAD)
			ret = vfs_read(f.file, buf, count, &pos);
		fdput(f);
	}

	return ret;
}

SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
			 size_t, count, loff_t, pos)
{
	struct fd f;
	ssize_t ret = -EBADF;

	if (pos < 0)
		return -EINVAL;

	f = fdget(fd);
	if (f.file) {
		ret = -ESPIPE;
		if (f.file->f_mode & FMODE_PWRITE)
			ret = vfs_write(f.file, buf, count, &pos);
		fdput(f);
	}

	return ret;
}
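
/*
 * Illustrative note (hypothetical, not from this file): unlike
 * read(2)/write(2) above, pread64(2)/pwrite64(2) take the position from
 * the caller and never publish it back to file->f_pos, so concurrent
 * users of one open file description do not disturb each other.  From
 * userspace (illustrative snippet, error handling omitted):
 *
 *	char buf[128];
 *	ssize_t n = pread(fd, buf, sizeof(buf), 4096);
 *
 * The file offset of "fd" is unchanged afterwards.
 */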

/*
 * Reduce an iovec's length in-place.  Return the resulting number of segments
 */
unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
{
	unsigned long seg = 0;
	size_t len = 0;

	while (seg < nr_segs) {
		seg++;
		if (len + iov->iov_len >= to) {
			iov->iov_len = to - len;
			break;
		}
		len += iov->iov_len;
		iov++;
	}
	return seg;
}
EXPORT_SYMBOL(iov_shorten);

static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter,
		loff_t *ppos, int type, int flags)
{
	struct kiocb kiocb;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	ret = kiocb_set_rw_flags(&kiocb, flags);
	if (ret)
		return ret;
	kiocb.ki_pos = *ppos;

	if (type == READ)
		ret = call_read_iter(filp, &kiocb, iter);
	else
		ret = call_write_iter(filp, &kiocb, iter);
	BUG_ON(ret == -EIOCBQUEUED);
	*ppos = kiocb.ki_pos;
	return ret;
}

/* Do it by hand, with file-ops */
static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
		loff_t *ppos, int type, int flags)
{
	ssize_t ret = 0;

	if (flags & ~RWF_HIPRI)
		return -EOPNOTSUPP;

	while (iov_iter_count(iter)) {
		struct iovec iovec = iov_iter_iovec(iter);
		ssize_t nr;

		if (type == READ) {
			nr = filp->f_op->read(filp, iovec.iov_base,
					      iovec.iov_len, ppos);
		} else {
			nr = filp->f_op->write(filp, iovec.iov_base,
					       iovec.iov_len, ppos);
		}

		if (nr < 0) {
			if (!ret)
				ret = nr;
			break;
		}
		ret += nr;
		if (nr != iovec.iov_len)
			break;
		iov_iter_advance(iter, nr);
	}

	return ret;
}

/* A write operation does a read from user space and vice versa */
#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)

/**
 * rw_copy_check_uvector() - Copy an array of &struct iovec from userspace
 *     into the kernel and check that it is valid.
 *
 * @type: One of %CHECK_IOVEC_ONLY, %READ, or %WRITE.
 * @uvector: Pointer to the userspace array.
 * @nr_segs: Number of elements in userspace array.
 * @fast_segs: Number of elements in @fast_pointer.
 * @fast_pointer: Pointer to (usually small on-stack) kernel array.
 * @ret_pointer: (output parameter) Pointer to a variable that will point to
 *     either @fast_pointer, a newly allocated kernel array, or NULL,
 *     depending on which array was used.
 *
 * This function copies an array of &struct iovec of @nr_segs from
 * userspace into the kernel and checks that each element is valid (e.g.
 * it does not point to a kernel address or cause overflow by being too
 * large, etc.).
 *
 * As an optimization, the caller may provide a pointer to a small
 * on-stack array in @fast_pointer, typically %UIO_FASTIOV elements long
 * (the size of this array, or 0 if unused, should be given in @fast_segs).
 *
 * @ret_pointer will always point to the array that was used, so the
 * caller must take care not to call kfree() on it e.g. in case the
 * @fast_pointer array was used and it was allocated on the stack.
 *
 * Return: The total number of bytes covered by the iovec array on success
 *   or a negative error code on error.
722 */ 723 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, 724 unsigned long nr_segs, unsigned long fast_segs, 725 struct iovec *fast_pointer, 726 struct iovec **ret_pointer) 727 { 728 unsigned long seg; 729 ssize_t ret; 730 struct iovec *iov = fast_pointer; 731 732 /* 733 * SuS says "The readv() function *may* fail if the iovcnt argument 734 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has 735 * traditionally returned zero for zero segments, so... 736 */ 737 if (nr_segs == 0) { 738 ret = 0; 739 goto out; 740 } 741 742 /* 743 * First get the "struct iovec" from user memory and 744 * verify all the pointers 745 */ 746 if (nr_segs > UIO_MAXIOV) { 747 ret = -EINVAL; 748 goto out; 749 } 750 if (nr_segs > fast_segs) { 751 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); 752 if (iov == NULL) { 753 ret = -ENOMEM; 754 goto out; 755 } 756 } 757 if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) { 758 ret = -EFAULT; 759 goto out; 760 } 761 762 /* 763 * According to the Single Unix Specification we should return EINVAL 764 * if an element length is < 0 when cast to ssize_t or if the 765 * total length would overflow the ssize_t return value of the 766 * system call. 767 * 768 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the 769 * overflow case. 770 */ 771 ret = 0; 772 for (seg = 0; seg < nr_segs; seg++) { 773 void __user *buf = iov[seg].iov_base; 774 ssize_t len = (ssize_t)iov[seg].iov_len; 775 776 /* see if we we're about to use an invalid len or if 777 * it's about to overflow ssize_t */ 778 if (len < 0) { 779 ret = -EINVAL; 780 goto out; 781 } 782 if (type >= 0 783 && unlikely(!access_ok(vrfy_dir(type), buf, len))) { 784 ret = -EFAULT; 785 goto out; 786 } 787 if (len > MAX_RW_COUNT - ret) { 788 len = MAX_RW_COUNT - ret; 789 iov[seg].iov_len = len; 790 } 791 ret += len; 792 } 793 out: 794 *ret_pointer = iov; 795 return ret; 796 } 797 798 #ifdef CONFIG_COMPAT 799 ssize_t compat_rw_copy_check_uvector(int type, 800 const struct compat_iovec __user *uvector, unsigned long nr_segs, 801 unsigned long fast_segs, struct iovec *fast_pointer, 802 struct iovec **ret_pointer) 803 { 804 compat_ssize_t tot_len; 805 struct iovec *iov = *ret_pointer = fast_pointer; 806 ssize_t ret = 0; 807 int seg; 808 809 /* 810 * SuS says "The readv() function *may* fail if the iovcnt argument 811 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has 812 * traditionally returned zero for zero segments, so... 813 */ 814 if (nr_segs == 0) 815 goto out; 816 817 ret = -EINVAL; 818 if (nr_segs > UIO_MAXIOV) 819 goto out; 820 if (nr_segs > fast_segs) { 821 ret = -ENOMEM; 822 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); 823 if (iov == NULL) 824 goto out; 825 } 826 *ret_pointer = iov; 827 828 ret = -EFAULT; 829 if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) 830 goto out; 831 832 /* 833 * Single unix specification: 834 * We should -EINVAL if an element length is not >= 0 and fitting an 835 * ssize_t. 836 * 837 * In Linux, the total length is limited to MAX_RW_COUNT, there is 838 * no overflow possibility. 839 */ 840 tot_len = 0; 841 ret = -EINVAL; 842 for (seg = 0; seg < nr_segs; seg++) { 843 compat_uptr_t buf; 844 compat_ssize_t len; 845 846 if (__get_user(len, &uvector->iov_len) || 847 __get_user(buf, &uvector->iov_base)) { 848 ret = -EFAULT; 849 goto out; 850 } 851 if (len < 0) /* size_t not fitting in compat_ssize_t .. 
*/ 852 goto out; 853 if (type >= 0 && 854 !access_ok(vrfy_dir(type), compat_ptr(buf), len)) { 855 ret = -EFAULT; 856 goto out; 857 } 858 if (len > MAX_RW_COUNT - tot_len) 859 len = MAX_RW_COUNT - tot_len; 860 tot_len += len; 861 iov->iov_base = compat_ptr(buf); 862 iov->iov_len = (compat_size_t) len; 863 uvector++; 864 iov++; 865 } 866 ret = tot_len; 867 868 out: 869 return ret; 870 } 871 #endif 872 873 static ssize_t do_iter_read(struct file *file, struct iov_iter *iter, 874 loff_t *pos, int flags) 875 { 876 size_t tot_len; 877 ssize_t ret = 0; 878 879 if (!(file->f_mode & FMODE_READ)) 880 return -EBADF; 881 if (!(file->f_mode & FMODE_CAN_READ)) 882 return -EINVAL; 883 884 tot_len = iov_iter_count(iter); 885 if (!tot_len) 886 goto out; 887 ret = rw_verify_area(READ, file, pos, tot_len); 888 if (ret < 0) 889 return ret; 890 891 if (file->f_op->read_iter) 892 ret = do_iter_readv_writev(file, iter, pos, READ, flags); 893 else 894 ret = do_loop_readv_writev(file, iter, pos, READ, flags); 895 out: 896 if (ret >= 0) 897 fsnotify_access(file); 898 return ret; 899 } 900 901 ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, 902 int flags) 903 { 904 if (!file->f_op->read_iter) 905 return -EINVAL; 906 return do_iter_read(file, iter, ppos, flags); 907 } 908 EXPORT_SYMBOL(vfs_iter_read); 909 910 static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, 911 loff_t *pos, int flags) 912 { 913 size_t tot_len; 914 ssize_t ret = 0; 915 916 if (!(file->f_mode & FMODE_WRITE)) 917 return -EBADF; 918 if (!(file->f_mode & FMODE_CAN_WRITE)) 919 return -EINVAL; 920 921 tot_len = iov_iter_count(iter); 922 if (!tot_len) 923 return 0; 924 ret = rw_verify_area(WRITE, file, pos, tot_len); 925 if (ret < 0) 926 return ret; 927 928 if (file->f_op->write_iter) 929 ret = do_iter_readv_writev(file, iter, pos, WRITE, flags); 930 else 931 ret = do_loop_readv_writev(file, iter, pos, WRITE, flags); 932 if (ret > 0) 933 fsnotify_modify(file); 934 return ret; 935 } 936 937 ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, 938 int flags) 939 { 940 if (!file->f_op->write_iter) 941 return -EINVAL; 942 return do_iter_write(file, iter, ppos, flags); 943 } 944 EXPORT_SYMBOL(vfs_iter_write); 945 946 ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, 947 unsigned long vlen, loff_t *pos, int flags) 948 { 949 struct iovec iovstack[UIO_FASTIOV]; 950 struct iovec *iov = iovstack; 951 struct iov_iter iter; 952 ssize_t ret; 953 954 ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); 955 if (ret >= 0) { 956 ret = do_iter_read(file, &iter, pos, flags); 957 kfree(iov); 958 } 959 960 return ret; 961 } 962 EXPORT_SYMBOL(vfs_readv); 963 964 ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, 965 unsigned long vlen, loff_t *pos, int flags) 966 { 967 struct iovec iovstack[UIO_FASTIOV]; 968 struct iovec *iov = iovstack; 969 struct iov_iter iter; 970 ssize_t ret; 971 972 ret = import_iovec(WRITE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); 973 if (ret >= 0) { 974 file_start_write(file); 975 ret = do_iter_write(file, &iter, pos, flags); 976 file_end_write(file); 977 kfree(iov); 978 } 979 return ret; 980 } 981 EXPORT_SYMBOL(vfs_writev); 982 983 static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, 984 unsigned long vlen, int flags) 985 { 986 struct fd f = fdget_pos(fd); 987 ssize_t ret = -EBADF; 988 989 if (f.file) { 990 loff_t pos = file_pos_read(f.file); 991 ret = vfs_readv(f.file, vec, vlen, &pos, flags); 
992 if (ret >= 0) 993 file_pos_write(f.file, pos); 994 fdput_pos(f); 995 } 996 997 if (ret > 0) 998 add_rchar(current, ret); 999 inc_syscr(current); 1000 return ret; 1001 } 1002 1003 static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, 1004 unsigned long vlen, int flags) 1005 { 1006 struct fd f = fdget_pos(fd); 1007 ssize_t ret = -EBADF; 1008 1009 if (f.file) { 1010 loff_t pos = file_pos_read(f.file); 1011 ret = vfs_writev(f.file, vec, vlen, &pos, flags); 1012 if (ret >= 0) 1013 file_pos_write(f.file, pos); 1014 fdput_pos(f); 1015 } 1016 1017 if (ret > 0) 1018 add_wchar(current, ret); 1019 inc_syscw(current); 1020 return ret; 1021 } 1022 1023 static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) 1024 { 1025 #define HALF_LONG_BITS (BITS_PER_LONG / 2) 1026 return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; 1027 } 1028 1029 static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, 1030 unsigned long vlen, loff_t pos, int flags) 1031 { 1032 struct fd f; 1033 ssize_t ret = -EBADF; 1034 1035 if (pos < 0) 1036 return -EINVAL; 1037 1038 f = fdget(fd); 1039 if (f.file) { 1040 ret = -ESPIPE; 1041 if (f.file->f_mode & FMODE_PREAD) 1042 ret = vfs_readv(f.file, vec, vlen, &pos, flags); 1043 fdput(f); 1044 } 1045 1046 if (ret > 0) 1047 add_rchar(current, ret); 1048 inc_syscr(current); 1049 return ret; 1050 } 1051 1052 static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec, 1053 unsigned long vlen, loff_t pos, int flags) 1054 { 1055 struct fd f; 1056 ssize_t ret = -EBADF; 1057 1058 if (pos < 0) 1059 return -EINVAL; 1060 1061 f = fdget(fd); 1062 if (f.file) { 1063 ret = -ESPIPE; 1064 if (f.file->f_mode & FMODE_PWRITE) 1065 ret = vfs_writev(f.file, vec, vlen, &pos, flags); 1066 fdput(f); 1067 } 1068 1069 if (ret > 0) 1070 add_wchar(current, ret); 1071 inc_syscw(current); 1072 return ret; 1073 } 1074 1075 SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, 1076 unsigned long, vlen) 1077 { 1078 return do_readv(fd, vec, vlen, 0); 1079 } 1080 1081 SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, 1082 unsigned long, vlen) 1083 { 1084 return do_writev(fd, vec, vlen, 0); 1085 } 1086 1087 SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, 1088 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) 1089 { 1090 loff_t pos = pos_from_hilo(pos_h, pos_l); 1091 1092 return do_preadv(fd, vec, vlen, pos, 0); 1093 } 1094 1095 SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec, 1096 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, 1097 int, flags) 1098 { 1099 loff_t pos = pos_from_hilo(pos_h, pos_l); 1100 1101 if (pos == -1) 1102 return do_readv(fd, vec, vlen, flags); 1103 1104 return do_preadv(fd, vec, vlen, pos, flags); 1105 } 1106 1107 SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, 1108 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) 1109 { 1110 loff_t pos = pos_from_hilo(pos_h, pos_l); 1111 1112 return do_pwritev(fd, vec, vlen, pos, 0); 1113 } 1114 1115 SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, 1116 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, 1117 int, flags) 1118 { 1119 loff_t pos = pos_from_hilo(pos_h, pos_l); 1120 1121 if (pos == -1) 1122 return do_writev(fd, vec, vlen, flags); 1123 1124 return do_pwritev(fd, vec, vlen, pos, flags); 1125 } 1126 1127 #ifdef CONFIG_COMPAT 1128 static size_t 
compat_readv(struct file *file, 1129 const struct compat_iovec __user *vec, 1130 unsigned long vlen, loff_t *pos, int flags) 1131 { 1132 struct iovec iovstack[UIO_FASTIOV]; 1133 struct iovec *iov = iovstack; 1134 struct iov_iter iter; 1135 ssize_t ret; 1136 1137 ret = compat_import_iovec(READ, vec, vlen, UIO_FASTIOV, &iov, &iter); 1138 if (ret >= 0) { 1139 ret = do_iter_read(file, &iter, pos, flags); 1140 kfree(iov); 1141 } 1142 if (ret > 0) 1143 add_rchar(current, ret); 1144 inc_syscr(current); 1145 return ret; 1146 } 1147 1148 static size_t do_compat_readv(compat_ulong_t fd, 1149 const struct compat_iovec __user *vec, 1150 compat_ulong_t vlen, int flags) 1151 { 1152 struct fd f = fdget_pos(fd); 1153 ssize_t ret; 1154 loff_t pos; 1155 1156 if (!f.file) 1157 return -EBADF; 1158 pos = f.file->f_pos; 1159 ret = compat_readv(f.file, vec, vlen, &pos, flags); 1160 if (ret >= 0) 1161 f.file->f_pos = pos; 1162 fdput_pos(f); 1163 return ret; 1164 1165 } 1166 1167 COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, 1168 const struct compat_iovec __user *,vec, 1169 compat_ulong_t, vlen) 1170 { 1171 return do_compat_readv(fd, vec, vlen, 0); 1172 } 1173 1174 static long do_compat_preadv64(unsigned long fd, 1175 const struct compat_iovec __user *vec, 1176 unsigned long vlen, loff_t pos, int flags) 1177 { 1178 struct fd f; 1179 ssize_t ret; 1180 1181 if (pos < 0) 1182 return -EINVAL; 1183 f = fdget(fd); 1184 if (!f.file) 1185 return -EBADF; 1186 ret = -ESPIPE; 1187 if (f.file->f_mode & FMODE_PREAD) 1188 ret = compat_readv(f.file, vec, vlen, &pos, flags); 1189 fdput(f); 1190 return ret; 1191 } 1192 1193 #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 1194 COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, 1195 const struct compat_iovec __user *,vec, 1196 unsigned long, vlen, loff_t, pos) 1197 { 1198 return do_compat_preadv64(fd, vec, vlen, pos, 0); 1199 } 1200 #endif 1201 1202 COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, 1203 const struct compat_iovec __user *,vec, 1204 compat_ulong_t, vlen, u32, pos_low, u32, pos_high) 1205 { 1206 loff_t pos = ((loff_t)pos_high << 32) | pos_low; 1207 1208 return do_compat_preadv64(fd, vec, vlen, pos, 0); 1209 } 1210 1211 #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 1212 COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, 1213 const struct compat_iovec __user *,vec, 1214 unsigned long, vlen, loff_t, pos, int, flags) 1215 { 1216 return do_compat_preadv64(fd, vec, vlen, pos, flags); 1217 } 1218 #endif 1219 1220 COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, 1221 const struct compat_iovec __user *,vec, 1222 compat_ulong_t, vlen, u32, pos_low, u32, pos_high, 1223 int, flags) 1224 { 1225 loff_t pos = ((loff_t)pos_high << 32) | pos_low; 1226 1227 if (pos == -1) 1228 return do_compat_readv(fd, vec, vlen, flags); 1229 1230 return do_compat_preadv64(fd, vec, vlen, pos, flags); 1231 } 1232 1233 static size_t compat_writev(struct file *file, 1234 const struct compat_iovec __user *vec, 1235 unsigned long vlen, loff_t *pos, int flags) 1236 { 1237 struct iovec iovstack[UIO_FASTIOV]; 1238 struct iovec *iov = iovstack; 1239 struct iov_iter iter; 1240 ssize_t ret; 1241 1242 ret = compat_import_iovec(WRITE, vec, vlen, UIO_FASTIOV, &iov, &iter); 1243 if (ret >= 0) { 1244 file_start_write(file); 1245 ret = do_iter_write(file, &iter, pos, flags); 1246 file_end_write(file); 1247 kfree(iov); 1248 } 1249 if (ret > 0) 1250 add_wchar(current, ret); 1251 inc_syscw(current); 1252 return ret; 1253 } 1254 1255 static size_t do_compat_writev(compat_ulong_t fd, 1256 const struct compat_iovec __user* 
vec, 1257 compat_ulong_t vlen, int flags) 1258 { 1259 struct fd f = fdget_pos(fd); 1260 ssize_t ret; 1261 loff_t pos; 1262 1263 if (!f.file) 1264 return -EBADF; 1265 pos = f.file->f_pos; 1266 ret = compat_writev(f.file, vec, vlen, &pos, flags); 1267 if (ret >= 0) 1268 f.file->f_pos = pos; 1269 fdput_pos(f); 1270 return ret; 1271 } 1272 1273 COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, 1274 const struct compat_iovec __user *, vec, 1275 compat_ulong_t, vlen) 1276 { 1277 return do_compat_writev(fd, vec, vlen, 0); 1278 } 1279 1280 static long do_compat_pwritev64(unsigned long fd, 1281 const struct compat_iovec __user *vec, 1282 unsigned long vlen, loff_t pos, int flags) 1283 { 1284 struct fd f; 1285 ssize_t ret; 1286 1287 if (pos < 0) 1288 return -EINVAL; 1289 f = fdget(fd); 1290 if (!f.file) 1291 return -EBADF; 1292 ret = -ESPIPE; 1293 if (f.file->f_mode & FMODE_PWRITE) 1294 ret = compat_writev(f.file, vec, vlen, &pos, flags); 1295 fdput(f); 1296 return ret; 1297 } 1298 1299 #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 1300 COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, 1301 const struct compat_iovec __user *,vec, 1302 unsigned long, vlen, loff_t, pos) 1303 { 1304 return do_compat_pwritev64(fd, vec, vlen, pos, 0); 1305 } 1306 #endif 1307 1308 COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, 1309 const struct compat_iovec __user *,vec, 1310 compat_ulong_t, vlen, u32, pos_low, u32, pos_high) 1311 { 1312 loff_t pos = ((loff_t)pos_high << 32) | pos_low; 1313 1314 return do_compat_pwritev64(fd, vec, vlen, pos, 0); 1315 } 1316 1317 #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 1318 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, 1319 const struct compat_iovec __user *,vec, 1320 unsigned long, vlen, loff_t, pos, int, flags) 1321 { 1322 return do_compat_pwritev64(fd, vec, vlen, pos, flags); 1323 } 1324 #endif 1325 1326 COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, 1327 const struct compat_iovec __user *,vec, 1328 compat_ulong_t, vlen, u32, pos_low, u32, pos_high, int, flags) 1329 { 1330 loff_t pos = ((loff_t)pos_high << 32) | pos_low; 1331 1332 if (pos == -1) 1333 return do_compat_writev(fd, vec, vlen, flags); 1334 1335 return do_compat_pwritev64(fd, vec, vlen, pos, flags); 1336 } 1337 1338 #endif 1339 1340 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, 1341 size_t count, loff_t max) 1342 { 1343 struct fd in, out; 1344 struct inode *in_inode, *out_inode; 1345 loff_t pos; 1346 loff_t out_pos; 1347 ssize_t retval; 1348 int fl; 1349 1350 /* 1351 * Get input file, and verify that it is ok.. 1352 */ 1353 retval = -EBADF; 1354 in = fdget(in_fd); 1355 if (!in.file) 1356 goto out; 1357 if (!(in.file->f_mode & FMODE_READ)) 1358 goto fput_in; 1359 retval = -ESPIPE; 1360 if (!ppos) { 1361 pos = in.file->f_pos; 1362 } else { 1363 pos = *ppos; 1364 if (!(in.file->f_mode & FMODE_PREAD)) 1365 goto fput_in; 1366 } 1367 retval = rw_verify_area(READ, in.file, &pos, count); 1368 if (retval < 0) 1369 goto fput_in; 1370 if (count > MAX_RW_COUNT) 1371 count = MAX_RW_COUNT; 1372 1373 /* 1374 * Get output file, and verify that it is ok.. 
1375 */ 1376 retval = -EBADF; 1377 out = fdget(out_fd); 1378 if (!out.file) 1379 goto fput_in; 1380 if (!(out.file->f_mode & FMODE_WRITE)) 1381 goto fput_out; 1382 retval = -EINVAL; 1383 in_inode = file_inode(in.file); 1384 out_inode = file_inode(out.file); 1385 out_pos = out.file->f_pos; 1386 retval = rw_verify_area(WRITE, out.file, &out_pos, count); 1387 if (retval < 0) 1388 goto fput_out; 1389 1390 if (!max) 1391 max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); 1392 1393 if (unlikely(pos + count > max)) { 1394 retval = -EOVERFLOW; 1395 if (pos >= max) 1396 goto fput_out; 1397 count = max - pos; 1398 } 1399 1400 fl = 0; 1401 #if 0 1402 /* 1403 * We need to debate whether we can enable this or not. The 1404 * man page documents EAGAIN return for the output at least, 1405 * and the application is arguably buggy if it doesn't expect 1406 * EAGAIN on a non-blocking file descriptor. 1407 */ 1408 if (in.file->f_flags & O_NONBLOCK) 1409 fl = SPLICE_F_NONBLOCK; 1410 #endif 1411 file_start_write(out.file); 1412 retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); 1413 file_end_write(out.file); 1414 1415 if (retval > 0) { 1416 add_rchar(current, retval); 1417 add_wchar(current, retval); 1418 fsnotify_access(in.file); 1419 fsnotify_modify(out.file); 1420 out.file->f_pos = out_pos; 1421 if (ppos) 1422 *ppos = pos; 1423 else 1424 in.file->f_pos = pos; 1425 } 1426 1427 inc_syscr(current); 1428 inc_syscw(current); 1429 if (pos > max) 1430 retval = -EOVERFLOW; 1431 1432 fput_out: 1433 fdput(out); 1434 fput_in: 1435 fdput(in); 1436 out: 1437 return retval; 1438 } 1439 1440 SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count) 1441 { 1442 loff_t pos; 1443 off_t off; 1444 ssize_t ret; 1445 1446 if (offset) { 1447 if (unlikely(get_user(off, offset))) 1448 return -EFAULT; 1449 pos = off; 1450 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); 1451 if (unlikely(put_user(pos, offset))) 1452 return -EFAULT; 1453 return ret; 1454 } 1455 1456 return do_sendfile(out_fd, in_fd, NULL, count, 0); 1457 } 1458 1459 SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count) 1460 { 1461 loff_t pos; 1462 ssize_t ret; 1463 1464 if (offset) { 1465 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) 1466 return -EFAULT; 1467 ret = do_sendfile(out_fd, in_fd, &pos, count, 0); 1468 if (unlikely(put_user(pos, offset))) 1469 return -EFAULT; 1470 return ret; 1471 } 1472 1473 return do_sendfile(out_fd, in_fd, NULL, count, 0); 1474 } 1475 1476 #ifdef CONFIG_COMPAT 1477 COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, 1478 compat_off_t __user *, offset, compat_size_t, count) 1479 { 1480 loff_t pos; 1481 off_t off; 1482 ssize_t ret; 1483 1484 if (offset) { 1485 if (unlikely(get_user(off, offset))) 1486 return -EFAULT; 1487 pos = off; 1488 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); 1489 if (unlikely(put_user(pos, offset))) 1490 return -EFAULT; 1491 return ret; 1492 } 1493 1494 return do_sendfile(out_fd, in_fd, NULL, count, 0); 1495 } 1496 1497 COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, 1498 compat_loff_t __user *, offset, compat_size_t, count) 1499 { 1500 loff_t pos; 1501 ssize_t ret; 1502 1503 if (offset) { 1504 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) 1505 return -EFAULT; 1506 ret = do_sendfile(out_fd, in_fd, &pos, count, 0); 1507 if (unlikely(put_user(pos, offset))) 1508 return -EFAULT; 1509 return ret; 1510 } 1511 1512 return do_sendfile(out_fd, 
in_fd, NULL, count, 0); 1513 } 1514 #endif 1515 1516 /* 1517 * copy_file_range() differs from regular file read and write in that it 1518 * specifically allows return partial success. When it does so is up to 1519 * the copy_file_range method. 1520 */ 1521 ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, 1522 struct file *file_out, loff_t pos_out, 1523 size_t len, unsigned int flags) 1524 { 1525 struct inode *inode_in = file_inode(file_in); 1526 struct inode *inode_out = file_inode(file_out); 1527 ssize_t ret; 1528 1529 if (flags != 0) 1530 return -EINVAL; 1531 1532 if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) 1533 return -EISDIR; 1534 if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) 1535 return -EINVAL; 1536 1537 ret = rw_verify_area(READ, file_in, &pos_in, len); 1538 if (unlikely(ret)) 1539 return ret; 1540 1541 ret = rw_verify_area(WRITE, file_out, &pos_out, len); 1542 if (unlikely(ret)) 1543 return ret; 1544 1545 if (!(file_in->f_mode & FMODE_READ) || 1546 !(file_out->f_mode & FMODE_WRITE) || 1547 (file_out->f_flags & O_APPEND)) 1548 return -EBADF; 1549 1550 /* this could be relaxed once a method supports cross-fs copies */ 1551 if (inode_in->i_sb != inode_out->i_sb) 1552 return -EXDEV; 1553 1554 if (len == 0) 1555 return 0; 1556 1557 file_start_write(file_out); 1558 1559 /* 1560 * Try cloning first, this is supported by more file systems, and 1561 * more efficient if both clone and copy are supported (e.g. NFS). 1562 */ 1563 if (file_in->f_op->clone_file_range) { 1564 ret = file_in->f_op->clone_file_range(file_in, pos_in, 1565 file_out, pos_out, len); 1566 if (ret == 0) { 1567 ret = len; 1568 goto done; 1569 } 1570 } 1571 1572 if (file_out->f_op->copy_file_range) { 1573 ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, 1574 pos_out, len, flags); 1575 if (ret != -EOPNOTSUPP) 1576 goto done; 1577 } 1578 1579 ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, 1580 len > MAX_RW_COUNT ? 
MAX_RW_COUNT : len, 0); 1581 1582 done: 1583 if (ret > 0) { 1584 fsnotify_access(file_in); 1585 add_rchar(current, ret); 1586 fsnotify_modify(file_out); 1587 add_wchar(current, ret); 1588 } 1589 1590 inc_syscr(current); 1591 inc_syscw(current); 1592 1593 file_end_write(file_out); 1594 1595 return ret; 1596 } 1597 EXPORT_SYMBOL(vfs_copy_file_range); 1598 1599 SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, 1600 int, fd_out, loff_t __user *, off_out, 1601 size_t, len, unsigned int, flags) 1602 { 1603 loff_t pos_in; 1604 loff_t pos_out; 1605 struct fd f_in; 1606 struct fd f_out; 1607 ssize_t ret = -EBADF; 1608 1609 f_in = fdget(fd_in); 1610 if (!f_in.file) 1611 goto out2; 1612 1613 f_out = fdget(fd_out); 1614 if (!f_out.file) 1615 goto out1; 1616 1617 ret = -EFAULT; 1618 if (off_in) { 1619 if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) 1620 goto out; 1621 } else { 1622 pos_in = f_in.file->f_pos; 1623 } 1624 1625 if (off_out) { 1626 if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) 1627 goto out; 1628 } else { 1629 pos_out = f_out.file->f_pos; 1630 } 1631 1632 ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len, 1633 flags); 1634 if (ret > 0) { 1635 pos_in += ret; 1636 pos_out += ret; 1637 1638 if (off_in) { 1639 if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) 1640 ret = -EFAULT; 1641 } else { 1642 f_in.file->f_pos = pos_in; 1643 } 1644 1645 if (off_out) { 1646 if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) 1647 ret = -EFAULT; 1648 } else { 1649 f_out.file->f_pos = pos_out; 1650 } 1651 } 1652 1653 out: 1654 fdput(f_out); 1655 out1: 1656 fdput(f_in); 1657 out2: 1658 return ret; 1659 } 1660 1661 static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) 1662 { 1663 struct inode *inode = file_inode(file); 1664 1665 if (unlikely(pos < 0)) 1666 return -EINVAL; 1667 1668 if (unlikely((loff_t) (pos + len) < 0)) 1669 return -EINVAL; 1670 1671 if (unlikely(inode->i_flctx && mandatory_lock(inode))) { 1672 loff_t end = len ? pos + len - 1 : OFFSET_MAX; 1673 int retval; 1674 1675 retval = locks_mandatory_area(inode, file, pos, end, 1676 write ? F_WRLCK : F_RDLCK); 1677 if (retval < 0) 1678 return retval; 1679 } 1680 1681 return security_file_permission(file, write ? MAY_WRITE : MAY_READ); 1682 } 1683 1684 /* 1685 * Check that the two inodes are eligible for cloning, the ranges make 1686 * sense, and then flush all dirty data. Caller must ensure that the 1687 * inodes have been locked against any other modifications. 1688 * 1689 * Returns: 0 for "nothing to clone", 1 for "something to clone", or 1690 * the usual negative error code. 1691 */ 1692 int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, 1693 struct inode *inode_out, loff_t pos_out, 1694 u64 *len, bool is_dedupe) 1695 { 1696 loff_t bs = inode_out->i_sb->s_blocksize; 1697 loff_t blen; 1698 loff_t isize; 1699 bool same_inode = (inode_in == inode_out); 1700 int ret; 1701 1702 /* Don't touch certain kinds of inodes */ 1703 if (IS_IMMUTABLE(inode_out)) 1704 return -EPERM; 1705 1706 if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) 1707 return -ETXTBSY; 1708 1709 /* Don't reflink dirs, pipes, sockets... */ 1710 if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) 1711 return -EISDIR; 1712 if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) 1713 return -EINVAL; 1714 1715 /* Are we going all the way to the end? 
*/ 1716 isize = i_size_read(inode_in); 1717 if (isize == 0) 1718 return 0; 1719 1720 /* Zero length dedupe exits immediately; reflink goes to EOF. */ 1721 if (*len == 0) { 1722 if (is_dedupe || pos_in == isize) 1723 return 0; 1724 if (pos_in > isize) 1725 return -EINVAL; 1726 *len = isize - pos_in; 1727 } 1728 1729 /* Ensure offsets don't wrap and the input is inside i_size */ 1730 if (pos_in + *len < pos_in || pos_out + *len < pos_out || 1731 pos_in + *len > isize) 1732 return -EINVAL; 1733 1734 /* Don't allow dedupe past EOF in the dest file */ 1735 if (is_dedupe) { 1736 loff_t disize; 1737 1738 disize = i_size_read(inode_out); 1739 if (pos_out >= disize || pos_out + *len > disize) 1740 return -EINVAL; 1741 } 1742 1743 /* If we're linking to EOF, continue to the block boundary. */ 1744 if (pos_in + *len == isize) 1745 blen = ALIGN(isize, bs) - pos_in; 1746 else 1747 blen = *len; 1748 1749 /* Only reflink if we're aligned to block boundaries */ 1750 if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || 1751 !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) 1752 return -EINVAL; 1753 1754 /* Don't allow overlapped reflink within the same file */ 1755 if (same_inode) { 1756 if (pos_out + blen > pos_in && pos_out < pos_in + blen) 1757 return -EINVAL; 1758 } 1759 1760 /* Wait for the completion of any pending IOs on both files */ 1761 inode_dio_wait(inode_in); 1762 if (!same_inode) 1763 inode_dio_wait(inode_out); 1764 1765 ret = filemap_write_and_wait_range(inode_in->i_mapping, 1766 pos_in, pos_in + *len - 1); 1767 if (ret) 1768 return ret; 1769 1770 ret = filemap_write_and_wait_range(inode_out->i_mapping, 1771 pos_out, pos_out + *len - 1); 1772 if (ret) 1773 return ret; 1774 1775 /* 1776 * Check that the extents are the same. 1777 */ 1778 if (is_dedupe) { 1779 bool is_same = false; 1780 1781 ret = vfs_dedupe_file_range_compare(inode_in, pos_in, 1782 inode_out, pos_out, *len, &is_same); 1783 if (ret) 1784 return ret; 1785 if (!is_same) 1786 return -EBADE; 1787 } 1788 1789 return 1; 1790 } 1791 EXPORT_SYMBOL(vfs_clone_file_prep_inodes); 1792 1793 int vfs_clone_file_range(struct file *file_in, loff_t pos_in, 1794 struct file *file_out, loff_t pos_out, u64 len) 1795 { 1796 struct inode *inode_in = file_inode(file_in); 1797 struct inode *inode_out = file_inode(file_out); 1798 int ret; 1799 1800 if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) 1801 return -EISDIR; 1802 if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) 1803 return -EINVAL; 1804 1805 /* 1806 * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on 1807 * the same mount. Practically, they only need to be on the same file 1808 * system. 
1809 */ 1810 if (inode_in->i_sb != inode_out->i_sb) 1811 return -EXDEV; 1812 1813 if (!(file_in->f_mode & FMODE_READ) || 1814 !(file_out->f_mode & FMODE_WRITE) || 1815 (file_out->f_flags & O_APPEND)) 1816 return -EBADF; 1817 1818 if (!file_in->f_op->clone_file_range) 1819 return -EOPNOTSUPP; 1820 1821 ret = clone_verify_area(file_in, pos_in, len, false); 1822 if (ret) 1823 return ret; 1824 1825 ret = clone_verify_area(file_out, pos_out, len, true); 1826 if (ret) 1827 return ret; 1828 1829 if (pos_in + len > i_size_read(inode_in)) 1830 return -EINVAL; 1831 1832 ret = file_in->f_op->clone_file_range(file_in, pos_in, 1833 file_out, pos_out, len); 1834 if (!ret) { 1835 fsnotify_access(file_in); 1836 fsnotify_modify(file_out); 1837 } 1838 1839 return ret; 1840 } 1841 EXPORT_SYMBOL(vfs_clone_file_range); 1842 1843 /* 1844 * Read a page's worth of file data into the page cache. Return the page 1845 * locked. 1846 */ 1847 static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) 1848 { 1849 struct address_space *mapping; 1850 struct page *page; 1851 pgoff_t n; 1852 1853 n = offset >> PAGE_SHIFT; 1854 mapping = inode->i_mapping; 1855 page = read_mapping_page(mapping, n, NULL); 1856 if (IS_ERR(page)) 1857 return page; 1858 if (!PageUptodate(page)) { 1859 put_page(page); 1860 return ERR_PTR(-EIO); 1861 } 1862 lock_page(page); 1863 return page; 1864 } 1865 1866 /* 1867 * Compare extents of two files to see if they are the same. 1868 * Caller must have locked both inodes to prevent write races. 1869 */ 1870 int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, 1871 struct inode *dest, loff_t destoff, 1872 loff_t len, bool *is_same) 1873 { 1874 loff_t src_poff; 1875 loff_t dest_poff; 1876 void *src_addr; 1877 void *dest_addr; 1878 struct page *src_page; 1879 struct page *dest_page; 1880 loff_t cmp_len; 1881 bool same; 1882 int error; 1883 1884 error = -EINVAL; 1885 same = true; 1886 while (len) { 1887 src_poff = srcoff & (PAGE_SIZE - 1); 1888 dest_poff = destoff & (PAGE_SIZE - 1); 1889 cmp_len = min(PAGE_SIZE - src_poff, 1890 PAGE_SIZE - dest_poff); 1891 cmp_len = min(cmp_len, len); 1892 if (cmp_len <= 0) 1893 goto out_error; 1894 1895 src_page = vfs_dedupe_get_page(src, srcoff); 1896 if (IS_ERR(src_page)) { 1897 error = PTR_ERR(src_page); 1898 goto out_error; 1899 } 1900 dest_page = vfs_dedupe_get_page(dest, destoff); 1901 if (IS_ERR(dest_page)) { 1902 error = PTR_ERR(dest_page); 1903 unlock_page(src_page); 1904 put_page(src_page); 1905 goto out_error; 1906 } 1907 src_addr = kmap_atomic(src_page); 1908 dest_addr = kmap_atomic(dest_page); 1909 1910 flush_dcache_page(src_page); 1911 flush_dcache_page(dest_page); 1912 1913 if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) 1914 same = false; 1915 1916 kunmap_atomic(dest_addr); 1917 kunmap_atomic(src_addr); 1918 unlock_page(dest_page); 1919 unlock_page(src_page); 1920 put_page(dest_page); 1921 put_page(src_page); 1922 1923 if (!same) 1924 break; 1925 1926 srcoff += cmp_len; 1927 destoff += cmp_len; 1928 len -= cmp_len; 1929 } 1930 1931 *is_same = same; 1932 return 0; 1933 1934 out_error: 1935 return error; 1936 } 1937 EXPORT_SYMBOL(vfs_dedupe_file_range_compare); 1938 1939 int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) 1940 { 1941 struct file_dedupe_range_info *info; 1942 struct inode *src = file_inode(file); 1943 u64 off; 1944 u64 len; 1945 int i; 1946 int ret; 1947 bool is_admin = capable(CAP_SYS_ADMIN); 1948 u16 count = same->dest_count; 1949 struct file *dst_file; 1950 loff_t 
dst_off; 1951 ssize_t deduped; 1952 1953 if (!(file->f_mode & FMODE_READ)) 1954 return -EINVAL; 1955 1956 if (same->reserved1 || same->reserved2) 1957 return -EINVAL; 1958 1959 off = same->src_offset; 1960 len = same->src_length; 1961 1962 ret = -EISDIR; 1963 if (S_ISDIR(src->i_mode)) 1964 goto out; 1965 1966 ret = -EINVAL; 1967 if (!S_ISREG(src->i_mode)) 1968 goto out; 1969 1970 ret = clone_verify_area(file, off, len, false); 1971 if (ret < 0) 1972 goto out; 1973 ret = 0; 1974 1975 if (off + len > i_size_read(src)) 1976 return -EINVAL; 1977 1978 /* pre-format output fields to sane values */ 1979 for (i = 0; i < count; i++) { 1980 same->info[i].bytes_deduped = 0ULL; 1981 same->info[i].status = FILE_DEDUPE_RANGE_SAME; 1982 } 1983 1984 for (i = 0, info = same->info; i < count; i++, info++) { 1985 struct inode *dst; 1986 struct fd dst_fd = fdget(info->dest_fd); 1987 1988 dst_file = dst_fd.file; 1989 if (!dst_file) { 1990 info->status = -EBADF; 1991 goto next_loop; 1992 } 1993 dst = file_inode(dst_file); 1994 1995 ret = mnt_want_write_file(dst_file); 1996 if (ret) { 1997 info->status = ret; 1998 goto next_loop; 1999 } 2000 2001 dst_off = info->dest_offset; 2002 ret = clone_verify_area(dst_file, dst_off, len, true); 2003 if (ret < 0) { 2004 info->status = ret; 2005 goto next_file; 2006 } 2007 ret = 0; 2008 2009 if (info->reserved) { 2010 info->status = -EINVAL; 2011 } else if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) { 2012 info->status = -EINVAL; 2013 } else if (file->f_path.mnt != dst_file->f_path.mnt) { 2014 info->status = -EXDEV; 2015 } else if (S_ISDIR(dst->i_mode)) { 2016 info->status = -EISDIR; 2017 } else if (dst_file->f_op->dedupe_file_range == NULL) { 2018 info->status = -EINVAL; 2019 } else { 2020 deduped = dst_file->f_op->dedupe_file_range(file, off, 2021 len, dst_file, 2022 info->dest_offset); 2023 if (deduped == -EBADE) 2024 info->status = FILE_DEDUPE_RANGE_DIFFERS; 2025 else if (deduped < 0) 2026 info->status = deduped; 2027 else 2028 info->bytes_deduped += deduped; 2029 } 2030 2031 next_file: 2032 mnt_drop_write_file(dst_file); 2033 next_loop: 2034 fdput(dst_fd); 2035 2036 if (fatal_signal_pending(current)) 2037 goto out; 2038 } 2039 2040 out: 2041 return ret; 2042 } 2043 EXPORT_SYMBOL(vfs_dedupe_file_range); 2044
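
/*
 * Illustrative sketch (hypothetical, not from this file): userspace
 * reaches vfs_dedupe_file_range() through the FIDEDUPERANGE ioctl on the
 * source file, passing a struct file_dedupe_range followed by dest_count
 * info entries (illustrative snippet, error handling omitted):
 *
 *	struct file_dedupe_range *range;
 *
 *	range = calloc(1, sizeof(*range) + sizeof(range->info[0]));
 *	range->src_offset = 0;
 *	range->src_length = 65536;
 *	range->dest_count = 1;
 *	range->info[0].dest_fd = dest_fd;
 *	range->info[0].dest_offset = 0;
 *	ioctl(src_fd, FIDEDUPERANGE, range);
 *
 * On return, info[0].status and info[0].bytes_deduped report the result
 * for that destination.
 */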