1 /* 2 * linux/fs/read_write.c 3 * 4 * Copyright (C) 1991, 1992 Linus Torvalds 5 */ 6 7 #include <linux/slab.h> 8 #include <linux/stat.h> 9 #include <linux/fcntl.h> 10 #include <linux/file.h> 11 #include <linux/uio.h> 12 #include <linux/aio.h> 13 #include <linux/fsnotify.h> 14 #include <linux/security.h> 15 #include <linux/export.h> 16 #include <linux/syscalls.h> 17 #include <linux/pagemap.h> 18 #include <linux/splice.h> 19 #include <linux/compat.h> 20 #include "internal.h" 21 22 #include <asm/uaccess.h> 23 #include <asm/unistd.h> 24 25 typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *); 26 typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *, 27 unsigned long, loff_t); 28 typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *); 29 30 const struct file_operations generic_ro_fops = { 31 .llseek = generic_file_llseek, 32 .read = new_sync_read, 33 .read_iter = generic_file_read_iter, 34 .mmap = generic_file_readonly_mmap, 35 .splice_read = generic_file_splice_read, 36 }; 37 38 EXPORT_SYMBOL(generic_ro_fops); 39 40 static inline int unsigned_offsets(struct file *file) 41 { 42 return file->f_mode & FMODE_UNSIGNED_OFFSET; 43 } 44 45 /** 46 * vfs_setpos - update the file offset for lseek 47 * @file: file structure in question 48 * @offset: file offset to seek to 49 * @maxsize: maximum file size 50 * 51 * This is a low-level filesystem helper for updating the file offset to 52 * the value specified by @offset if the given offset is valid and it is 53 * not equal to the current file offset. 54 * 55 * Return the specified offset on success and -EINVAL on invalid offset. 56 */ 57 loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize) 58 { 59 if (offset < 0 && !unsigned_offsets(file)) 60 return -EINVAL; 61 if (offset > maxsize) 62 return -EINVAL; 63 64 if (offset != file->f_pos) { 65 file->f_pos = offset; 66 file->f_version = 0; 67 } 68 return offset; 69 } 70 EXPORT_SYMBOL(vfs_setpos); 71 72 /** 73 * generic_file_llseek_size - generic llseek implementation for regular files 74 * @file: file structure to seek on 75 * @offset: file offset to seek to 76 * @whence: type of seek 77 * @size: max size of this file in file system 78 * @eof: offset used for SEEK_END position 79 * 80 * This is a variant of generic_file_llseek that allows passing in a custom 81 * maximum file size and a custom EOF position, for e.g. hashed directories 82 * 83 * Synchronization: 84 * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms) 85 * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes. 86 * read/writes behave like SEEK_SET against seeks. 87 */ 88 loff_t 89 generic_file_llseek_size(struct file *file, loff_t offset, int whence, 90 loff_t maxsize, loff_t eof) 91 { 92 switch (whence) { 93 case SEEK_END: 94 offset += eof; 95 break; 96 case SEEK_CUR: 97 /* 98 * Here we special-case the lseek(fd, 0, SEEK_CUR) 99 * position-querying operation. Avoid rewriting the "same" 100 * f_pos value back to the file because a concurrent read(), 101 * write() or lseek() might have altered it 102 */ 103 if (offset == 0) 104 return file->f_pos; 105 /* 106 * f_lock protects against read/modify/write race with other 107 * SEEK_CURs. Note that parallel writes and reads behave 108 * like SEEK_SET. 109 */ 110 spin_lock(&file->f_lock); 111 offset = vfs_setpos(file, file->f_pos + offset, maxsize); 112 spin_unlock(&file->f_lock); 113 return offset; 114 case SEEK_DATA: 115 /* 116 * In the generic case the entire file is data, so as long as 117 * offset isn't at the end of the file then the offset is data. 118 */ 119 if (offset >= eof) 120 return -ENXIO; 121 break; 122 case SEEK_HOLE: 123 /* 124 * There is a virtual hole at the end of the file, so as long as 125 * offset isn't i_size or larger, return i_size. 126 */ 127 if (offset >= eof) 128 return -ENXIO; 129 offset = eof; 130 break; 131 } 132 133 return vfs_setpos(file, offset, maxsize); 134 } 135 EXPORT_SYMBOL(generic_file_llseek_size); 136 137 /** 138 * generic_file_llseek - generic llseek implementation for regular files 139 * @file: file structure to seek on 140 * @offset: file offset to seek to 141 * @whence: type of seek 142 * 143 * This is a generic implemenation of ->llseek useable for all normal local 144 * filesystems. It just updates the file offset to the value specified by 145 * @offset and @whence. 146 */ 147 loff_t generic_file_llseek(struct file *file, loff_t offset, int whence) 148 { 149 struct inode *inode = file->f_mapping->host; 150 151 return generic_file_llseek_size(file, offset, whence, 152 inode->i_sb->s_maxbytes, 153 i_size_read(inode)); 154 } 155 EXPORT_SYMBOL(generic_file_llseek); 156 157 /** 158 * fixed_size_llseek - llseek implementation for fixed-sized devices 159 * @file: file structure to seek on 160 * @offset: file offset to seek to 161 * @whence: type of seek 162 * @size: size of the file 163 * 164 */ 165 loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size) 166 { 167 switch (whence) { 168 case SEEK_SET: case SEEK_CUR: case SEEK_END: 169 return generic_file_llseek_size(file, offset, whence, 170 size, size); 171 default: 172 return -EINVAL; 173 } 174 } 175 EXPORT_SYMBOL(fixed_size_llseek); 176 177 /** 178 * noop_llseek - No Operation Performed llseek implementation 179 * @file: file structure to seek on 180 * @offset: file offset to seek to 181 * @whence: type of seek 182 * 183 * This is an implementation of ->llseek useable for the rare special case when 184 * userspace expects the seek to succeed but the (device) file is actually not 185 * able to perform the seek. In this case you use noop_llseek() instead of 186 * falling back to the default implementation of ->llseek. 187 */ 188 loff_t noop_llseek(struct file *file, loff_t offset, int whence) 189 { 190 return file->f_pos; 191 } 192 EXPORT_SYMBOL(noop_llseek); 193 194 loff_t no_llseek(struct file *file, loff_t offset, int whence) 195 { 196 return -ESPIPE; 197 } 198 EXPORT_SYMBOL(no_llseek); 199 200 loff_t default_llseek(struct file *file, loff_t offset, int whence) 201 { 202 struct inode *inode = file_inode(file); 203 loff_t retval; 204 205 mutex_lock(&inode->i_mutex); 206 switch (whence) { 207 case SEEK_END: 208 offset += i_size_read(inode); 209 break; 210 case SEEK_CUR: 211 if (offset == 0) { 212 retval = file->f_pos; 213 goto out; 214 } 215 offset += file->f_pos; 216 break; 217 case SEEK_DATA: 218 /* 219 * In the generic case the entire file is data, so as 220 * long as offset isn't at the end of the file then the 221 * offset is data. 222 */ 223 if (offset >= inode->i_size) { 224 retval = -ENXIO; 225 goto out; 226 } 227 break; 228 case SEEK_HOLE: 229 /* 230 * There is a virtual hole at the end of the file, so 231 * as long as offset isn't i_size or larger, return 232 * i_size. 233 */ 234 if (offset >= inode->i_size) { 235 retval = -ENXIO; 236 goto out; 237 } 238 offset = inode->i_size; 239 break; 240 } 241 retval = -EINVAL; 242 if (offset >= 0 || unsigned_offsets(file)) { 243 if (offset != file->f_pos) { 244 file->f_pos = offset; 245 file->f_version = 0; 246 } 247 retval = offset; 248 } 249 out: 250 mutex_unlock(&inode->i_mutex); 251 return retval; 252 } 253 EXPORT_SYMBOL(default_llseek); 254 255 loff_t vfs_llseek(struct file *file, loff_t offset, int whence) 256 { 257 loff_t (*fn)(struct file *, loff_t, int); 258 259 fn = no_llseek; 260 if (file->f_mode & FMODE_LSEEK) { 261 if (file->f_op->llseek) 262 fn = file->f_op->llseek; 263 } 264 return fn(file, offset, whence); 265 } 266 EXPORT_SYMBOL(vfs_llseek); 267 268 static inline struct fd fdget_pos(int fd) 269 { 270 return __to_fd(__fdget_pos(fd)); 271 } 272 273 static inline void fdput_pos(struct fd f) 274 { 275 if (f.flags & FDPUT_POS_UNLOCK) 276 mutex_unlock(&f.file->f_pos_lock); 277 fdput(f); 278 } 279 280 SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) 281 { 282 off_t retval; 283 struct fd f = fdget_pos(fd); 284 if (!f.file) 285 return -EBADF; 286 287 retval = -EINVAL; 288 if (whence <= SEEK_MAX) { 289 loff_t res = vfs_llseek(f.file, offset, whence); 290 retval = res; 291 if (res != (loff_t)retval) 292 retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ 293 } 294 fdput_pos(f); 295 return retval; 296 } 297 298 #ifdef CONFIG_COMPAT 299 COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence) 300 { 301 return sys_lseek(fd, offset, whence); 302 } 303 #endif 304 305 #ifdef __ARCH_WANT_SYS_LLSEEK 306 SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high, 307 unsigned long, offset_low, loff_t __user *, result, 308 unsigned int, whence) 309 { 310 int retval; 311 struct fd f = fdget_pos(fd); 312 loff_t offset; 313 314 if (!f.file) 315 return -EBADF; 316 317 retval = -EINVAL; 318 if (whence > SEEK_MAX) 319 goto out_putf; 320 321 offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low, 322 whence); 323 324 retval = (int)offset; 325 if (offset >= 0) { 326 retval = -EFAULT; 327 if (!copy_to_user(result, &offset, sizeof(offset))) 328 retval = 0; 329 } 330 out_putf: 331 fdput_pos(f); 332 return retval; 333 } 334 #endif 335 336 ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos) 337 { 338 struct kiocb kiocb; 339 ssize_t ret; 340 341 if (!file->f_op->read_iter) 342 return -EINVAL; 343 344 init_sync_kiocb(&kiocb, file); 345 kiocb.ki_pos = *ppos; 346 kiocb.ki_nbytes = iov_iter_count(iter); 347 348 iter->type |= READ; 349 ret = file->f_op->read_iter(&kiocb, iter); 350 if (ret == -EIOCBQUEUED) 351 ret = wait_on_sync_kiocb(&kiocb); 352 353 if (ret > 0) 354 *ppos = kiocb.ki_pos; 355 return ret; 356 } 357 EXPORT_SYMBOL(vfs_iter_read); 358 359 ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos) 360 { 361 struct kiocb kiocb; 362 ssize_t ret; 363 364 if (!file->f_op->write_iter) 365 return -EINVAL; 366 367 init_sync_kiocb(&kiocb, file); 368 kiocb.ki_pos = *ppos; 369 kiocb.ki_nbytes = iov_iter_count(iter); 370 371 iter->type |= WRITE; 372 ret = file->f_op->write_iter(&kiocb, iter); 373 if (ret == -EIOCBQUEUED) 374 ret = wait_on_sync_kiocb(&kiocb); 375 376 if (ret > 0) 377 *ppos = kiocb.ki_pos; 378 return ret; 379 } 380 EXPORT_SYMBOL(vfs_iter_write); 381 382 /* 383 * rw_verify_area doesn't like huge counts. We limit 384 * them to something that fits in "int" so that others 385 * won't have to do range checks all the time. 386 */ 387 int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) 388 { 389 struct inode *inode; 390 loff_t pos; 391 int retval = -EINVAL; 392 393 inode = file_inode(file); 394 if (unlikely((ssize_t) count < 0)) 395 return retval; 396 pos = *ppos; 397 if (unlikely(pos < 0)) { 398 if (!unsigned_offsets(file)) 399 return retval; 400 if (count >= -pos) /* both values are in 0..LLONG_MAX */ 401 return -EOVERFLOW; 402 } else if (unlikely((loff_t) (pos + count) < 0)) { 403 if (!unsigned_offsets(file)) 404 return retval; 405 } 406 407 if (unlikely(inode->i_flctx && mandatory_lock(inode))) { 408 retval = locks_mandatory_area( 409 read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, 410 inode, file, pos, count); 411 if (retval < 0) 412 return retval; 413 } 414 retval = security_file_permission(file, 415 read_write == READ ? MAY_READ : MAY_WRITE); 416 if (retval) 417 return retval; 418 return count > MAX_RW_COUNT ? MAX_RW_COUNT : count; 419 } 420 421 ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) 422 { 423 struct iovec iov = { .iov_base = buf, .iov_len = len }; 424 struct kiocb kiocb; 425 ssize_t ret; 426 427 init_sync_kiocb(&kiocb, filp); 428 kiocb.ki_pos = *ppos; 429 kiocb.ki_nbytes = len; 430 431 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); 432 if (-EIOCBQUEUED == ret) 433 ret = wait_on_sync_kiocb(&kiocb); 434 *ppos = kiocb.ki_pos; 435 return ret; 436 } 437 438 EXPORT_SYMBOL(do_sync_read); 439 440 ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) 441 { 442 struct iovec iov = { .iov_base = buf, .iov_len = len }; 443 struct kiocb kiocb; 444 struct iov_iter iter; 445 ssize_t ret; 446 447 init_sync_kiocb(&kiocb, filp); 448 kiocb.ki_pos = *ppos; 449 kiocb.ki_nbytes = len; 450 iov_iter_init(&iter, READ, &iov, 1, len); 451 452 ret = filp->f_op->read_iter(&kiocb, &iter); 453 if (-EIOCBQUEUED == ret) 454 ret = wait_on_sync_kiocb(&kiocb); 455 *ppos = kiocb.ki_pos; 456 return ret; 457 } 458 459 EXPORT_SYMBOL(new_sync_read); 460 461 ssize_t __vfs_read(struct file *file, char __user *buf, size_t count, 462 loff_t *pos) 463 { 464 ssize_t ret; 465 466 if (file->f_op->read) 467 ret = file->f_op->read(file, buf, count, pos); 468 else if (file->f_op->aio_read) 469 ret = do_sync_read(file, buf, count, pos); 470 else if (file->f_op->read_iter) 471 ret = new_sync_read(file, buf, count, pos); 472 else 473 ret = -EINVAL; 474 475 return ret; 476 } 477 478 ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) 479 { 480 ssize_t ret; 481 482 if (!(file->f_mode & FMODE_READ)) 483 return -EBADF; 484 if (!(file->f_mode & FMODE_CAN_READ)) 485 return -EINVAL; 486 if (unlikely(!access_ok(VERIFY_WRITE, buf, count))) 487 return -EFAULT; 488 489 ret = rw_verify_area(READ, file, pos, count); 490 if (ret >= 0) { 491 count = ret; 492 ret = __vfs_read(file, buf, count, pos); 493 if (ret > 0) { 494 fsnotify_access(file); 495 add_rchar(current, ret); 496 } 497 inc_syscr(current); 498 } 499 500 return ret; 501 } 502 503 EXPORT_SYMBOL(vfs_read); 504 505 ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) 506 { 507 struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len }; 508 struct kiocb kiocb; 509 ssize_t ret; 510 511 init_sync_kiocb(&kiocb, filp); 512 kiocb.ki_pos = *ppos; 513 kiocb.ki_nbytes = len; 514 515 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); 516 if (-EIOCBQUEUED == ret) 517 ret = wait_on_sync_kiocb(&kiocb); 518 *ppos = kiocb.ki_pos; 519 return ret; 520 } 521 522 EXPORT_SYMBOL(do_sync_write); 523 524 ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) 525 { 526 struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len }; 527 struct kiocb kiocb; 528 struct iov_iter iter; 529 ssize_t ret; 530 531 init_sync_kiocb(&kiocb, filp); 532 kiocb.ki_pos = *ppos; 533 kiocb.ki_nbytes = len; 534 iov_iter_init(&iter, WRITE, &iov, 1, len); 535 536 ret = filp->f_op->write_iter(&kiocb, &iter); 537 if (-EIOCBQUEUED == ret) 538 ret = wait_on_sync_kiocb(&kiocb); 539 *ppos = kiocb.ki_pos; 540 return ret; 541 } 542 543 EXPORT_SYMBOL(new_sync_write); 544 545 ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos) 546 { 547 mm_segment_t old_fs; 548 const char __user *p; 549 ssize_t ret; 550 551 if (!(file->f_mode & FMODE_CAN_WRITE)) 552 return -EINVAL; 553 554 old_fs = get_fs(); 555 set_fs(get_ds()); 556 p = (__force const char __user *)buf; 557 if (count > MAX_RW_COUNT) 558 count = MAX_RW_COUNT; 559 if (file->f_op->write) 560 ret = file->f_op->write(file, p, count, pos); 561 else if (file->f_op->aio_write) 562 ret = do_sync_write(file, p, count, pos); 563 else 564 ret = new_sync_write(file, p, count, pos); 565 set_fs(old_fs); 566 if (ret > 0) { 567 fsnotify_modify(file); 568 add_wchar(current, ret); 569 } 570 inc_syscw(current); 571 return ret; 572 } 573 574 EXPORT_SYMBOL(__kernel_write); 575 576 ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) 577 { 578 ssize_t ret; 579 580 if (!(file->f_mode & FMODE_WRITE)) 581 return -EBADF; 582 if (!(file->f_mode & FMODE_CAN_WRITE)) 583 return -EINVAL; 584 if (unlikely(!access_ok(VERIFY_READ, buf, count))) 585 return -EFAULT; 586 587 ret = rw_verify_area(WRITE, file, pos, count); 588 if (ret >= 0) { 589 count = ret; 590 file_start_write(file); 591 if (file->f_op->write) 592 ret = file->f_op->write(file, buf, count, pos); 593 else if (file->f_op->aio_write) 594 ret = do_sync_write(file, buf, count, pos); 595 else 596 ret = new_sync_write(file, buf, count, pos); 597 if (ret > 0) { 598 fsnotify_modify(file); 599 add_wchar(current, ret); 600 } 601 inc_syscw(current); 602 file_end_write(file); 603 } 604 605 return ret; 606 } 607 608 EXPORT_SYMBOL(vfs_write); 609 610 static inline loff_t file_pos_read(struct file *file) 611 { 612 return file->f_pos; 613 } 614 615 static inline void file_pos_write(struct file *file, loff_t pos) 616 { 617 file->f_pos = pos; 618 } 619 620 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) 621 { 622 struct fd f = fdget_pos(fd); 623 ssize_t ret = -EBADF; 624 625 if (f.file) { 626 loff_t pos = file_pos_read(f.file); 627 ret = vfs_read(f.file, buf, count, &pos); 628 if (ret >= 0) 629 file_pos_write(f.file, pos); 630 fdput_pos(f); 631 } 632 return ret; 633 } 634 635 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, 636 size_t, count) 637 { 638 struct fd f = fdget_pos(fd); 639 ssize_t ret = -EBADF; 640 641 if (f.file) { 642 loff_t pos = file_pos_read(f.file); 643 ret = vfs_write(f.file, buf, count, &pos); 644 if (ret >= 0) 645 file_pos_write(f.file, pos); 646 fdput_pos(f); 647 } 648 649 return ret; 650 } 651 652 SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, 653 size_t, count, loff_t, pos) 654 { 655 struct fd f; 656 ssize_t ret = -EBADF; 657 658 if (pos < 0) 659 return -EINVAL; 660 661 f = fdget(fd); 662 if (f.file) { 663 ret = -ESPIPE; 664 if (f.file->f_mode & FMODE_PREAD) 665 ret = vfs_read(f.file, buf, count, &pos); 666 fdput(f); 667 } 668 669 return ret; 670 } 671 672 SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, 673 size_t, count, loff_t, pos) 674 { 675 struct fd f; 676 ssize_t ret = -EBADF; 677 678 if (pos < 0) 679 return -EINVAL; 680 681 f = fdget(fd); 682 if (f.file) { 683 ret = -ESPIPE; 684 if (f.file->f_mode & FMODE_PWRITE) 685 ret = vfs_write(f.file, buf, count, &pos); 686 fdput(f); 687 } 688 689 return ret; 690 } 691 692 /* 693 * Reduce an iovec's length in-place. Return the resulting number of segments 694 */ 695 unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) 696 { 697 unsigned long seg = 0; 698 size_t len = 0; 699 700 while (seg < nr_segs) { 701 seg++; 702 if (len + iov->iov_len >= to) { 703 iov->iov_len = to - len; 704 break; 705 } 706 len += iov->iov_len; 707 iov++; 708 } 709 return seg; 710 } 711 EXPORT_SYMBOL(iov_shorten); 712 713 static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iovec *iov, 714 unsigned long nr_segs, size_t len, loff_t *ppos, iter_fn_t fn) 715 { 716 struct kiocb kiocb; 717 struct iov_iter iter; 718 ssize_t ret; 719 720 init_sync_kiocb(&kiocb, filp); 721 kiocb.ki_pos = *ppos; 722 kiocb.ki_nbytes = len; 723 724 iov_iter_init(&iter, rw, iov, nr_segs, len); 725 ret = fn(&kiocb, &iter); 726 if (ret == -EIOCBQUEUED) 727 ret = wait_on_sync_kiocb(&kiocb); 728 *ppos = kiocb.ki_pos; 729 return ret; 730 } 731 732 static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, 733 unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn) 734 { 735 struct kiocb kiocb; 736 ssize_t ret; 737 738 init_sync_kiocb(&kiocb, filp); 739 kiocb.ki_pos = *ppos; 740 kiocb.ki_nbytes = len; 741 742 ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); 743 if (ret == -EIOCBQUEUED) 744 ret = wait_on_sync_kiocb(&kiocb); 745 *ppos = kiocb.ki_pos; 746 return ret; 747 } 748 749 /* Do it by hand, with file-ops */ 750 static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, 751 unsigned long nr_segs, loff_t *ppos, io_fn_t fn) 752 { 753 struct iovec *vector = iov; 754 ssize_t ret = 0; 755 756 while (nr_segs > 0) { 757 void __user *base; 758 size_t len; 759 ssize_t nr; 760 761 base = vector->iov_base; 762 len = vector->iov_len; 763 vector++; 764 nr_segs--; 765 766 nr = fn(filp, base, len, ppos); 767 768 if (nr < 0) { 769 if (!ret) 770 ret = nr; 771 break; 772 } 773 ret += nr; 774 if (nr != len) 775 break; 776 } 777 778 return ret; 779 } 780 781 /* A write operation does a read from user space and vice versa */ 782 #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) 783 784 ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, 785 unsigned long nr_segs, unsigned long fast_segs, 786 struct iovec *fast_pointer, 787 struct iovec **ret_pointer) 788 { 789 unsigned long seg; 790 ssize_t ret; 791 struct iovec *iov = fast_pointer; 792 793 /* 794 * SuS says "The readv() function *may* fail if the iovcnt argument 795 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has 796 * traditionally returned zero for zero segments, so... 797 */ 798 if (nr_segs == 0) { 799 ret = 0; 800 goto out; 801 } 802 803 /* 804 * First get the "struct iovec" from user memory and 805 * verify all the pointers 806 */ 807 if (nr_segs > UIO_MAXIOV) { 808 ret = -EINVAL; 809 goto out; 810 } 811 if (nr_segs > fast_segs) { 812 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); 813 if (iov == NULL) { 814 ret = -ENOMEM; 815 goto out; 816 } 817 } 818 if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) { 819 ret = -EFAULT; 820 goto out; 821 } 822 823 /* 824 * According to the Single Unix Specification we should return EINVAL 825 * if an element length is < 0 when cast to ssize_t or if the 826 * total length would overflow the ssize_t return value of the 827 * system call. 828 * 829 * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the 830 * overflow case. 831 */ 832 ret = 0; 833 for (seg = 0; seg < nr_segs; seg++) { 834 void __user *buf = iov[seg].iov_base; 835 ssize_t len = (ssize_t)iov[seg].iov_len; 836 837 /* see if we we're about to use an invalid len or if 838 * it's about to overflow ssize_t */ 839 if (len < 0) { 840 ret = -EINVAL; 841 goto out; 842 } 843 if (type >= 0 844 && unlikely(!access_ok(vrfy_dir(type), buf, len))) { 845 ret = -EFAULT; 846 goto out; 847 } 848 if (len > MAX_RW_COUNT - ret) { 849 len = MAX_RW_COUNT - ret; 850 iov[seg].iov_len = len; 851 } 852 ret += len; 853 } 854 out: 855 *ret_pointer = iov; 856 return ret; 857 } 858 859 static ssize_t do_readv_writev(int type, struct file *file, 860 const struct iovec __user * uvector, 861 unsigned long nr_segs, loff_t *pos) 862 { 863 size_t tot_len; 864 struct iovec iovstack[UIO_FASTIOV]; 865 struct iovec *iov = iovstack; 866 ssize_t ret; 867 io_fn_t fn; 868 iov_fn_t fnv; 869 iter_fn_t iter_fn; 870 871 ret = rw_copy_check_uvector(type, uvector, nr_segs, 872 ARRAY_SIZE(iovstack), iovstack, &iov); 873 if (ret <= 0) 874 goto out; 875 876 tot_len = ret; 877 ret = rw_verify_area(type, file, pos, tot_len); 878 if (ret < 0) 879 goto out; 880 881 fnv = NULL; 882 if (type == READ) { 883 fn = file->f_op->read; 884 fnv = file->f_op->aio_read; 885 iter_fn = file->f_op->read_iter; 886 } else { 887 fn = (io_fn_t)file->f_op->write; 888 fnv = file->f_op->aio_write; 889 iter_fn = file->f_op->write_iter; 890 file_start_write(file); 891 } 892 893 if (iter_fn) 894 ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len, 895 pos, iter_fn); 896 else if (fnv) 897 ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, 898 pos, fnv); 899 else 900 ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn); 901 902 if (type != READ) 903 file_end_write(file); 904 905 out: 906 if (iov != iovstack) 907 kfree(iov); 908 if ((ret + (type == READ)) > 0) { 909 if (type == READ) 910 fsnotify_access(file); 911 else 912 fsnotify_modify(file); 913 } 914 return ret; 915 } 916 917 ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, 918 unsigned long vlen, loff_t *pos) 919 { 920 if (!(file->f_mode & FMODE_READ)) 921 return -EBADF; 922 if (!(file->f_mode & FMODE_CAN_READ)) 923 return -EINVAL; 924 925 return do_readv_writev(READ, file, vec, vlen, pos); 926 } 927 928 EXPORT_SYMBOL(vfs_readv); 929 930 ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, 931 unsigned long vlen, loff_t *pos) 932 { 933 if (!(file->f_mode & FMODE_WRITE)) 934 return -EBADF; 935 if (!(file->f_mode & FMODE_CAN_WRITE)) 936 return -EINVAL; 937 938 return do_readv_writev(WRITE, file, vec, vlen, pos); 939 } 940 941 EXPORT_SYMBOL(vfs_writev); 942 943 SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, 944 unsigned long, vlen) 945 { 946 struct fd f = fdget_pos(fd); 947 ssize_t ret = -EBADF; 948 949 if (f.file) { 950 loff_t pos = file_pos_read(f.file); 951 ret = vfs_readv(f.file, vec, vlen, &pos); 952 if (ret >= 0) 953 file_pos_write(f.file, pos); 954 fdput_pos(f); 955 } 956 957 if (ret > 0) 958 add_rchar(current, ret); 959 inc_syscr(current); 960 return ret; 961 } 962 963 SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, 964 unsigned long, vlen) 965 { 966 struct fd f = fdget_pos(fd); 967 ssize_t ret = -EBADF; 968 969 if (f.file) { 970 loff_t pos = file_pos_read(f.file); 971 ret = vfs_writev(f.file, vec, vlen, &pos); 972 if (ret >= 0) 973 file_pos_write(f.file, pos); 974 fdput_pos(f); 975 } 976 977 if (ret > 0) 978 add_wchar(current, ret); 979 inc_syscw(current); 980 return ret; 981 } 982 983 static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) 984 { 985 #define HALF_LONG_BITS (BITS_PER_LONG / 2) 986 return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low; 987 } 988 989 SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, 990 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) 991 { 992 loff_t pos = pos_from_hilo(pos_h, pos_l); 993 struct fd f; 994 ssize_t ret = -EBADF; 995 996 if (pos < 0) 997 return -EINVAL; 998 999 f = fdget(fd); 1000 if (f.file) { 1001 ret = -ESPIPE; 1002 if (f.file->f_mode & FMODE_PREAD) 1003 ret = vfs_readv(f.file, vec, vlen, &pos); 1004 fdput(f); 1005 } 1006 1007 if (ret > 0) 1008 add_rchar(current, ret); 1009 inc_syscr(current); 1010 return ret; 1011 } 1012 1013 SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, 1014 unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h) 1015 { 1016 loff_t pos = pos_from_hilo(pos_h, pos_l); 1017 struct fd f; 1018 ssize_t ret = -EBADF; 1019 1020 if (pos < 0) 1021 return -EINVAL; 1022 1023 f = fdget(fd); 1024 if (f.file) { 1025 ret = -ESPIPE; 1026 if (f.file->f_mode & FMODE_PWRITE) 1027 ret = vfs_writev(f.file, vec, vlen, &pos); 1028 fdput(f); 1029 } 1030 1031 if (ret > 0) 1032 add_wchar(current, ret); 1033 inc_syscw(current); 1034 return ret; 1035 } 1036 1037 #ifdef CONFIG_COMPAT 1038 1039 static ssize_t compat_do_readv_writev(int type, struct file *file, 1040 const struct compat_iovec __user *uvector, 1041 unsigned long nr_segs, loff_t *pos) 1042 { 1043 compat_ssize_t tot_len; 1044 struct iovec iovstack[UIO_FASTIOV]; 1045 struct iovec *iov = iovstack; 1046 ssize_t ret; 1047 io_fn_t fn; 1048 iov_fn_t fnv; 1049 iter_fn_t iter_fn; 1050 1051 ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, 1052 UIO_FASTIOV, iovstack, &iov); 1053 if (ret <= 0) 1054 goto out; 1055 1056 tot_len = ret; 1057 ret = rw_verify_area(type, file, pos, tot_len); 1058 if (ret < 0) 1059 goto out; 1060 1061 fnv = NULL; 1062 if (type == READ) { 1063 fn = file->f_op->read; 1064 fnv = file->f_op->aio_read; 1065 iter_fn = file->f_op->read_iter; 1066 } else { 1067 fn = (io_fn_t)file->f_op->write; 1068 fnv = file->f_op->aio_write; 1069 iter_fn = file->f_op->write_iter; 1070 file_start_write(file); 1071 } 1072 1073 if (iter_fn) 1074 ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len, 1075 pos, iter_fn); 1076 else if (fnv) 1077 ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, 1078 pos, fnv); 1079 else 1080 ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn); 1081 1082 if (type != READ) 1083 file_end_write(file); 1084 1085 out: 1086 if (iov != iovstack) 1087 kfree(iov); 1088 if ((ret + (type == READ)) > 0) { 1089 if (type == READ) 1090 fsnotify_access(file); 1091 else 1092 fsnotify_modify(file); 1093 } 1094 return ret; 1095 } 1096 1097 static size_t compat_readv(struct file *file, 1098 const struct compat_iovec __user *vec, 1099 unsigned long vlen, loff_t *pos) 1100 { 1101 ssize_t ret = -EBADF; 1102 1103 if (!(file->f_mode & FMODE_READ)) 1104 goto out; 1105 1106 ret = -EINVAL; 1107 if (!(file->f_mode & FMODE_CAN_READ)) 1108 goto out; 1109 1110 ret = compat_do_readv_writev(READ, file, vec, vlen, pos); 1111 1112 out: 1113 if (ret > 0) 1114 add_rchar(current, ret); 1115 inc_syscr(current); 1116 return ret; 1117 } 1118 1119 COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, 1120 const struct compat_iovec __user *,vec, 1121 compat_ulong_t, vlen) 1122 { 1123 struct fd f = fdget_pos(fd); 1124 ssize_t ret; 1125 loff_t pos; 1126 1127 if (!f.file) 1128 return -EBADF; 1129 pos = f.file->f_pos; 1130 ret = compat_readv(f.file, vec, vlen, &pos); 1131 if (ret >= 0) 1132 f.file->f_pos = pos; 1133 fdput_pos(f); 1134 return ret; 1135 } 1136 1137 static long __compat_sys_preadv64(unsigned long fd, 1138 const struct compat_iovec __user *vec, 1139 unsigned long vlen, loff_t pos) 1140 { 1141 struct fd f; 1142 ssize_t ret; 1143 1144 if (pos < 0) 1145 return -EINVAL; 1146 f = fdget(fd); 1147 if (!f.file) 1148 return -EBADF; 1149 ret = -ESPIPE; 1150 if (f.file->f_mode & FMODE_PREAD) 1151 ret = compat_readv(f.file, vec, vlen, &pos); 1152 fdput(f); 1153 return ret; 1154 } 1155 1156 #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64 1157 COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd, 1158 const struct compat_iovec __user *,vec, 1159 unsigned long, vlen, loff_t, pos) 1160 { 1161 return __compat_sys_preadv64(fd, vec, vlen, pos); 1162 } 1163 #endif 1164 1165 COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, 1166 const struct compat_iovec __user *,vec, 1167 compat_ulong_t, vlen, u32, pos_low, u32, pos_high) 1168 { 1169 loff_t pos = ((loff_t)pos_high << 32) | pos_low; 1170 1171 return __compat_sys_preadv64(fd, vec, vlen, pos); 1172 } 1173 1174 static size_t compat_writev(struct file *file, 1175 const struct compat_iovec __user *vec, 1176 unsigned long vlen, loff_t *pos) 1177 { 1178 ssize_t ret = -EBADF; 1179 1180 if (!(file->f_mode & FMODE_WRITE)) 1181 goto out; 1182 1183 ret = -EINVAL; 1184 if (!(file->f_mode & FMODE_CAN_WRITE)) 1185 goto out; 1186 1187 ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos); 1188 1189 out: 1190 if (ret > 0) 1191 add_wchar(current, ret); 1192 inc_syscw(current); 1193 return ret; 1194 } 1195 1196 COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, 1197 const struct compat_iovec __user *, vec, 1198 compat_ulong_t, vlen) 1199 { 1200 struct fd f = fdget_pos(fd); 1201 ssize_t ret; 1202 loff_t pos; 1203 1204 if (!f.file) 1205 return -EBADF; 1206 pos = f.file->f_pos; 1207 ret = compat_writev(f.file, vec, vlen, &pos); 1208 if (ret >= 0) 1209 f.file->f_pos = pos; 1210 fdput_pos(f); 1211 return ret; 1212 } 1213 1214 static long __compat_sys_pwritev64(unsigned long fd, 1215 const struct compat_iovec __user *vec, 1216 unsigned long vlen, loff_t pos) 1217 { 1218 struct fd f; 1219 ssize_t ret; 1220 1221 if (pos < 0) 1222 return -EINVAL; 1223 f = fdget(fd); 1224 if (!f.file) 1225 return -EBADF; 1226 ret = -ESPIPE; 1227 if (f.file->f_mode & FMODE_PWRITE) 1228 ret = compat_writev(f.file, vec, vlen, &pos); 1229 fdput(f); 1230 return ret; 1231 } 1232 1233 #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64 1234 COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd, 1235 const struct compat_iovec __user *,vec, 1236 unsigned long, vlen, loff_t, pos) 1237 { 1238 return __compat_sys_pwritev64(fd, vec, vlen, pos); 1239 } 1240 #endif 1241 1242 COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, 1243 const struct compat_iovec __user *,vec, 1244 compat_ulong_t, vlen, u32, pos_low, u32, pos_high) 1245 { 1246 loff_t pos = ((loff_t)pos_high << 32) | pos_low; 1247 1248 return __compat_sys_pwritev64(fd, vec, vlen, pos); 1249 } 1250 #endif 1251 1252 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, 1253 size_t count, loff_t max) 1254 { 1255 struct fd in, out; 1256 struct inode *in_inode, *out_inode; 1257 loff_t pos; 1258 loff_t out_pos; 1259 ssize_t retval; 1260 int fl; 1261 1262 /* 1263 * Get input file, and verify that it is ok.. 1264 */ 1265 retval = -EBADF; 1266 in = fdget(in_fd); 1267 if (!in.file) 1268 goto out; 1269 if (!(in.file->f_mode & FMODE_READ)) 1270 goto fput_in; 1271 retval = -ESPIPE; 1272 if (!ppos) { 1273 pos = in.file->f_pos; 1274 } else { 1275 pos = *ppos; 1276 if (!(in.file->f_mode & FMODE_PREAD)) 1277 goto fput_in; 1278 } 1279 retval = rw_verify_area(READ, in.file, &pos, count); 1280 if (retval < 0) 1281 goto fput_in; 1282 count = retval; 1283 1284 /* 1285 * Get output file, and verify that it is ok.. 1286 */ 1287 retval = -EBADF; 1288 out = fdget(out_fd); 1289 if (!out.file) 1290 goto fput_in; 1291 if (!(out.file->f_mode & FMODE_WRITE)) 1292 goto fput_out; 1293 retval = -EINVAL; 1294 in_inode = file_inode(in.file); 1295 out_inode = file_inode(out.file); 1296 out_pos = out.file->f_pos; 1297 retval = rw_verify_area(WRITE, out.file, &out_pos, count); 1298 if (retval < 0) 1299 goto fput_out; 1300 count = retval; 1301 1302 if (!max) 1303 max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); 1304 1305 if (unlikely(pos + count > max)) { 1306 retval = -EOVERFLOW; 1307 if (pos >= max) 1308 goto fput_out; 1309 count = max - pos; 1310 } 1311 1312 fl = 0; 1313 #if 0 1314 /* 1315 * We need to debate whether we can enable this or not. The 1316 * man page documents EAGAIN return for the output at least, 1317 * and the application is arguably buggy if it doesn't expect 1318 * EAGAIN on a non-blocking file descriptor. 1319 */ 1320 if (in.file->f_flags & O_NONBLOCK) 1321 fl = SPLICE_F_NONBLOCK; 1322 #endif 1323 file_start_write(out.file); 1324 retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); 1325 file_end_write(out.file); 1326 1327 if (retval > 0) { 1328 add_rchar(current, retval); 1329 add_wchar(current, retval); 1330 fsnotify_access(in.file); 1331 fsnotify_modify(out.file); 1332 out.file->f_pos = out_pos; 1333 if (ppos) 1334 *ppos = pos; 1335 else 1336 in.file->f_pos = pos; 1337 } 1338 1339 inc_syscr(current); 1340 inc_syscw(current); 1341 if (pos > max) 1342 retval = -EOVERFLOW; 1343 1344 fput_out: 1345 fdput(out); 1346 fput_in: 1347 fdput(in); 1348 out: 1349 return retval; 1350 } 1351 1352 SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count) 1353 { 1354 loff_t pos; 1355 off_t off; 1356 ssize_t ret; 1357 1358 if (offset) { 1359 if (unlikely(get_user(off, offset))) 1360 return -EFAULT; 1361 pos = off; 1362 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); 1363 if (unlikely(put_user(pos, offset))) 1364 return -EFAULT; 1365 return ret; 1366 } 1367 1368 return do_sendfile(out_fd, in_fd, NULL, count, 0); 1369 } 1370 1371 SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count) 1372 { 1373 loff_t pos; 1374 ssize_t ret; 1375 1376 if (offset) { 1377 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) 1378 return -EFAULT; 1379 ret = do_sendfile(out_fd, in_fd, &pos, count, 0); 1380 if (unlikely(put_user(pos, offset))) 1381 return -EFAULT; 1382 return ret; 1383 } 1384 1385 return do_sendfile(out_fd, in_fd, NULL, count, 0); 1386 } 1387 1388 #ifdef CONFIG_COMPAT 1389 COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, 1390 compat_off_t __user *, offset, compat_size_t, count) 1391 { 1392 loff_t pos; 1393 off_t off; 1394 ssize_t ret; 1395 1396 if (offset) { 1397 if (unlikely(get_user(off, offset))) 1398 return -EFAULT; 1399 pos = off; 1400 ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); 1401 if (unlikely(put_user(pos, offset))) 1402 return -EFAULT; 1403 return ret; 1404 } 1405 1406 return do_sendfile(out_fd, in_fd, NULL, count, 0); 1407 } 1408 1409 COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, 1410 compat_loff_t __user *, offset, compat_size_t, count) 1411 { 1412 loff_t pos; 1413 ssize_t ret; 1414 1415 if (offset) { 1416 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) 1417 return -EFAULT; 1418 ret = do_sendfile(out_fd, in_fd, &pos, count, 0); 1419 if (unlikely(put_user(pos, offset))) 1420 return -EFAULT; 1421 return ret; 1422 } 1423 1424 return do_sendfile(out_fd, in_fd, NULL, count, 0); 1425 } 1426 #endif 1427