/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files, network, direct splicing, etc and
 * fixing lots of bugs.
 *
 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
 *
 */
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/pipe_fs_i.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/uio.h>

struct partial_page {
	unsigned int offset;
	unsigned int len;
};

/*
 * Passed to splice_to_pipe
 */
struct splice_pipe_desc {
	struct page **pages;		/* page map */
	struct partial_page *partial;	/* pages[] may not be contig */
	int nr_pages;			/* number of pages in map */
	unsigned int flags;		/* splice flags */
	const struct pipe_buf_operations *ops;/* ops associated with output pipe */
};

/*
 * Attempt to steal a page from a pipe buffer. This should perhaps go into
 * a vm helper function, it's already simplified quite a bit by the
 * addition of remove_mapping(). If success is returned, the caller may
 * attempt to reuse this page for another destination.
 */
static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	struct address_space *mapping;

	lock_page(page);

	mapping = page_mapping(page);
	if (mapping) {
		WARN_ON(!PageUptodate(page));

		/*
		 * At least for ext2 with nobh option, we need to wait on
		 * writeback completing on this page, since we'll remove it
		 * from the pagecache. Otherwise truncate won't wait on the
		 * page, allowing the disk blocks to be reused by someone else
		 * before we actually wrote our data to them. fs corruption
		 * ensues.
		 */
		wait_on_page_writeback(page);

		if (PagePrivate(page))
			try_to_release_page(page, GFP_KERNEL);

		/*
		 * If we succeeded in removing the mapping, set LRU flag
		 * and return good.
		 */
		if (remove_mapping(mapping, page)) {
			buf->flags |= PIPE_BUF_FLAG_LRU;
			return 0;
		}
	}

	/*
	 * Raced with truncate or failed to remove page from current
	 * address space, unlock and return failure.
	 */
	unlock_page(page);
	return 1;
}

static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
					struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
	buf->flags &= ~PIPE_BUF_FLAG_LRU;
}

static int page_cache_pipe_buf_pin(struct pipe_inode_info *pipe,
				   struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	int err;

	if (!PageUptodate(page)) {
		lock_page(page);

		/*
		 * Page got truncated/unhashed. This will cause a 0-byte
		 * splice, if this is the first page.
		 */
		if (!page->mapping) {
			err = -ENODATA;
			goto error;
		}

		/*
		 * Uh oh, read-error from disk.
		 */
		if (!PageUptodate(page)) {
			err = -EIO;
			goto error;
		}

		/*
		 * Page is ok after all, we are done.
		 */
		unlock_page(page);
	}

	return 0;
error:
	unlock_page(page);
	return err;
}

static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.pin = page_cache_pipe_buf_pin,
	.release = page_cache_pipe_buf_release,
	.steal = page_cache_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
		return 1;

	buf->flags |= PIPE_BUF_FLAG_LRU;
	return generic_pipe_buf_steal(pipe, buf);
}

static const struct pipe_buf_operations user_page_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.pin = generic_pipe_buf_pin,
	.release = page_cache_pipe_buf_release,
	.steal = user_page_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

/*
 * Pipe output worker. This sets up our pipe format with the page cache
 * pipe buffer operations. Otherwise very similar to the regular pipe_writev().
 */
static ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
			      struct splice_pipe_desc *spd)
{
	int ret, do_wakeup, page_nr;

	ret = 0;
	do_wakeup = 0;
	page_nr = 0;

	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		if (pipe->nrbufs < PIPE_BUFFERS) {
			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;

			buf->page = spd->pages[page_nr];
			buf->offset = spd->partial[page_nr].offset;
			buf->len = spd->partial[page_nr].len;
			buf->ops = spd->ops;
			if (spd->flags & SPLICE_F_GIFT)
				buf->flags |= PIPE_BUF_FLAG_GIFT;

			pipe->nrbufs++;
			page_nr++;
			ret += buf->len;

			if (pipe->inode)
				do_wakeup = 1;

			if (!--spd->nr_pages)
				break;
			if (pipe->nrbufs < PIPE_BUFFERS)
				continue;

			break;
		}

		if (spd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}

		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);

	if (do_wakeup) {
		smp_mb();
		if (waitqueue_active(&pipe->wait))
			wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}

	while (page_nr < spd->nr_pages)
		page_cache_release(spd->pages[page_nr++]);

	return ret;
}
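
/*
 * Illustration (not part of this file): a minimal, hypothetical caller of
 * splice_to_pipe(), showing how the pages[] and partial[] arrays pair up:
 * each partial_page entry gives the offset and length that are valid in the
 * corresponding page, and the ops pointer decides how the pipe will later
 * pin, steal and release those pages. Assumes the caller already holds a
 * reference on a pagecache page; error handling is elided.
 *
 *	static ssize_t example_fill_pipe(struct pipe_inode_info *pipe,
 *					 struct page *page, unsigned int off,
 *					 unsigned int len, unsigned int flags)
 *	{
 *		struct page *pages[1] = { page };
 *		struct partial_page partial[1] = {
 *			{ .offset = off, .len = len },
 *		};
 *		struct splice_pipe_desc spd = {
 *			.pages = pages,
 *			.partial = partial,
 *			.nr_pages = 1,
 *			.flags = flags,
 *			.ops = &page_cache_pipe_buf_ops,
 *		};
 *
 *		return splice_to_pipe(pipe, &spd);
 *	}
 */
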
static int
__generic_file_splice_read(struct file *in, loff_t *ppos,
			   struct pipe_inode_info *pipe, size_t len,
			   unsigned int flags)
{
	struct address_space *mapping = in->f_mapping;
	unsigned int loff, nr_pages;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct page *page;
	pgoff_t index, end_index;
	loff_t isize;
	size_t total_len;
	int error, page_nr;
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &page_cache_pipe_buf_ops,
	};

	index = *ppos >> PAGE_CACHE_SHIFT;
	loff = *ppos & ~PAGE_CACHE_MASK;
	nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	if (nr_pages > PIPE_BUFFERS)
		nr_pages = PIPE_BUFFERS;

	/*
	 * Don't try to second-guess the read-ahead logic, call into
	 * page_cache_readahead() like the page cache reads would do.
	 */
	page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);

	/*
	 * Now fill in the holes:
	 */
	error = 0;
	total_len = 0;

	/*
	 * Lookup the (hopefully) full range of pages we need.
	 */
	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);

	/*
	 * If find_get_pages_contig() returned fewer pages than we needed,
	 * allocate the rest.
	 */
	index += spd.nr_pages;
	while (spd.nr_pages < nr_pages) {
		/*
		 * Page could be there, find_get_pages_contig() breaks on
		 * the first hole.
		 */
		page = find_get_page(mapping, index);
		if (!page) {
			/*
			 * Make sure the read-ahead engine is notified
			 * about this failure.
			 */
			handle_ra_miss(mapping, &in->f_ra, index);

			/*
			 * page didn't exist, allocate one.
			 */
			page = page_cache_alloc_cold(mapping);
			if (!page)
				break;

			error = add_to_page_cache_lru(page, mapping, index,
						      GFP_KERNEL);
			if (unlikely(error)) {
				page_cache_release(page);
				if (error == -EEXIST)
					continue;
				break;
			}
			/*
			 * add_to_page_cache() locks the page, unlock it
			 * to avoid convoluting the logic below even more.
			 */
			unlock_page(page);
		}

		pages[spd.nr_pages++] = page;
		index++;
	}

	/*
	 * Now loop over the map and see if we need to start IO on any
	 * pages, fill in the partial map, etc.
	 */
	index = *ppos >> PAGE_CACHE_SHIFT;
	nr_pages = spd.nr_pages;
	spd.nr_pages = 0;
	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
		unsigned int this_len;

		if (!len)
			break;

		/*
		 * this_len is the max we'll use from this page
		 */
		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
		page = pages[page_nr];

		/*
		 * If the page isn't uptodate, we may need to start io on it
		 */
		if (!PageUptodate(page)) {
			/*
			 * If in nonblock mode then don't block on waiting
			 * for an in-flight io page
			 */
			if (flags & SPLICE_F_NONBLOCK) {
				if (TestSetPageLocked(page))
					break;
			} else
				lock_page(page);

			/*
			 * page was truncated, stop here. if this isn't the
			 * first page, we'll just complete what we already
			 * added
			 */
			if (!page->mapping) {
				unlock_page(page);
				break;
			}
			/*
			 * page was already under io and is now done, great
			 */
			if (PageUptodate(page)) {
				unlock_page(page);
				goto fill_it;
			}

			/*
			 * need to read in the page
			 */
			error = mapping->a_ops->readpage(in, page);
			if (unlikely(error)) {
				/*
				 * We really should re-lookup the page here,
				 * but it complicates things a lot. Instead
				 * let's just do what we already stored, and
				 * we'll get it the next time we are called.
				 */
				if (error == AOP_TRUNCATED_PAGE)
					error = 0;

				break;
			}

			/*
			 * i_size must be checked after ->readpage().
			 */
			isize = i_size_read(mapping->host);
			end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
			if (unlikely(!isize || index > end_index))
				break;

			/*
			 * if this is the last page, see if we need to shrink
			 * the length and stop
			 */
			if (end_index == index) {
				loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK);
				if (total_len + loff > isize)
					break;
				/*
				 * force quit after adding this page
				 */
				len = this_len;
				this_len = min(this_len, loff);
				loff = 0;
			}
		}
fill_it:
		partial[page_nr].offset = loff;
		partial[page_nr].len = this_len;
		len -= this_len;
		total_len += this_len;
		loff = 0;
		spd.nr_pages++;
		index++;
	}

	/*
	 * Release any pages at the end, if we quit early. 'page_nr' is how
	 * far we got, 'nr_pages' is how many pages are in the map.
	 */
	while (page_nr < nr_pages)
		page_cache_release(pages[page_nr++]);

	if (spd.nr_pages)
		return splice_to_pipe(pipe, &spd);

	return error;
}

/**
 * generic_file_splice_read - splice data from file to a pipe
 * @in:		file to splice from
 * @ppos:	position in @in
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will read pages from given file and fill them into a pipe.
 */
ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
				 struct pipe_inode_info *pipe, size_t len,
				 unsigned int flags)
{
	ssize_t spliced;
	int ret;

	ret = 0;
	spliced = 0;

	while (len) {
		ret = __generic_file_splice_read(in, ppos, pipe, len, flags);

		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}

		*ppos += ret;
		len -= ret;
		spliced += ret;
	}

	if (spliced)
		return spliced;

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_read);

/*
 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 * using sendpage(). Return the number of bytes sent.
 */
static int pipe_to_sendpage(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, struct splice_desc *sd)
{
	struct file *file = sd->file;
	loff_t pos = sd->pos;
	int ret, more;

	ret = buf->ops->pin(pipe, buf);
	if (!ret) {
		more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;

		ret = file->f_op->sendpage(file, buf->page, buf->offset,
					   sd->len, &pos, more);
	}

	return ret;
}
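
/*
 * Usage illustration (userspace, not part of this file): the sendpage path
 * above is what a file -> socket transfer ends up in. A rough sketch of how
 * an application would drive it with the splice(2) syscall, moving a file
 * over an already-connected socket through an anonymous pipe. Assumes the
 * glibc splice() wrapper; short writes and error handling are abbreviated.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	static int send_file(int file_fd, int sock_fd, size_t count)
 *	{
 *		int pfd[2];
 *
 *		if (pipe(pfd) < 0)
 *			return -1;
 *		while (count) {
 *			ssize_t n = splice(file_fd, NULL, pfd[1], NULL,
 *					   count, SPLICE_F_MOVE | SPLICE_F_MORE);
 *			if (n <= 0)
 *				break;
 *			splice(pfd[0], NULL, sock_fd, NULL, n, SPLICE_F_MOVE);
 *			count -= n;
 *		}
 *		close(pfd[0]);
 *		close(pfd[1]);
 *		return count ? -1 : 0;
 *	}
 */
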
/*
 * This is a little more tricky than the file -> pipe splicing. There are
 * basically three cases:
 *
 *	- Destination page already exists in the address space and there
 *	  are users of it. For that case we have no other option than
 *	  copying the data. Tough luck.
 *	- Destination page already exists in the address space, but there
 *	  are no users of it. Make sure it's uptodate, then drop it. Fall
 *	  through to last case.
 *	- Destination page does not exist, we can add the pipe page to
 *	  the page cache and avoid the copy.
 *
 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 * sd->flags), we attempt to migrate pages from the pipe to the output
 * file address space page cache. This is possible if no one else has
 * the pipe page referenced outside of the pipe and page cache. If
 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 * a new page in the output file page cache and fill/dirty that.
 */
static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	struct file *file = sd->file;
	struct address_space *mapping = file->f_mapping;
	unsigned int offset, this_len;
	struct page *page;
	pgoff_t index;
	int ret;

	/*
	 * make sure the data in this buffer is uptodate
	 */
	ret = buf->ops->pin(pipe, buf);
	if (unlikely(ret))
		return ret;

	index = sd->pos >> PAGE_CACHE_SHIFT;
	offset = sd->pos & ~PAGE_CACHE_MASK;

	this_len = sd->len;
	if (this_len + offset > PAGE_CACHE_SIZE)
		this_len = PAGE_CACHE_SIZE - offset;

find_page:
	page = find_lock_page(mapping, index);
	if (!page) {
		ret = -ENOMEM;
		page = page_cache_alloc_cold(mapping);
		if (unlikely(!page))
			goto out_ret;

		/*
		 * This will also lock the page
		 */
		ret = add_to_page_cache_lru(page, mapping, index,
					    GFP_KERNEL);
		if (unlikely(ret))
			goto out;
	}

	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
	if (unlikely(ret)) {
		loff_t isize = i_size_read(mapping->host);

		if (ret != AOP_TRUNCATED_PAGE)
			unlock_page(page);
		page_cache_release(page);
		if (ret == AOP_TRUNCATED_PAGE)
			goto find_page;

		/*
		 * prepare_write() may have instantiated a few blocks
		 * outside i_size. Trim these off again.
		 */
		if (sd->pos + this_len > isize)
			vmtruncate(mapping->host, isize);

		goto out_ret;
	}

	if (buf->page != page) {
		/*
		 * Careful, ->map() uses KM_USER0!
		 */
		char *src = buf->ops->map(pipe, buf, 1);
		char *dst = kmap_atomic(page, KM_USER1);

		memcpy(dst + offset, src + buf->offset, this_len);
		flush_dcache_page(page);
		kunmap_atomic(dst, KM_USER1);
		buf->ops->unmap(pipe, buf, src);
	}

	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
	if (ret) {
		if (ret == AOP_TRUNCATED_PAGE) {
			page_cache_release(page);
			goto find_page;
		}
		if (ret < 0)
			goto out;
		/*
		 * Partial write has happened, so 'ret' is already initialized
		 * to the number of bytes written, there is nothing more we
		 * have to do here.
		 */
	} else
		ret = this_len;
	/*
	 * Return the number of bytes written and mark page as
	 * accessed, we are now done!
	 */
	mark_page_accessed(page);
	balance_dirty_pages_ratelimited(mapping);
out:
	unlock_page(page);
	page_cache_release(page);
out_ret:
	return ret;
}

/*
 * Pipe input worker. Most of this logic works like a regular pipe, the
 * key here is the 'actor' worker passed in that actually moves the data
 * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
 */
ssize_t __splice_from_pipe(struct pipe_inode_info *pipe,
			   struct file *out, loff_t *ppos, size_t len,
			   unsigned int flags, splice_actor *actor)
{
	int ret, do_wakeup, err;
	struct splice_desc sd;

	ret = 0;
	do_wakeup = 0;

	sd.total_len = len;
	sd.flags = flags;
	sd.file = out;
	sd.pos = *ppos;

	for (;;) {
		if (pipe->nrbufs) {
			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
			const struct pipe_buf_operations *ops = buf->ops;

			sd.len = buf->len;
			if (sd.len > sd.total_len)
				sd.len = sd.total_len;

			err = actor(pipe, buf, &sd);
			if (err <= 0) {
				if (!ret && err != -ENODATA)
					ret = err;

				break;
			}

			ret += err;
			buf->offset += err;
			buf->len -= err;

			sd.len -= err;
			sd.pos += err;
			sd.total_len -= err;
			if (sd.len)
				continue;

			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
				pipe->nrbufs--;
				if (pipe->inode)
					do_wakeup = 1;
			}

			if (!sd.total_len)
				break;
		}

		if (pipe->nrbufs)
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (ret)
				break;
		}

		if (flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
			do_wakeup = 0;
		}

		pipe_wait(pipe);
	}

	if (do_wakeup) {
		smp_mb();
		if (waitqueue_active(&pipe->wait))
			wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}

	return ret;
}
EXPORT_SYMBOL(__splice_from_pipe);

ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags,
			 splice_actor *actor)
{
	ssize_t ret;
	struct inode *inode = out->f_mapping->host;

	/*
	 * The actor worker might be calling ->prepare_write and
	 * ->commit_write. Most of the time, these expect i_mutex to
	 * be held. Since this may result in an ABBA deadlock with
	 * pipe->inode, we have to order lock acquisition here.
	 */
	inode_double_lock(inode, pipe->inode);
	ret = __splice_from_pipe(pipe, out, ppos, len, flags, actor);
	inode_double_unlock(inode, pipe->inode);

	return ret;
}
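
/*
 * Actor contract illustration (hypothetical, not used anywhere): an actor
 * is handed one pipe_buffer at a time together with the splice_desc and
 * must return how many bytes it consumed (at most sd->len), or an error.
 * __splice_from_pipe() advances buf->offset/buf->len and sd->pos by that
 * amount and releases the buffer once it is empty. A do-nothing actor that
 * simply discards the data could look roughly like this:
 *
 *	static int pipe_to_null(struct pipe_inode_info *pipe,
 *				struct pipe_buffer *buf,
 *				struct splice_desc *sd)
 *	{
 *		int ret;
 *
 *		ret = buf->ops->pin(pipe, buf);
 *		if (unlikely(ret))
 *			return ret;
 *
 *		return sd->len;
 *	}
 */
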
/**
 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes
 * @pipe:	pipe info
 * @out:	file to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will either move or copy pages (determined by @flags options) from
 * the given pipe inode to the given file. The caller is responsible
 * for acquiring i_mutex on both inodes.
 *
 */
ssize_t
generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
				 loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;
	int err;

	err = remove_suid(out->f_path.dentry);
	if (unlikely(err))
		return err;

	ret = __splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
	if (ret > 0) {
		*ppos += ret;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);

			if (err)
				ret = err;
		}
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write_nolock);

/**
 * generic_file_splice_write - splice data from a pipe to a file
 * @pipe:	pipe info
 * @out:	file to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will either move or copy pages (determined by @flags options) from
 * the given pipe inode to the given file.
 *
 */
ssize_t
generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
			  loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;
	int err;

	err = should_remove_suid(out->f_path.dentry);
	if (unlikely(err)) {
		mutex_lock(&inode->i_mutex);
		err = __remove_suid(out->f_path.dentry, err);
		mutex_unlock(&inode->i_mutex);
		if (err)
			return err;
	}

	ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
	if (ret > 0) {
		*ppos += ret;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			mutex_lock(&inode->i_mutex);
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);
			mutex_unlock(&inode->i_mutex);

			if (err)
				ret = err;
		}
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write);

/**
 * generic_splice_sendpage - splice data from a pipe to a socket
 * @pipe:	pipe to splice from
 * @out:	socket to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Will send @len bytes from the pipe to a network socket. No data copying
 * is involved.
 *
 */
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
				loff_t *ppos, size_t len, unsigned int flags)
{
	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_splice_sendpage);

/*
 * Attempt to initiate a splice from pipe to file.
 */
static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
			   loff_t *ppos, size_t len, unsigned int flags)
{
	int ret;

	if (unlikely(!out->f_op || !out->f_op->splice_write))
		return -EINVAL;

	if (unlikely(!(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	ret = rw_verify_area(WRITE, out, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	return out->f_op->splice_write(pipe, out, ppos, len, flags);
}
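
/*
 * Hook-up illustration: a filesystem that keeps its data in the page cache
 * can typically use the generic helpers above directly by wiring them into
 * its file_operations, roughly like the sketch below (field names as in this
 * kernel generation; the surrounding struct is abbreviated and hypothetical).
 *
 *	const struct file_operations example_file_operations = {
 *		.read		= do_sync_read,
 *		.write		= do_sync_write,
 *		.aio_read	= generic_file_aio_read,
 *		.aio_write	= generic_file_aio_write,
 *		.mmap		= generic_file_mmap,
 *		.splice_read	= generic_file_splice_read,
 *		.splice_write	= generic_file_splice_write,
 *	};
 */
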
/*
 * Attempt to initiate a splice from a file to a pipe.
 */
static long do_splice_to(struct file *in, loff_t *ppos,
			 struct pipe_inode_info *pipe, size_t len,
			 unsigned int flags)
{
	loff_t isize, left;
	int ret;

	if (unlikely(!in->f_op || !in->f_op->splice_read))
		return -EINVAL;

	if (unlikely(!(in->f_mode & FMODE_READ)))
		return -EBADF;

	ret = rw_verify_area(READ, in, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	isize = i_size_read(in->f_mapping->host);
	if (unlikely(*ppos >= isize))
		return 0;

	left = isize - *ppos;
	if (unlikely(left < len))
		len = left;

	return in->f_op->splice_read(in, ppos, pipe, len, flags);
}

long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	loff_t out_off;
	umode_t i_mode;
	int i;

	/*
	 * We require the input to be a regular file, as we don't want to
	 * randomly drop data for eg socket -> socket splicing. Use the
	 * piped splicing for that!
	 */
	i_mode = in->f_path.dentry->d_inode->i_mode;
	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
		return -EINVAL;

	/*
	 * neither in nor out is a pipe, set up an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info(NULL);
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	ret = 0;
	bytes = 0;
	out_off = 0;

	while (len) {
		size_t read_len, max_read_len;

		/*
		 * Do at most PIPE_BUFFERS pages worth of transfer:
		 */
		max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE));

		ret = do_splice_to(in, ppos, pipe, max_read_len, flags);
		if (unlikely(ret < 0))
			goto out_release;

		read_len = ret;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = do_splice_from(pipe, out, &out_off, read_len,
				     flags & ~SPLICE_F_NONBLOCK);
		if (unlikely(ret < 0))
			goto out_release;

		bytes += ret;
		len -= ret;

		/*
		 * In nonblocking mode, if we got back a short read then
		 * that was due to either an IO error or due to the
		 * pagecache entry not being there. In the IO error case
		 * the _next_ splice attempt will produce a clean IO error
		 * return value (not a short read), so in both cases it's
		 * correct to break out of the loop here:
		 */
		if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len))
			break;
	}

	pipe->nrbufs = pipe->curbuf = 0;

	return bytes;

out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < PIPE_BUFFERS; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;

		if (buf->ops) {
			buf->ops->release(pipe, buf);
			buf->ops = NULL;
		}
	}
	pipe->nrbufs = pipe->curbuf = 0;

	/*
	 * If we transferred some data, return the number of bytes:
	 */
	if (bytes > 0)
		return bytes;

	return ret;
}

EXPORT_SYMBOL(do_splice_direct);

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
static inline struct pipe_inode_info *pipe_info(struct inode *inode)
{
	if (S_ISFIFO(inode->i_mode))
		return inode->i_pipe;

	return NULL;
}

/*
 * Determine where to splice to/from.
 */
static long do_splice(struct file *in, loff_t __user *off_in,
		      struct file *out, loff_t __user *off_out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	loff_t offset, *off;
	long ret;

	pipe = pipe_info(in->f_path.dentry->d_inode);
	if (pipe) {
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			if (out->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &out->f_pos;

		ret = do_splice_from(pipe, out, off, len, flags);

		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	pipe = pipe_info(out->f_path.dentry->d_inode);
	if (pipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			if (in->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &in->f_pos;

		ret = do_splice_to(in, off, pipe, len, flags);

		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	return -EINVAL;
}
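
/*
 * Offset-handling illustration (userspace, hypothetical helper): do_splice()
 * above only accepts an offset pointer for the non-pipe side. A byte range
 * can be copied from one file to another through a temporary pipe by
 * alternating two splice(2) calls with explicit offsets; short writes and
 * error handling are abbreviated.
 *
 *	#define _GNU_SOURCE
 *	#include <sys/types.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	static int copy_range(int in_fd, int out_fd, loff_t off, size_t count)
 *	{
 *		int pfd[2];
 *		loff_t in_off = off, out_off = off;
 *
 *		if (pipe(pfd) < 0)
 *			return -1;
 *		while (count) {
 *			ssize_t n = splice(in_fd, &in_off, pfd[1], NULL,
 *					   count, 0);
 *			if (n <= 0)
 *				break;
 *			splice(pfd[0], NULL, out_fd, &out_off, n, 0);
 *			count -= n;
 *		}
 *		close(pfd[0]);
 *		close(pfd[1]);
 *		return count ? -1 : 0;
 *	}
 */
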
/*
 * Map an iov into an array of pages and offset/length tuples. With the
 * partial_page structure, we can map several non-contiguous ranges into
 * our one pages[] map instead of splitting that operation into pieces.
 * Could easily be exported as a generic helper for other users, in which
 * case one would probably want to add a 'max_nr_pages' parameter as well.
 */
static int get_iovec_page_array(const struct iovec __user *iov,
				unsigned int nr_vecs, struct page **pages,
				struct partial_page *partial, int aligned)
{
	int buffers = 0, error = 0;

	/*
	 * It's ok to take the mmap_sem for reading, even
	 * across a "get_user()".
	 */
	down_read(&current->mm->mmap_sem);

	while (nr_vecs) {
		unsigned long off, npages;
		void __user *base;
		size_t len;
		int i;

		/*
		 * Get user address base and length for this iovec.
		 */
		error = get_user(base, &iov->iov_base);
		if (unlikely(error))
			break;
		error = get_user(len, &iov->iov_len);
		if (unlikely(error))
			break;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		if (unlikely(!len))
			break;
		error = -EFAULT;
		if (unlikely(!base))
			break;

		/*
		 * Get this base offset and number of pages, then map
		 * in the user pages.
		 */
		off = (unsigned long) base & ~PAGE_MASK;

		/*
		 * If asked for alignment, the offset must be zero and the
		 * length a multiple of the PAGE_SIZE.
		 */
		error = -EINVAL;
		if (aligned && (off || len & ~PAGE_MASK))
			break;

		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (npages > PIPE_BUFFERS - buffers)
			npages = PIPE_BUFFERS - buffers;

		error = get_user_pages(current, current->mm,
				       (unsigned long) base, npages, 0, 0,
				       &pages[buffers], NULL);

		if (unlikely(error <= 0))
			break;

		/*
		 * Fill this contiguous range into the partial page map.
		 */
		for (i = 0; i < error; i++) {
			const int plen = min_t(size_t, len, PAGE_SIZE - off);

			partial[buffers].offset = off;
			partial[buffers].len = plen;

			off = 0;
			len -= plen;
			buffers++;
		}

		/*
		 * We didn't complete this iov, stop here since it probably
		 * means we have to move some of this into a pipe to
		 * be able to continue.
		 */
		if (len)
			break;

		/*
		 * Don't continue if we mapped fewer pages than we asked for,
		 * or if we mapped the max number of pages that we have
		 * room for.
		 */
		if (error < npages || buffers == PIPE_BUFFERS)
			break;

		nr_vecs--;
		iov++;
	}

	up_read(&current->mm->mmap_sem);

	if (buffers)
		return buffers;

	return error;
}
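
/*
 * Userspace view (hypothetical snippet): the iovec mapping above is what
 * backs the vmsplice path below. Gathering two user buffers into a pipe
 * with the vmsplice(2) syscall looks roughly like this (glibc wrapper
 * assumed, error handling elided). Note that SPLICE_F_GIFT additionally
 * requires page-aligned, page-sized segments because of the 'aligned'
 * check above; plain 0 flags have no such restriction.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <sys/uio.h>
 *
 *	static ssize_t gather_buffers(int pipe_wr_fd, void *a, size_t alen,
 *				      void *b, size_t blen)
 *	{
 *		struct iovec iov[2] = {
 *			{ .iov_base = a, .iov_len = alen },
 *			{ .iov_base = b, .iov_len = blen },
 *		};
 *
 *		return vmsplice(pipe_wr_fd, iov, 2, 0);
 *	}
 */
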
/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
 *
 * Note that vmsplice only supports splicing _from_ user memory to a pipe,
 * not the other way around. Splicing from user memory is a simple operation
 * that can be supported without any funky alignment restrictions or nasty
 * vm tricks. We simply map in the user memory and fill it into a pipe.
 * The reverse isn't quite as easy, though. There are two possible solutions
 * for that:
 *
 *	- memcpy() the data internally, at which point we might as well just
 *	  do a regular read() on the buffer anyway.
 *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
 *	  has restrictions on both ends of the pipe).
 *
 * Alas, it isn't here.
 *
 */
static long do_vmsplice(struct file *file, const struct iovec __user *iov,
			unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &user_page_pipe_buf_ops,
	};

	pipe = pipe_info(file->f_path.dentry->d_inode);
	if (!pipe)
		return -EBADF;
	if (unlikely(nr_segs > UIO_MAXIOV))
		return -EINVAL;
	else if (unlikely(!nr_segs))
		return 0;

	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
					    flags & SPLICE_F_GIFT);
	if (spd.nr_pages <= 0)
		return spd.nr_pages;

	return splice_to_pipe(pipe, &spd);
}

asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct file *file;
	long error;
	int fput;

	error = -EBADF;
	file = fget_light(fd, &fput);
	if (file) {
		if (file->f_mode & FMODE_WRITE)
			error = do_vmsplice(file, iov, nr_segs, flags);

		fput_light(file, fput);
	}

	return error;
}

asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
			   int fd_out, loff_t __user *off_out,
			   size_t len, unsigned int flags)
{
	long error;
	struct file *in, *out;
	int fput_in, fput_out;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fd_in, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			out = fget_light(fd_out, &fput_out);
			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_splice(in, off_in,
							  out, off_out,
							  len, flags);
				fput_light(out, fput_out);
			}
		}

		fput_light(in, fput_in);
	}

	return error;
}

/*
 * Make sure there's data to read. Wait for input if we can, otherwise
 * return an appropriate error.
 */
static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is ok.
	 */
	if (pipe->nrbufs)
		return 0;

	ret = 0;
	mutex_lock(&pipe->inode->i_mutex);

	while (!pipe->nrbufs) {
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		pipe_wait(pipe);
	}

	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

/*
 * Make sure there's writable room. Wait for room if we can, otherwise
 * return an appropriate error.
 */
static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is ok.
	 */
	if (pipe->nrbufs < PIPE_BUFFERS)
		return 0;

	ret = 0;
	mutex_lock(&pipe->inode->i_mutex);

	while (pipe->nrbufs >= PIPE_BUFFERS) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			ret = -EPIPE;
			break;
		}
		if (flags & SPLICE_F_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

/*
 * Link contents of ipipe to opipe.
 */
static int link_pipe(struct pipe_inode_info *ipipe,
		     struct pipe_inode_info *opipe,
		     size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	int ret = 0, i = 0, nbuf;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by inode address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	inode_double_lock(ipipe->inode, opipe->inode);

	do {
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		/*
		 * If we have iterated all input buffers or run out of
		 * output room, break.
		 */
		if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
			break;

		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
		nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);

		/*
		 * Get a reference to this pipe buffer,
		 * so we can copy the contents over.
		 */
		ibuf->ops->get(ipipe, ibuf);

		obuf = opipe->bufs + nbuf;
		*obuf = *ibuf;

		/*
		 * Don't inherit the gift flag, we need to
		 * prevent multiple steals of this page.
		 */
		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;

		if (obuf->len > len)
			obuf->len = len;

		opipe->nrbufs++;
		ret += obuf->len;
		len -= obuf->len;
		i++;
	} while (len);

	inode_double_unlock(ipipe->inode, opipe->inode);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0) {
		smp_mb();
		if (waitqueue_active(&opipe->wait))
			wake_up_interruptible(&opipe->wait);
		kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
	}

	return ret;
}

/*
 * This is a tee(1) implementation that works on pipes. It doesn't copy
 * any data, it simply references the 'in' pages on the 'out' pipe.
 * The 'flags' used are the SPLICE_F_* variants, currently the only
 * applicable one is SPLICE_F_NONBLOCK.
 */
static long do_tee(struct file *in, struct file *out, size_t len,
		   unsigned int flags)
{
	struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
	struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
	int ret = -EINVAL;

	/*
	 * Duplicate the contents of ipipe to opipe without actually
	 * copying the data.
	 */
	if (ipipe && opipe && ipipe != opipe) {
		/*
		 * Keep going, unless we encounter an error. The ipipe/opipe
		 * ordering doesn't really matter.
		 */
		ret = link_ipipe_prep(ipipe, flags);
		if (!ret) {
			ret = link_opipe_prep(opipe, flags);
			if (!ret) {
				ret = link_pipe(ipipe, opipe, len, flags);
				if (!ret && (flags & SPLICE_F_NONBLOCK))
					ret = -EAGAIN;
			}
		}
	}

	return ret;
}

asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
{
	struct file *in;
	int error, fput_in;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fdin, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			int fput_out;
			struct file *out = fget_light(fdout, &fput_out);

			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_tee(in, out, len, flags);
				fput_light(out, fput_out);
			}
		}
		fput_light(in, fput_in);
	}

	return error;
}
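
/*
 * Userspace illustration (hypothetical helper): tee(2) duplicates pipe
 * contents without consuming them, so the same data can afterwards be
 * splice(2)d elsewhere. A rough sketch of teeing stdin (a pipe) to stdout
 * (another pipe) while also writing it to a file; error handling is
 * abbreviated.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <limits.h>
 *	#include <unistd.h>
 *
 *	static int tee_to_file(int log_fd)
 *	{
 *		for (;;) {
 *			ssize_t n = tee(STDIN_FILENO, STDOUT_FILENO,
 *					INT_MAX, 0);
 *			if (n < 0)
 *				return -1;
 *			if (n == 0)
 *				break;
 *			while (n > 0) {
 *				ssize_t w = splice(STDIN_FILENO, NULL, log_fd,
 *						   NULL, n, SPLICE_F_MOVE);
 *				if (w <= 0)
 *					return -1;
 *				n -= w;
 *			}
 *		}
 *		return 0;
 *	}
 */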