/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files, network, direct splicing, etc. and
 * fixing lots of bugs.
 *
 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
 *
 */
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/uio.h>

/*
 * Attempt to steal a page from a pipe buffer. This should perhaps go into
 * a vm helper function, it's already simplified quite a bit by the
 * addition of remove_mapping(). If success is returned, the caller may
 * attempt to reuse this page for another destination.
 */
static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	struct address_space *mapping;

	lock_page(page);

	mapping = page_mapping(page);
	if (mapping) {
		WARN_ON(!PageUptodate(page));

		/*
		 * At least for ext2 with nobh option, we need to wait on
		 * writeback completing on this page, since we'll remove it
		 * from the pagecache. Otherwise truncate won't wait on the
		 * page, allowing the disk blocks to be reused by someone else
		 * before we actually wrote our data to them. fs corruption
		 * ensues.
		 */
		wait_on_page_writeback(page);

		if (PagePrivate(page))
			try_to_release_page(page, GFP_KERNEL);

		/*
		 * If we succeeded in removing the mapping, set LRU flag
		 * and return good.
		 */
		if (remove_mapping(mapping, page)) {
			buf->flags |= PIPE_BUF_FLAG_LRU;
			return 0;
		}
	}

	/*
	 * Raced with truncate or failed to remove page from current
	 * address space, unlock and return failure.
	 */
	unlock_page(page);
	return 1;
}

static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
					struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
	buf->flags &= ~PIPE_BUF_FLAG_LRU;
}

/*
 * Check whether the contents of buf are OK to access. Since the content
 * is a page cache page, IO may be in flight.
 */
static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
				       struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	int err;

	if (!PageUptodate(page)) {
		lock_page(page);

		/*
		 * Page got truncated/unhashed. This will cause a 0-byte
		 * splice, if this is the first page.
		 */
		if (!page->mapping) {
			err = -ENODATA;
			goto error;
		}

		/*
		 * Uh oh, read error from disk.
		 */
		if (!PageUptodate(page)) {
			err = -EIO;
			goto error;
		}

		/*
		 * Page is ok after all, we are done.
		 */
		unlock_page(page);
	}

	return 0;
error:
	unlock_page(page);
	return err;
}

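/*
 * A note on the callback contract used by the ops tables below: ->steal()
 * returns 0 on success, with the page locked and detached from its old
 * mapping so that the caller may reuse it, and 1 on failure. ->confirm()
 * returns 0 once the buffer contents are safe to access, as
 * page_cache_pipe_buf_confirm() above does for in-flight page cache IO.
 */
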
static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = page_cache_pipe_buf_confirm,
	.release = page_cache_pipe_buf_release,
	.steal = page_cache_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
		return 1;

	buf->flags |= PIPE_BUF_FLAG_LRU;
	return generic_pipe_buf_steal(pipe, buf);
}

static const struct pipe_buf_operations user_page_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = page_cache_pipe_buf_release,
	.steal = user_page_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

/**
 * splice_to_pipe - fill passed data into a pipe
 * @pipe:	pipe to fill
 * @spd:	data to fill
 *
 * Description:
 *    @spd contains a map of pages and len/offset tuples, along with
 *    the struct pipe_buf_operations associated with these pages. This
 *    function will link that data to the pipe.
 *
 */
ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
		       struct splice_pipe_desc *spd)
{
	unsigned int spd_pages = spd->nr_pages;
	int ret, do_wakeup, page_nr;

	ret = 0;
	do_wakeup = 0;
	page_nr = 0;

	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		if (pipe->nrbufs < PIPE_BUFFERS) {
			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;

			buf->page = spd->pages[page_nr];
			buf->offset = spd->partial[page_nr].offset;
			buf->len = spd->partial[page_nr].len;
			buf->private = spd->partial[page_nr].private;
			buf->ops = spd->ops;
			if (spd->flags & SPLICE_F_GIFT)
				buf->flags |= PIPE_BUF_FLAG_GIFT;

			pipe->nrbufs++;
			page_nr++;
			ret += buf->len;

			if (pipe->inode)
				do_wakeup = 1;

			if (!--spd->nr_pages)
				break;
			if (pipe->nrbufs < PIPE_BUFFERS)
				continue;

			break;
		}

		if (spd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}

		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	if (pipe->inode) {
		mutex_unlock(&pipe->inode->i_mutex);

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		}
	}

	while (page_nr < spd_pages)
		page_cache_release(spd->pages[page_nr++]);

	return ret;
}

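/*
 * Usage sketch (illustrative, not part of this file): a caller typically
 * fills a splice_pipe_desc from on-stack page/partial arrays and hands it
 * to splice_to_pipe(), as __generic_file_splice_read() below does. The
 * "my_" names are hypothetical:
 *
 *	struct page *my_pages[PIPE_BUFFERS];
 *	struct partial_page my_partial[PIPE_BUFFERS];
 *	struct splice_pipe_desc spd = {
 *		.pages		= my_pages,
 *		.partial	= my_partial,
 *		.flags		= flags,
 *		.ops		= &page_cache_pipe_buf_ops,
 *	};
 *
 *	spd.nr_pages = ...;	// fill the arrays, one page reference each
 *	return splice_to_pipe(pipe, &spd);
 *
 * splice_to_pipe() takes ownership of the references it links into the
 * pipe and drops the references of any pages it could not fit.
 */
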
static int
__generic_file_splice_read(struct file *in, loff_t *ppos,
			   struct pipe_inode_info *pipe, size_t len,
			   unsigned int flags)
{
	struct address_space *mapping = in->f_mapping;
	unsigned int loff, nr_pages;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct page *page;
	pgoff_t index, end_index;
	loff_t isize;
	int error, page_nr;
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &page_cache_pipe_buf_ops,
	};

	index = *ppos >> PAGE_CACHE_SHIFT;
	loff = *ppos & ~PAGE_CACHE_MASK;
	nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	if (nr_pages > PIPE_BUFFERS)
		nr_pages = PIPE_BUFFERS;

	/*
	 * Don't try to second-guess the read-ahead logic, call into
	 * page_cache_readahead() like the page cache reads would do.
	 */
	page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);

	/*
	 * Look up the (hopefully) full range of pages we need.
	 */
	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);

	/*
	 * If find_get_pages_contig() returned fewer pages than we needed,
	 * allocate the rest and fill in the holes.
	 */
	error = 0;
	index += spd.nr_pages;
	while (spd.nr_pages < nr_pages) {
		/*
		 * Page could be there, find_get_pages_contig() breaks on
		 * the first hole.
		 */
		page = find_get_page(mapping, index);
		if (!page) {
			/*
			 * Make sure the read-ahead engine is notified
			 * about this failure.
			 */
			handle_ra_miss(mapping, &in->f_ra, index);

			/*
			 * page didn't exist, allocate one.
			 */
			page = page_cache_alloc_cold(mapping);
			if (!page)
				break;

			error = add_to_page_cache_lru(page, mapping, index,
						      GFP_KERNEL);
			if (unlikely(error)) {
				page_cache_release(page);
				if (error == -EEXIST)
					continue;
				break;
			}
			/*
			 * add_to_page_cache() locks the page, unlock it
			 * to avoid convoluting the logic below even more.
			 */
			unlock_page(page);
		}

		pages[spd.nr_pages++] = page;
		index++;
	}

	/*
	 * Now loop over the map and see if we need to start IO on any
	 * pages, fill in the partial map, etc.
	 */
	index = *ppos >> PAGE_CACHE_SHIFT;
	nr_pages = spd.nr_pages;
	spd.nr_pages = 0;
	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
		unsigned int this_len;

		if (!len)
			break;

		/*
		 * this_len is the max we'll use from this page
		 */
		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
		page = pages[page_nr];

		/*
		 * If the page isn't uptodate, we may need to start io on it
		 */
		if (!PageUptodate(page)) {
			/*
			 * If in nonblock mode then don't block on waiting
			 * for an in-flight io page
			 */
			if (flags & SPLICE_F_NONBLOCK) {
				if (TestSetPageLocked(page))
					break;
			} else
				lock_page(page);

			/*
			 * page was truncated, stop here. if this isn't the
			 * first page, we'll just complete what we already
			 * added
			 */
			if (!page->mapping) {
				unlock_page(page);
				break;
			}
			/*
			 * page was already under io and is now done, great
			 */
			if (PageUptodate(page)) {
				unlock_page(page);
				goto fill_it;
			}

			/*
			 * need to read in the page
			 */
			error = mapping->a_ops->readpage(in, page);
			if (unlikely(error)) {
				/*
				 * We really should re-lookup the page here,
				 * but it complicates things a lot. Instead
				 * let's just do what we already stored, and
				 * we'll get it the next time we are called.
				 */
				if (error == AOP_TRUNCATED_PAGE)
					error = 0;

				break;
			}
		}
fill_it:
		/*
		 * i_size must be checked after PageUptodate.
		 */
		isize = i_size_read(mapping->host);
		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
		if (unlikely(!isize || index > end_index))
			break;

		/*
		 * if this is the last page, see if we need to shrink
		 * the length and stop
		 */
		if (end_index == index) {
			unsigned int plen;

			/*
			 * max good bytes in this page
			 */
			plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (plen <= loff)
				break;

			/*
			 * force quit after adding this page
			 */
			this_len = min(this_len, plen - loff);
			len = this_len;
		}

		partial[page_nr].offset = loff;
		partial[page_nr].len = this_len;
		len -= this_len;
		loff = 0;
		spd.nr_pages++;
		index++;
	}

	/*
	 * Release any pages at the end, if we quit early. 'page_nr' is how far
	 * we got, 'nr_pages' is how many pages are in the map.
	 */
	while (page_nr < nr_pages)
		page_cache_release(pages[page_nr++]);

	if (spd.nr_pages)
		return splice_to_pipe(pipe, &spd);

	return error;
}

/**
 * generic_file_splice_read - splice data from file to a pipe
 * @in:		file to splice from
 * @ppos:	position in @in
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will read pages from given file and fill them into a pipe. Can be
 *    used as long as the address_space operations for the source implement
 *    a readpage() hook.
 *
 */
ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
				 struct pipe_inode_info *pipe, size_t len,
				 unsigned int flags)
{
	ssize_t spliced;
	int ret;
	loff_t isize, left;

	isize = i_size_read(in->f_mapping->host);
	if (unlikely(*ppos >= isize))
		return 0;

	left = isize - *ppos;
	if (unlikely(left < len))
		len = left;

	ret = 0;
	spliced = 0;
	while (len) {
		ret = __generic_file_splice_read(in, ppos, pipe, len, flags);

		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}

		*ppos += ret;
		len -= ret;
		spliced += ret;
	}

	if (spliced)
		return spliced;

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_read);

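/*
 * Usage sketch (illustrative, not part of this file): a filesystem opts
 * into splice-from-file by wiring the generic helper into its
 * file_operations, roughly as ext2 and friends do; "myfs" is hypothetical:
 *
 *	const struct file_operations myfs_file_operations = {
 *		.read		= do_sync_read,
 *		.splice_read	= generic_file_splice_read,
 *		...
 *	};
 *
 * Any file whose address_space implements ->readpage() qualifies.
 */
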
/*
 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 * using sendpage(). Return the number of bytes sent.
 */
static int pipe_to_sendpage(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, struct splice_desc *sd)
{
	struct file *file = sd->u.file;
	loff_t pos = sd->pos;
	int ret, more;

	ret = buf->ops->confirm(pipe, buf);
	if (!ret) {
		more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;

		ret = file->f_op->sendpage(file, buf->page, buf->offset,
					   sd->len, &pos, more);
	}

	return ret;
}

/*
 * This is a little more tricky than the file -> pipe splicing. There are
 * basically three cases:
 *
 *	- Destination page already exists in the address space and there
 *	  are users of it. For that case we have no option other than
 *	  copying the data. Tough luck.
 *	- Destination page already exists in the address space, but there
 *	  are no users of it. Make sure it's uptodate, then drop it. Fall
 *	  through to last case.
 *	- Destination page does not exist, we can add the pipe page to
 *	  the page cache and avoid the copy.
 *
 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 * sd->flags), we attempt to migrate pages from the pipe to the output
 * file address space page cache. This is possible if no one else has
 * the pipe page referenced outside of the pipe and page cache. If
 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 * a new page in the output file page cache and fill/dirty that.
 */
static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	struct file *file = sd->u.file;
	struct address_space *mapping = file->f_mapping;
	unsigned int offset, this_len;
	struct page *page;
	pgoff_t index;
	int ret;

	/*
	 * make sure the data in this buffer is uptodate
	 */
	ret = buf->ops->confirm(pipe, buf);
	if (unlikely(ret))
		return ret;

	index = sd->pos >> PAGE_CACHE_SHIFT;
	offset = sd->pos & ~PAGE_CACHE_MASK;

	this_len = sd->len;
	if (this_len + offset > PAGE_CACHE_SIZE)
		this_len = PAGE_CACHE_SIZE - offset;

find_page:
	page = find_lock_page(mapping, index);
	if (!page) {
		ret = -ENOMEM;
		page = page_cache_alloc_cold(mapping);
		if (unlikely(!page))
			goto out_ret;

		/*
		 * This will also lock the page
		 */
		ret = add_to_page_cache_lru(page, mapping, index,
					    GFP_KERNEL);
		if (unlikely(ret)) {
			/*
			 * The page is not locked if add_to_page_cache_lru()
			 * failed, so don't go through the unlock path.
			 */
			page_cache_release(page);
			goto out_ret;
		}
	}

	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
	if (unlikely(ret)) {
		loff_t isize = i_size_read(mapping->host);

		if (ret != AOP_TRUNCATED_PAGE)
			unlock_page(page);
		page_cache_release(page);
		if (ret == AOP_TRUNCATED_PAGE)
			goto find_page;

		/*
		 * prepare_write() may have instantiated a few blocks
		 * outside i_size. Trim these off again.
		 */
		if (sd->pos + this_len > isize)
			vmtruncate(mapping->host, isize);

		goto out_ret;
	}

	if (buf->page != page) {
		/*
		 * Careful, ->map() uses KM_USER0!
		 */
		char *src = buf->ops->map(pipe, buf, 1);
		char *dst = kmap_atomic(page, KM_USER1);

		memcpy(dst + offset, src + buf->offset, this_len);
		flush_dcache_page(page);
		kunmap_atomic(dst, KM_USER1);
		buf->ops->unmap(pipe, buf, src);
	}

	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
	if (ret) {
		if (ret == AOP_TRUNCATED_PAGE) {
			page_cache_release(page);
			goto find_page;
		}
		if (ret < 0)
			goto out;
		/*
		 * Partial write has happened, so 'ret' is already
		 * initialized to the number of bytes written. There is
		 * nothing we have to do here.
		 */
	} else
		ret = this_len;
	/*
	 * Return the number of bytes written and mark page as
	 * accessed, we are now done!
	 */
	mark_page_accessed(page);
out:
	page_cache_release(page);
	unlock_page(page);
out_ret:
	return ret;
}

/**
 * __splice_from_pipe - splice data from a pipe to given actor
 * @pipe:	pipe to splice from
 * @sd:		information to @actor
 * @actor:	handler that splices the data
 *
 * Description:
 *    This function does little more than loop over the pipe and call
 *    @actor to do the actual moving of a single struct pipe_buffer to
 *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
 *    pipe_to_user.
 *
 */
ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
			   splice_actor *actor)
{
	int ret, do_wakeup, err;

	ret = 0;
	do_wakeup = 0;

	for (;;) {
		if (pipe->nrbufs) {
			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
			const struct pipe_buf_operations *ops = buf->ops;

			sd->len = buf->len;
			if (sd->len > sd->total_len)
				sd->len = sd->total_len;

			err = actor(pipe, buf, sd);
			if (err <= 0) {
				if (!ret && err != -ENODATA)
					ret = err;

				break;
			}

			ret += err;
			buf->offset += err;
			buf->len -= err;

			sd->len -= err;
			sd->pos += err;
			sd->total_len -= err;
			if (sd->len)
				continue;

			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
				pipe->nrbufs--;
				if (pipe->inode)
					do_wakeup = 1;
			}

			if (!sd->total_len)
				break;
		}

		if (pipe->nrbufs)
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (ret)
				break;
		}

		if (sd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
			do_wakeup = 0;
		}

		pipe_wait(pipe);
	}

	if (do_wakeup) {
		smp_mb();
		if (waitqueue_active(&pipe->wait))
			wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}

	return ret;
}
EXPORT_SYMBOL(__splice_from_pipe);

/**
 * splice_from_pipe - splice data from a pipe to a file
 * @pipe:	pipe to splice from
 * @out:	file to splice to
 * @ppos:	position in @out
 * @len:	how many bytes to splice
 * @flags:	splice modifier flags
 * @actor:	handler that splices the data
 *
 * Description:
 *    See __splice_from_pipe. This function locks the input and output inodes,
 *    otherwise it's identical to __splice_from_pipe().
 *
 */
ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags,
			 splice_actor *actor)
{
	ssize_t ret;
	struct inode *inode = out->f_mapping->host;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};

	/*
	 * The actor worker might be calling ->prepare_write and
	 * ->commit_write. Most of the time, these expect i_mutex to
	 * be held. Since this may result in an ABBA deadlock with
	 * pipe->inode, we have to order lock acquiry here.
	 */
	inode_double_lock(inode, pipe->inode);
	ret = __splice_from_pipe(pipe, &sd, actor);
	inode_double_unlock(inode, pipe->inode);

	return ret;
}

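/*
 * Usage sketch (illustrative, not part of this file): a custom actor
 * plugged into splice_from_pipe(). The actor is handed one pipe_buffer at
 * a time and returns the number of bytes consumed or a negative error;
 * the "my_" names are hypothetical:
 *
 *	static int my_actor(struct pipe_inode_info *pipe,
 *			    struct pipe_buffer *buf, struct splice_desc *sd)
 *	{
 *		int ret = buf->ops->confirm(pipe, buf);
 *		if (unlikely(ret))
 *			return ret;
 *		// consume up to sd->len bytes at buf->page + buf->offset
 *		return sd->len;
 *	}
 *
 *	ret = splice_from_pipe(pipe, out, ppos, len, flags, my_actor);
 *
 * pipe_to_file() and pipe_to_sendpage() above follow exactly this contract.
 */
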
/**
 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes
 * @pipe:	pipe info
 * @out:	file to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file. The caller is responsible
 *    for acquiring i_mutex on both inodes.
 *
 */
ssize_t
generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
				 loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};
	ssize_t ret;
	int err;

	err = remove_suid(out->f_path.dentry);
	if (unlikely(err))
		return err;

	ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
	if (ret > 0) {
		unsigned long nr_pages;

		*ppos += ret;
		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);

			if (err)
				ret = err;
		}
		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write_nolock);

/**
 * generic_file_splice_write - splice data from a pipe to a file
 * @pipe:	pipe info
 * @out:	file to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file.
 *
 */
ssize_t
generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
			  loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;
	int err;

	err = should_remove_suid(out->f_path.dentry);
	if (unlikely(err)) {
		mutex_lock(&inode->i_mutex);
		err = __remove_suid(out->f_path.dentry, err);
		mutex_unlock(&inode->i_mutex);
		if (err)
			return err;
	}

	ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
	if (ret > 0) {
		unsigned long nr_pages;

		*ppos += ret;
		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			mutex_lock(&inode->i_mutex);
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);
			mutex_unlock(&inode->i_mutex);

			if (err)
				ret = err;
		}
		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write);

/**
 * generic_splice_sendpage - splice data from a pipe to a socket
 * @pipe:	pipe to splice from
 * @out:	socket to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will send @len bytes from the pipe to a network socket. No data copying
 *    is involved.
 *
 */
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
				loff_t *ppos, size_t len, unsigned int flags)
{
	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_splice_sendpage);

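/*
 * Note (illustrative): generic_splice_sendpage() is what socket files wire
 * up as their ->splice_write, so pipe -> socket transfers go through
 * ->sendpage() without copying, roughly:
 *
 *	static const struct file_operations socket_file_ops = {
 *		...
 *		.sendpage	= sock_sendpage,
 *		.splice_write	= generic_splice_sendpage,
 *	};
 */
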
/*
 * Attempt to initiate a splice from pipe to file.
 */
static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
			   loff_t *ppos, size_t len, unsigned int flags)
{
	int ret;

	if (unlikely(!out->f_op || !out->f_op->splice_write))
		return -EINVAL;

	if (unlikely(!(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	ret = rw_verify_area(WRITE, out, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	return out->f_op->splice_write(pipe, out, ppos, len, flags);
}

/*
 * Attempt to initiate a splice from a file to a pipe.
 */
static long do_splice_to(struct file *in, loff_t *ppos,
			 struct pipe_inode_info *pipe, size_t len,
			 unsigned int flags)
{
	int ret;

	if (unlikely(!in->f_op || !in->f_op->splice_read))
		return -EINVAL;

	if (unlikely(!(in->f_mode & FMODE_READ)))
		return -EBADF;

	ret = rw_verify_area(READ, in, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	return in->f_op->splice_read(in, ppos, pipe, len, flags);
}

/**
 * splice_direct_to_actor - splices data directly between two non-pipes
 * @in:		file to splice from
 * @sd:		actor information on where to splice to
 * @actor:	handles the data splicing
 *
 * Description:
 *    This is a special case helper to splice directly between two
 *    points, without requiring an explicit pipe. Internally an allocated
 *    pipe is cached in the process, and reused during the lifetime of
 *    that process.
 *
 */
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
			       splice_direct_actor *actor)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	umode_t i_mode;
	size_t len;
	int i, flags;

	/*
	 * We require the input to be a regular file, as we don't want to
	 * randomly drop data for e.g. socket -> socket splicing. Use the
	 * piped splicing for that!
	 */
	i_mode = in->f_path.dentry->d_inode->i_mode;
	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
		return -EINVAL;

	/*
	 * neither in nor out is a pipe, setup an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info(NULL);
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	ret = 0;
	bytes = 0;
	len = sd->total_len;
	flags = sd->flags;

	/*
	 * Don't block on output, we have to drain the direct pipe.
	 */
	sd->flags &= ~SPLICE_F_NONBLOCK;

	while (len) {
		size_t read_len, max_read_len;

		/*
		 * Do at most PIPE_BUFFERS pages worth of transfer:
		 */
		max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE));

		ret = do_splice_to(in, &sd->pos, pipe, max_read_len, flags);
		if (unlikely(ret < 0))
			goto out_release;

		read_len = ret;
		sd->total_len = read_len;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = actor(pipe, sd);
		if (unlikely(ret < 0))
			goto out_release;

		bytes += ret;
		len -= ret;

		/*
		 * In nonblocking mode, if we got back a short read then
		 * that was due to either an IO error or due to the
		 * pagecache entry not being there. In the IO error case
		 * the _next_ splice attempt will produce a clean IO error
		 * return value (not a short read), so in both cases it's
		 * correct to break out of the loop here:
		 */
		if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len))
			break;
	}

	pipe->nrbufs = pipe->curbuf = 0;

	return bytes;

out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < PIPE_BUFFERS; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;

		if (buf->ops) {
			buf->ops->release(pipe, buf);
			buf->ops = NULL;
		}
	}
	pipe->nrbufs = pipe->curbuf = 0;

	/*
	 * If we transferred some data, return the number of bytes:
	 */
	if (bytes > 0)
		return bytes;

	return ret;
}
EXPORT_SYMBOL(splice_direct_to_actor);

static int direct_splice_actor(struct pipe_inode_info *pipe,
			       struct splice_desc *sd)
{
	struct file *file = sd->u.file;

	return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags);
}

/**
 * do_splice_direct - splices data directly between two files
 * @in:		file to splice from
 * @ppos:	input file offset
 * @out:	file to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    For use by do_sendfile(). splice can easily emulate sendfile, but
 *    doing it in the application would incur an extra system call
 *    (splice in + splice out, as compared to just sendfile()). So this helper
 *    can splice directly through a process-private pipe.
 *
 */
long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
		      size_t len, unsigned int flags)
{
	struct splice_desc sd = {
		.len		= len,
		.total_len	= len,
		.flags		= flags,
		.pos		= *ppos,
		.u.file		= out,
	};
	long ret;

	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
	*ppos = sd.pos;
	return ret;
}

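/*
 * Usage sketch (illustrative, not part of this file): what do_splice_direct()
 * saves over doing the same in userspace, which needs an explicit pipe and
 * two system calls per chunk; CHUNK and the fds are hypothetical:
 *
 *	int p[2];
 *	pipe(p);
 *	for (;;) {
 *		ssize_t n = splice(in_fd, NULL, p[1], NULL, CHUNK, 0);
 *		if (n <= 0)
 *			break;
 *		splice(p[0], NULL, out_fd, NULL, n, 0);
 *	}
 *
 * The in-kernel variant reuses current->splice_pipe instead, keeping
 * sendfile() at a single syscall. (Error handling elided.)
 */
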
/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
static inline struct pipe_inode_info *pipe_info(struct inode *inode)
{
	if (S_ISFIFO(inode->i_mode))
		return inode->i_pipe;

	return NULL;
}

/*
 * Determine where to splice to/from.
 */
static long do_splice(struct file *in, loff_t __user *off_in,
		      struct file *out, loff_t __user *off_out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	loff_t offset, *off;
	long ret;

	pipe = pipe_info(in->f_path.dentry->d_inode);
	if (pipe) {
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			if (out->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &out->f_pos;

		ret = do_splice_from(pipe, out, off, len, flags);

		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	pipe = pipe_info(out->f_path.dentry->d_inode);
	if (pipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			if (in->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &in->f_pos;

		ret = do_splice_to(in, off, pipe, len, flags);

		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	return -EINVAL;
}

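/*
 * Usage sketch (illustrative, not part of this file): the off_in/off_out
 * handling above enables writing pipe data at an explicit file offset
 * without disturbing the file's position; fd names are hypothetical:
 *
 *	loff_t off = 4096;
 *	splice(pipe_rd, NULL, out_fd, &off, 8192, 0);
 *	// 'off' has been advanced by the number of bytes spliced,
 *	// out_fd's f_pos is untouched
 */
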
/*
 * Map an iov into an array of pages and offset/length tuples. With the
 * partial_page structure, we can map several non-contiguous ranges into
 * our one pages[] map instead of splitting that operation into pieces.
 * Could easily be exported as a generic helper for other users, in which
 * case one would probably want to add a 'max_nr_pages' parameter as well.
 */
static int get_iovec_page_array(const struct iovec __user *iov,
				unsigned int nr_vecs, struct page **pages,
				struct partial_page *partial, int aligned)
{
	int buffers = 0, error = 0;

	/*
	 * It's ok to take the mmap_sem for reading, even
	 * across a "get_user()".
	 */
	down_read(&current->mm->mmap_sem);

	while (nr_vecs) {
		unsigned long off, npages;
		void __user *base;
		size_t len;
		int i;

		/*
		 * Get user address base and length for this iovec.
		 */
		error = get_user(base, &iov->iov_base);
		if (unlikely(error))
			break;
		error = get_user(len, &iov->iov_len);
		if (unlikely(error))
			break;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		if (unlikely(!len))
			break;
		error = -EFAULT;
		if (unlikely(!base))
			break;

		/*
		 * Get this base offset and number of pages, then map
		 * in the user pages.
		 */
		off = (unsigned long) base & ~PAGE_MASK;

		/*
		 * If asked for alignment, the offset must be zero and the
		 * length a multiple of the PAGE_SIZE.
		 */
		error = -EINVAL;
		if (aligned && (off || len & ~PAGE_MASK))
			break;

		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (npages > PIPE_BUFFERS - buffers)
			npages = PIPE_BUFFERS - buffers;

		error = get_user_pages(current, current->mm,
				       (unsigned long) base, npages, 0, 0,
				       &pages[buffers], NULL);

		if (unlikely(error <= 0))
			break;

		/*
		 * Fill this contiguous range into the partial page map.
		 */
		for (i = 0; i < error; i++) {
			const int plen = min_t(size_t, len, PAGE_SIZE - off);

			partial[buffers].offset = off;
			partial[buffers].len = plen;

			off = 0;
			len -= plen;
			buffers++;
		}

		/*
		 * We didn't complete this iov, stop here since it probably
		 * means we have to move some of this into a pipe to
		 * be able to continue.
		 */
		if (len)
			break;

		/*
		 * Don't continue if we mapped fewer pages than we asked for,
		 * or if we mapped the max number of pages that we have
		 * room for.
		 */
		if (error < npages || buffers == PIPE_BUFFERS)
			break;

		nr_vecs--;
		iov++;
	}

	up_read(&current->mm->mmap_sem);

	if (buffers)
		return buffers;

	return error;
}

static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	char *src;
	int ret;

	ret = buf->ops->confirm(pipe, buf);
	if (unlikely(ret))
		return ret;

	/*
	 * See if we can use the atomic maps, by prefaulting in the
	 * pages and doing an atomic copy
	 */
	if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
		src = buf->ops->map(pipe, buf, 1);
		ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
							sd->len);
		buf->ops->unmap(pipe, buf, src);
		if (!ret) {
			ret = sd->len;
			goto out;
		}
	}

	/*
	 * No dice, use slow non-atomic map and copy
	 */
	src = buf->ops->map(pipe, buf, 0);

	ret = sd->len;
	if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
		ret = -EFAULT;

	/*
	 * Unmap before the common exit; the atomic path above has
	 * already dropped its mapping.
	 */
	buf->ops->unmap(pipe, buf, src);
out:
	if (ret > 0)
		sd->u.userptr += ret;
	return ret;
}

/*
 * For lack of a better implementation, implement vmsplice() to userspace
 * as a simple copy of the pipe's pages to the user iov.
 */
static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct splice_desc sd;
	ssize_t size;
	int error;
	long ret;

	pipe = pipe_info(file->f_path.dentry->d_inode);
	if (!pipe)
		return -EBADF;

	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	error = ret = 0;
	while (nr_segs) {
		void __user *base;
		size_t len;

		/*
		 * Get user address base and length for this iovec.
		 */
		error = get_user(base, &iov->iov_base);
		if (unlikely(error))
			break;
		error = get_user(len, &iov->iov_len);
		if (unlikely(error))
			break;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		if (unlikely(!len))
			break;
		if (unlikely(!base)) {
			error = -EFAULT;
			break;
		}

		sd.len = 0;
		sd.total_len = len;
		sd.flags = flags;
		sd.u.userptr = base;
		sd.pos = 0;

		size = __splice_from_pipe(pipe, &sd, pipe_to_user);
		if (size < 0) {
			if (!ret)
				ret = size;

			break;
		}

		ret += size;

		if (size < len)
			break;

		nr_segs--;
		iov++;
	}

	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);

	if (!ret)
		ret = error;

	return ret;
}

/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
 */
static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &user_page_pipe_buf_ops,
	};

	pipe = pipe_info(file->f_path.dentry->d_inode);
	if (!pipe)
		return -EBADF;

	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
					    flags & SPLICE_F_GIFT);
	if (spd.nr_pages <= 0)
		return spd.nr_pages;

	return splice_to_pipe(pipe, &spd);
}

/*
 * Note that vmsplice only really supports true splicing _from_ user memory
 * to a pipe, not the other way around. Splicing from user memory is a simple
 * operation that can be supported without any funky alignment restrictions
 * or nasty vm tricks. We simply map in the user memory and fill it into
 * a pipe. The reverse isn't quite as easy, though. There are two possible
 * solutions for that:
 *
 *	- memcpy() the data internally, at which point we might as well just
 *	  do a regular read() on the buffer anyway.
 *	- Lots of nasty vm tricks, that are neither fast nor flexible (they
 *	  impose restrictions on both ends of the pipe).
 *
 * Currently we punt and implement it as a normal copy, see pipe_to_user().
 *
 */
asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct file *file;
	long error;
	int fput;

	if (unlikely(nr_segs > UIO_MAXIOV))
		return -EINVAL;
	else if (unlikely(!nr_segs))
		return 0;

	error = -EBADF;
	file = fget_light(fd, &fput);
	if (file) {
		if (file->f_mode & FMODE_WRITE)
			error = vmsplice_to_pipe(file, iov, nr_segs, flags);
		else if (file->f_mode & FMODE_READ)
			error = vmsplice_to_user(file, iov, nr_segs, flags);

		fput_light(file, fput);
	}

	return error;
}

asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
			   int fd_out, loff_t __user *off_out,
			   size_t len, unsigned int flags)
{
	long error;
	struct file *in, *out;
	int fput_in, fput_out;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fd_in, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			out = fget_light(fd_out, &fput_out);
			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_splice(in, off_in,
							  out, off_out,
							  len, flags);
				fput_light(out, fput_out);
			}
		}

		fput_light(in, fput_in);
	}

	return error;
}

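/*
 * Usage sketch (illustrative, not part of this file) of the two system
 * calls above; fd names are hypothetical and error handling is elided:
 *
 *	// move file data into the pipe, then gift a user buffer behind it
 *	struct iovec iov = { .iov_base = buf, .iov_len = buflen };
 *
 *	splice(file_fd, NULL, pipe_wr, NULL, 65536, 0);
 *	vmsplice(pipe_wr, &iov, 1, SPLICE_F_GIFT);
 *
 *	// drain everything to a socket without copying in userspace
 *	splice(pipe_rd, NULL, sock_fd, NULL, 65536 + buflen, SPLICE_F_MORE);
 *
 * With SPLICE_F_GIFT the gifted pages may later be stolen (see
 * user_page_pipe_buf_steal() above), so userspace must not touch the
 * buffer again.
 */
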
/*
 * Make sure there's data to read. Wait for input if we can, otherwise
 * return an appropriate error.
 */
static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is ok.
	 */
	if (pipe->nrbufs)
		return 0;

	ret = 0;
	mutex_lock(&pipe->inode->i_mutex);

	while (!pipe->nrbufs) {
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		pipe_wait(pipe);
	}

	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

/*
 * Make sure there's writeable room. Wait for room if we can, otherwise
 * return an appropriate error.
 */
static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is ok.
	 */
	if (pipe->nrbufs < PIPE_BUFFERS)
		return 0;

	ret = 0;
	mutex_lock(&pipe->inode->i_mutex);

	while (pipe->nrbufs >= PIPE_BUFFERS) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			ret = -EPIPE;
			break;
		}
		if (flags & SPLICE_F_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

/*
 * Link contents of ipipe to opipe.
 */
static int link_pipe(struct pipe_inode_info *ipipe,
		     struct pipe_inode_info *opipe,
		     size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	int ret = 0, i = 0, nbuf;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by inode address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	inode_double_lock(ipipe->inode, opipe->inode);

	do {
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		/*
		 * If we have iterated all input buffers or ran out of
		 * output room, break.
		 */
		if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
			break;

		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
		nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);

		/*
		 * Get a reference to this pipe buffer,
		 * so we can copy the contents over.
		 */
		ibuf->ops->get(ipipe, ibuf);

		obuf = opipe->bufs + nbuf;
		*obuf = *ibuf;

		/*
		 * Don't inherit the gift flag, we need to
		 * prevent multiple steals of this page.
		 */
		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;

		if (obuf->len > len)
			obuf->len = len;

		opipe->nrbufs++;
		ret += obuf->len;
		len -= obuf->len;
		i++;
	} while (len);

	inode_double_unlock(ipipe->inode, opipe->inode);

	/*
	 * If we put data in the output pipe, wake up any potential readers.
	 */
	if (ret > 0) {
		smp_mb();
		if (waitqueue_active(&opipe->wait))
			wake_up_interruptible(&opipe->wait);
		kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
	}

	return ret;
}

/*
 * This is a tee(1) implementation that works on pipes. It doesn't copy
 * any data, it simply references the 'in' pages on the 'out' pipe.
 * The 'flags' used are the SPLICE_F_* variants, currently the only
 * applicable one is SPLICE_F_NONBLOCK.
 */
static long do_tee(struct file *in, struct file *out, size_t len,
		   unsigned int flags)
{
	struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
	struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
	int ret = -EINVAL;

	/*
	 * Duplicate the contents of ipipe to opipe without actually
	 * copying the data.
	 */
	if (ipipe && opipe && ipipe != opipe) {
		/*
		 * Keep going, unless we encounter an error. The ipipe/opipe
		 * ordering doesn't really matter.
		 */
		ret = link_ipipe_prep(ipipe, flags);
		if (!ret) {
			ret = link_opipe_prep(opipe, flags);
			if (!ret) {
				ret = link_pipe(ipipe, opipe, len, flags);
				if (!ret && (flags & SPLICE_F_NONBLOCK))
					ret = -EAGAIN;
			}
		}
	}

	return ret;
}

asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
{
	struct file *in;
	int error, fput_in;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fdin, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			int fput_out;
			struct file *out = fget_light(fdout, &fput_out);

			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_tee(in, out, len, flags);
				fput_light(out, fput_out);
			}
		}
		fput_light(in, fput_in);
	}

	return error;
}

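/*
 * Usage sketch (illustrative, not part of this file): the canonical
 * userspace use of tee(2), duplicating stdin to stdout while also splicing
 * it to a file. This assumes stdin and stdout are both pipes; "file_fd" is
 * a hypothetical descriptor and error handling is elided:
 *
 *	for (;;) {
 *		ssize_t n = tee(STDIN_FILENO, STDOUT_FILENO, INT_MAX, 0);
 *		if (n <= 0)
 *			break;
 *		splice(STDIN_FILENO, NULL, file_fd, NULL, n, SPLICE_F_MOVE);
 *	}
 *
 * tee() consumes no input, so the subsequent splice() drains the same
 * bytes that were just duplicated to stdout.
 */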