/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files, network, direct splicing, etc, and
 * fixing lots of bugs.
 *
 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
 *
 */
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/uio.h>
#include <linux/security.h>
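
/*
 * Userspace view (illustrative sketch, not part of this file): the common
 * pattern is to shuttle data through a pipe without ever copying it into
 * user memory, e.g. file -> pipe -> socket:
 *
 *	int pfd[2];
 *
 *	pipe(pfd);
 *	n = splice(file_fd, NULL, pfd[1], NULL, 65536,
 *		   SPLICE_F_MOVE | SPLICE_F_MORE);
 *	splice(pfd[0], NULL, sock_fd, NULL, n, SPLICE_F_MOVE | SPLICE_F_MORE);
 *
 * Error handling and short transfers are omitted; sys_splice() below is
 * the entry point that services both calls.
 */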

/*
 * Attempt to steal a page from a pipe buffer. This should perhaps go into
 * a vm helper function, it's already simplified quite a bit by the
 * addition of remove_mapping(). If success is returned, the caller may
 * attempt to reuse this page for another destination.
 */
static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	struct address_space *mapping;

	lock_page(page);

	mapping = page_mapping(page);
	if (mapping) {
		WARN_ON(!PageUptodate(page));

		/*
		 * At least for ext2 with nobh option, we need to wait on
		 * writeback completing on this page, since we'll remove it
		 * from the pagecache. Otherwise truncate won't wait on the
		 * page, allowing the disk blocks to be reused by someone else
		 * before we actually wrote our data to them. fs corruption
		 * ensues.
		 */
		wait_on_page_writeback(page);

		if (PagePrivate(page))
			try_to_release_page(page, GFP_KERNEL);

		/*
		 * If we succeeded in removing the mapping, set LRU flag
		 * and return good.
		 */
		if (remove_mapping(mapping, page)) {
			buf->flags |= PIPE_BUF_FLAG_LRU;
			return 0;
		}
	}

	/*
	 * Raced with truncate or failed to remove page from current
	 * address space, unlock and return failure.
	 */
	unlock_page(page);
	return 1;
}

static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
					struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
	buf->flags &= ~PIPE_BUF_FLAG_LRU;
}

/*
 * Check whether the contents of buf is OK to access. Since the content
 * is a page cache page, IO may be in flight.
 */
static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
				       struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	int err;

	if (!PageUptodate(page)) {
		lock_page(page);

		/*
		 * Page got truncated/unhashed. This will cause a 0-byte
		 * splice, if this is the first page.
		 */
		if (!page->mapping) {
			err = -ENODATA;
			goto error;
		}

		/*
		 * Uh oh, read-error from disk.
		 */
		if (!PageUptodate(page)) {
			err = -EIO;
			goto error;
		}

		/*
		 * Page is ok after all, we are done.
		 */
		unlock_page(page);
	}

	return 0;
error:
	unlock_page(page);
	return err;
}

static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = page_cache_pipe_buf_confirm,
	.release = page_cache_pipe_buf_release,
	.steal = page_cache_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
		return 1;

	buf->flags |= PIPE_BUF_FLAG_LRU;
	return generic_pipe_buf_steal(pipe, buf);
}

static const struct pipe_buf_operations user_page_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = page_cache_pipe_buf_release,
	.steal = user_page_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

/**
 * splice_to_pipe - fill passed data into a pipe
 * @pipe: pipe to fill
 * @spd: data to fill
 *
 * Description:
 *    @spd contains a map of pages and len/offset tuples, along with
 *    the struct pipe_buf_operations associated with these pages. This
 *    function will link that data to the pipe.
 *
 */
ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
		       struct splice_pipe_desc *spd)
{
	unsigned int spd_pages = spd->nr_pages;
	int ret, do_wakeup, page_nr;

	ret = 0;
	do_wakeup = 0;
	page_nr = 0;

	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		if (pipe->nrbufs < PIPE_BUFFERS) {
			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;

			buf->page = spd->pages[page_nr];
			buf->offset = spd->partial[page_nr].offset;
			buf->len = spd->partial[page_nr].len;
			buf->private = spd->partial[page_nr].private;
			buf->ops = spd->ops;
			if (spd->flags & SPLICE_F_GIFT)
				buf->flags |= PIPE_BUF_FLAG_GIFT;

			pipe->nrbufs++;
			page_nr++;
			ret += buf->len;

			if (pipe->inode)
				do_wakeup = 1;

			if (!--spd->nr_pages)
				break;
			if (pipe->nrbufs < PIPE_BUFFERS)
				continue;

			break;
		}

		if (spd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}

		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	if (pipe->inode) {
		mutex_unlock(&pipe->inode->i_mutex);

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		}
	}

	while (page_nr < spd_pages)
		page_cache_release(spd->pages[page_nr++]);

	return ret;
}
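
/*
 * A minimal sketch of how a caller feeds splice_to_pipe(), assuming a
 * single page the caller already holds a reference on ('page', 'this_len'
 * and 'flags' are illustrative locals; __generic_file_splice_read() below
 * is the real in-tree user):
 *
 *	struct page *pages[1] = { page };
 *	struct partial_page partial[1] = {{ .offset = 0, .len = this_len }};
 *	struct splice_pipe_desc spd = {
 *		.pages = pages,
 *		.partial = partial,
 *		.nr_pages = 1,
 *		.flags = flags,
 *		.ops = &page_cache_pipe_buf_ops,
 *	};
 *
 *	return splice_to_pipe(pipe, &spd);
 *
 * splice_to_pipe() hands the linked page references over to the pipe and
 * releases any pages it could not fit, so the caller must hold one
 * reference per page in spd.pages[].
 */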

static int
__generic_file_splice_read(struct file *in, loff_t *ppos,
			   struct pipe_inode_info *pipe, size_t len,
			   unsigned int flags)
{
	struct address_space *mapping = in->f_mapping;
	unsigned int loff, nr_pages, req_pages;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct page *page;
	pgoff_t index, end_index;
	loff_t isize;
	int error, page_nr;
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &page_cache_pipe_buf_ops,
	};

	index = *ppos >> PAGE_CACHE_SHIFT;
	loff = *ppos & ~PAGE_CACHE_MASK;
	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS);

	/*
	 * Lookup the (hopefully) full range of pages we need.
	 */
	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
	index += spd.nr_pages;

	/*
	 * If find_get_pages_contig() returned fewer pages than we needed,
	 * readahead/allocate the rest and fill in the holes.
	 */
	if (spd.nr_pages < nr_pages)
		page_cache_sync_readahead(mapping, &in->f_ra, in,
					  index, req_pages - spd.nr_pages);

	error = 0;
	while (spd.nr_pages < nr_pages) {
		/*
		 * Page could be there, find_get_pages_contig() breaks on
		 * the first hole.
		 */
		page = find_get_page(mapping, index);
		if (!page) {
			/*
			 * page didn't exist, allocate one.
			 */
			page = page_cache_alloc_cold(mapping);
			if (!page)
				break;

			error = add_to_page_cache_lru(page, mapping, index,
						      GFP_KERNEL);
			if (unlikely(error)) {
				page_cache_release(page);
				if (error == -EEXIST)
					continue;
				break;
			}
			/*
			 * add_to_page_cache() locks the page, unlock it
			 * to avoid convoluting the logic below even more.
			 */
			unlock_page(page);
		}

		pages[spd.nr_pages++] = page;
		index++;
	}

	/*
	 * Now loop over the map and see if we need to start IO on any
	 * pages, fill in the partial map, etc.
	 */
	index = *ppos >> PAGE_CACHE_SHIFT;
	nr_pages = spd.nr_pages;
	spd.nr_pages = 0;
	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
		unsigned int this_len;

		if (!len)
			break;

		/*
		 * this_len is the max we'll use from this page
		 */
		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
		page = pages[page_nr];

		if (PageReadahead(page))
			page_cache_async_readahead(mapping, &in->f_ra, in,
						   page, index, req_pages - page_nr);

		/*
		 * If the page isn't uptodate, we may need to start io on it
		 */
		if (!PageUptodate(page)) {
			/*
			 * If in nonblock mode then don't block on waiting
			 * for an in-flight io page
			 */
			if (flags & SPLICE_F_NONBLOCK) {
				if (TestSetPageLocked(page))
					break;
			} else
				lock_page(page);

			/*
			 * page was truncated, stop here. if this isn't the
			 * first page, we'll just complete what we already
			 * added
			 */
			if (!page->mapping) {
				unlock_page(page);
				break;
			}
			/*
			 * page was already under io and is now done, great
			 */
			if (PageUptodate(page)) {
				unlock_page(page);
				goto fill_it;
			}

			/*
			 * need to read in the page
			 */
			error = mapping->a_ops->readpage(in, page);
			if (unlikely(error)) {
				/*
				 * We really should re-lookup the page here,
				 * but it complicates things a lot. Instead
				 * lets just do what we already stored, and
				 * we'll get it the next time we are called.
				 */
				if (error == AOP_TRUNCATED_PAGE)
					error = 0;

				break;
			}
		}
fill_it:
		/*
		 * i_size must be checked after PageUptodate.
		 */
		isize = i_size_read(mapping->host);
		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
		if (unlikely(!isize || index > end_index))
			break;

		/*
		 * if this is the last page, see if we need to shrink
		 * the length and stop
		 */
		if (end_index == index) {
			unsigned int plen;

			/*
			 * max good bytes in this page
			 */
			plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (plen <= loff)
				break;

			/*
			 * force quit after adding this page
			 */
			this_len = min(this_len, plen - loff);
			len = this_len;
		}

		partial[page_nr].offset = loff;
		partial[page_nr].len = this_len;
		len -= this_len;
		loff = 0;
		spd.nr_pages++;
		index++;
	}

	/*
	 * Release any pages at the end, if we quit early. 'page_nr' is how far
	 * we got, 'nr_pages' is how many pages are in the map.
	 */
	while (page_nr < nr_pages)
		page_cache_release(pages[page_nr++]);
	in->f_ra.prev_index = index;

	if (spd.nr_pages)
		return splice_to_pipe(pipe, &spd);

	return error;
}

/**
 * generic_file_splice_read - splice data from file to a pipe
 * @in: file to splice from
 * @ppos: position in @in
 * @pipe: pipe to splice to
 * @len: number of bytes to splice
 * @flags: splice modifier flags
 *
 * Description:
 *    Will read pages from the given file and fill them into a pipe. Can be
 *    used as long as the address_space operations for the source implement
 *    a readpage() hook.
 *
 */
ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
				 struct pipe_inode_info *pipe, size_t len,
				 unsigned int flags)
{
	ssize_t spliced;
	int ret;
	loff_t isize, left;

	isize = i_size_read(in->f_mapping->host);
	if (unlikely(*ppos >= isize))
		return 0;

	left = isize - *ppos;
	if (unlikely(left < len))
		len = left;

	ret = 0;
	spliced = 0;
	while (len && !spliced) {
		ret = __generic_file_splice_read(in, ppos, pipe, len, flags);

		if (ret < 0)
			break;
		else if (!ret) {
			if (spliced)
				break;
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}

		*ppos += ret;
		len -= ret;
		spliced += ret;
	}

	if (spliced)
		return spliced;

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_read);

/*
 * Send 'sd->len' bytes to socket from 'sd->u.file' at position 'sd->pos'
 * using sendpage(). Return the number of bytes sent.
 */
static int pipe_to_sendpage(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, struct splice_desc *sd)
{
	struct file *file = sd->u.file;
	loff_t pos = sd->pos;
	int ret, more;

	ret = buf->ops->confirm(pipe, buf);
	if (!ret) {
		more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;

		ret = file->f_op->sendpage(file, buf->page, buf->offset,
					   sd->len, &pos, more);
	}

	return ret;
}
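
/*
 * A filesystem typically hooks the exported splice helpers straight into
 * its file_operations. Sketch for a hypothetical filesystem (the names
 * are illustrative; generic_file_splice_write() is defined further down
 * in this file):
 *
 *	const struct file_operations examplefs_file_operations = {
 *		.read		= do_sync_read,
 *		.write		= do_sync_write,
 *		.splice_read	= generic_file_splice_read,
 *		.splice_write	= generic_file_splice_write,
 *	};
 */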

/*
 * This is a little more tricky than the file -> pipe splicing. There are
 * basically three cases:
 *
 *	- Destination page already exists in the address space and there
 *	  are users of it. For that case we have no other option than
 *	  copying the data. Tough luck.
 *	- Destination page already exists in the address space, but there
 *	  are no users of it. Make sure it's uptodate, then drop it. Fall
 *	  through to last case.
 *	- Destination page does not exist, we can add the pipe page to
 *	  the page cache and avoid the copy.
 *
 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 * sd->flags), we attempt to migrate pages from the pipe to the output
 * file address space page cache. This is possible if no one else has
 * the pipe page referenced outside of the pipe and page cache. If
 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 * a new page in the output file page cache and fill/dirty that.
 */
static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	struct file *file = sd->u.file;
	struct address_space *mapping = file->f_mapping;
	unsigned int offset, this_len;
	struct page *page;
	pgoff_t index;
	int ret;

	/*
	 * make sure the data in this buffer is uptodate
	 */
	ret = buf->ops->confirm(pipe, buf);
	if (unlikely(ret))
		return ret;

	index = sd->pos >> PAGE_CACHE_SHIFT;
	offset = sd->pos & ~PAGE_CACHE_MASK;

	this_len = sd->len;
	if (this_len + offset > PAGE_CACHE_SIZE)
		this_len = PAGE_CACHE_SIZE - offset;

find_page:
	page = find_lock_page(mapping, index);
	if (!page) {
		ret = -ENOMEM;
		page = page_cache_alloc_cold(mapping);
		if (unlikely(!page))
			goto out_ret;

		/*
		 * This will also lock the page
		 */
		ret = add_to_page_cache_lru(page, mapping, index,
					    GFP_KERNEL);
		if (unlikely(ret))
			goto out_release;
	}

	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
	if (unlikely(ret)) {
		loff_t isize = i_size_read(mapping->host);

		if (ret != AOP_TRUNCATED_PAGE)
			unlock_page(page);
		page_cache_release(page);
		if (ret == AOP_TRUNCATED_PAGE)
			goto find_page;

		/*
		 * prepare_write() may have instantiated a few blocks
		 * outside i_size. Trim these off again.
		 */
		if (sd->pos + this_len > isize)
			vmtruncate(mapping->host, isize);

		goto out_ret;
	}

	if (buf->page != page) {
		/*
		 * Careful, ->map() uses KM_USER0!
		 */
		char *src = buf->ops->map(pipe, buf, 1);
		char *dst = kmap_atomic(page, KM_USER1);

		memcpy(dst + offset, src + buf->offset, this_len);
		flush_dcache_page(page);
		kunmap_atomic(dst, KM_USER1);
		buf->ops->unmap(pipe, buf, src);
	}

	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
	if (ret) {
		if (ret == AOP_TRUNCATED_PAGE) {
			page_cache_release(page);
			goto find_page;
		}
		if (ret < 0)
			goto out;
		/*
		 * A partial write has happened, so 'ret' is already
		 * initialized to the number of bytes written. There is
		 * nothing else we have to do here.
		 */
	} else
		ret = this_len;
	/*
	 * Return the number of bytes written and mark page as
	 * accessed, we are now done!
	 */
	mark_page_accessed(page);
out:
	unlock_page(page);
out_release:
	page_cache_release(page);
out_ret:
	return ret;
}

/**
 * __splice_from_pipe - splice data from a pipe to given actor
 * @pipe: pipe to splice from
 * @sd: information to @actor
 * @actor: handler that splices the data
 *
 * Description:
 *    This function does little more than loop over the pipe and call
 *    @actor to do the actual moving of a single struct pipe_buffer to
 *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
 *    pipe_to_user.
 *
 */
ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
			   splice_actor *actor)
{
	int ret, do_wakeup, err;

	ret = 0;
	do_wakeup = 0;

	for (;;) {
		if (pipe->nrbufs) {
			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
			const struct pipe_buf_operations *ops = buf->ops;

			sd->len = buf->len;
			if (sd->len > sd->total_len)
				sd->len = sd->total_len;

			err = actor(pipe, buf, sd);
			if (err <= 0) {
				if (!ret && err != -ENODATA)
					ret = err;

				break;
			}

			ret += err;
			buf->offset += err;
			buf->len -= err;

			sd->len -= err;
			sd->pos += err;
			sd->total_len -= err;
			if (sd->len)
				continue;

			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
				pipe->nrbufs--;
				if (pipe->inode)
					do_wakeup = 1;
			}

			if (!sd->total_len)
				break;
		}

		if (pipe->nrbufs)
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (ret)
				break;
		}

		if (sd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
			do_wakeup = 0;
		}

		pipe_wait(pipe);
	}

	if (do_wakeup) {
		smp_mb();
		if (waitqueue_active(&pipe->wait))
			wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}

	return ret;
}
EXPORT_SYMBOL(__splice_from_pipe);

/**
 * splice_from_pipe - splice data from a pipe to a file
 * @pipe: pipe to splice from
 * @out: file to splice to
 * @ppos: position in @out
 * @len: how many bytes to splice
 * @flags: splice modifier flags
 * @actor: handler that splices the data
 *
 * Description:
 *    See __splice_from_pipe. This function locks the input and output inodes,
 *    otherwise it's identical to __splice_from_pipe().
 *
 */
ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags,
			 splice_actor *actor)
{
	ssize_t ret;
	struct inode *inode = out->f_mapping->host;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};

	/*
	 * The actor worker might be calling ->prepare_write and
	 * ->commit_write. Most of the time, these expect i_mutex to
	 * be held. Since this may result in an ABBA deadlock with
	 * pipe->inode, we have to order lock acquiry here.
	 */
	inode_double_lock(inode, pipe->inode);
	ret = __splice_from_pipe(pipe, &sd, actor);
	inode_double_unlock(inode, pipe->inode);

	return ret;
}
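
/*
 * Sketch of a custom splice actor (hypothetical, for illustration). An
 * actor is handed one pipe_buffer at a time and returns how many bytes
 * it consumed; zero or a negative error stops the loop (-ENODATA is
 * treated as a clean stop). A trivial actor that discards data could
 * look like:
 *
 *	static int pipe_to_null(struct pipe_inode_info *pipe,
 *				struct pipe_buffer *buf,
 *				struct splice_desc *sd)
 *	{
 *		int ret = buf->ops->confirm(pipe, buf);
 *		if (ret)
 *			return ret;
 *		return sd->len;
 *	}
 *
 * and would be driven by splice_from_pipe(pipe, out, ppos, len, flags,
 * pipe_to_null).
 */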

/**
 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes
 * @pipe: pipe info
 * @out: file to write to
 * @ppos: position in @out
 * @len: number of bytes to splice
 * @flags: splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file. The caller is responsible
 *    for acquiring i_mutex on both inodes.
 *
 */
ssize_t
generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
				 loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};
	ssize_t ret;
	int err;

	err = remove_suid(out->f_path.dentry);
	if (unlikely(err))
		return err;

	ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
	if (ret > 0) {
		unsigned long nr_pages;

		*ppos += ret;
		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);

			if (err)
				ret = err;
		}
		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write_nolock);

/**
 * generic_file_splice_write - splice data from a pipe to a file
 * @pipe: pipe info
 * @out: file to write to
 * @ppos: position in @out
 * @len: number of bytes to splice
 * @flags: splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file.
 *
 */
ssize_t
generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
			  loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;
	int err;

	err = should_remove_suid(out->f_path.dentry);
	if (unlikely(err)) {
		mutex_lock(&inode->i_mutex);
		err = __remove_suid(out->f_path.dentry, err);
		mutex_unlock(&inode->i_mutex);
		if (err)
			return err;
	}

	ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
	if (ret > 0) {
		unsigned long nr_pages;

		*ppos += ret;
		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			mutex_lock(&inode->i_mutex);
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);
			mutex_unlock(&inode->i_mutex);

			if (err)
				ret = err;
		}
		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write);

/**
 * generic_splice_sendpage - splice data from a pipe to a socket
 * @pipe: pipe to splice from
 * @out: socket to write to
 * @ppos: position in @out
 * @len: number of bytes to splice
 * @flags: splice modifier flags
 *
 * Description:
 *    Will send @len bytes from the pipe to a network socket. No data copying
 *    is involved.
 *
 */
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
				loff_t *ppos, size_t len, unsigned int flags)
{
	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_splice_sendpage);
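
/*
 * For sockets, a pipe -> socket splice ends up in pipe_to_sendpage()
 * above because net/socket.c wires this helper into the socket
 * file_operations, roughly like so (abridged sketch):
 *
 *	static const struct file_operations socket_file_ops = {
 *		...
 *		.sendpage	= sock_sendpage,
 *		.splice_write	= generic_splice_sendpage,
 *	};
 */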

/*
 * Attempt to initiate a splice from pipe to file.
 */
static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
			   loff_t *ppos, size_t len, unsigned int flags)
{
	int ret;

	if (unlikely(!out->f_op || !out->f_op->splice_write))
		return -EINVAL;

	if (unlikely(!(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	ret = rw_verify_area(WRITE, out, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	ret = security_file_permission(out, MAY_WRITE);
	if (unlikely(ret < 0))
		return ret;

	return out->f_op->splice_write(pipe, out, ppos, len, flags);
}

/*
 * Attempt to initiate a splice from a file to a pipe.
 */
static long do_splice_to(struct file *in, loff_t *ppos,
			 struct pipe_inode_info *pipe, size_t len,
			 unsigned int flags)
{
	int ret;

	if (unlikely(!in->f_op || !in->f_op->splice_read))
		return -EINVAL;

	if (unlikely(!(in->f_mode & FMODE_READ)))
		return -EBADF;

	ret = rw_verify_area(READ, in, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	ret = security_file_permission(in, MAY_READ);
	if (unlikely(ret < 0))
		return ret;

	return in->f_op->splice_read(in, ppos, pipe, len, flags);
}

/**
 * splice_direct_to_actor - splices data directly between two non-pipes
 * @in: file to splice from
 * @sd: actor information on where to splice to
 * @actor: handles the data splicing
 *
 * Description:
 *    This is a special case helper to splice directly between two
 *    points, without requiring an explicit pipe. Internally an allocated
 *    pipe is cached in the process, and reused during the lifetime of
 *    that process.
 *
 */
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
			       splice_direct_actor *actor)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	umode_t i_mode;
	size_t len;
	int i, flags;

	/*
	 * We require the input to be a regular file, as we don't want to
	 * randomly drop data for e.g. socket -> socket splicing. Use the
	 * piped splicing for that!
	 */
	i_mode = in->f_path.dentry->d_inode->i_mode;
	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
		return -EINVAL;

	/*
	 * Neither in nor out is a pipe, set up an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that.
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info(NULL);
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	ret = 0;
	bytes = 0;
	len = sd->total_len;
	flags = sd->flags;

	/*
	 * Don't block on output, we have to drain the direct pipe.
	 */
	sd->flags &= ~SPLICE_F_NONBLOCK;

	while (len) {
		size_t read_len;
		loff_t pos = sd->pos;

		ret = do_splice_to(in, &pos, pipe, len, flags);
		if (unlikely(ret <= 0))
			goto out_release;

		read_len = ret;
		sd->total_len = read_len;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = actor(pipe, sd);
		if (unlikely(ret <= 0))
			goto out_release;

		bytes += ret;
		len -= ret;
		sd->pos = pos;

		if (ret < read_len)
			goto out_release;
	}

	pipe->nrbufs = pipe->curbuf = 0;
	return bytes;

out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < PIPE_BUFFERS; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;

		if (buf->ops) {
			buf->ops->release(pipe, buf);
			buf->ops = NULL;
		}
	}
	pipe->nrbufs = pipe->curbuf = 0;

	/*
	 * If we transferred some data, return the number of bytes:
	 */
	if (bytes > 0)
		return bytes;

	return ret;
}
EXPORT_SYMBOL(splice_direct_to_actor);

static int direct_splice_actor(struct pipe_inode_info *pipe,
			       struct splice_desc *sd)
{
	struct file *file = sd->u.file;

	return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags);
}

/**
 * do_splice_direct - splices data directly between two files
 * @in: file to splice from
 * @ppos: input file offset
 * @out: file to splice to
 * @len: number of bytes to splice
 * @flags: splice modifier flags
 *
 * Description:
 *    For use by do_sendfile(). splice can easily emulate sendfile, but
 *    doing it in the application would incur an extra system call
 *    (splice in + splice out, as compared to just sendfile()). So this helper
 *    can splice directly through a process-private pipe.
 *
 */
long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
		      size_t len, unsigned int flags)
{
	struct splice_desc sd = {
		.len = len,
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};
	long ret;

	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
	if (ret > 0)
		*ppos += ret;

	return ret;
}

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
static inline struct pipe_inode_info *pipe_info(struct inode *inode)
{
	if (S_ISFIFO(inode->i_mode))
		return inode->i_pipe;

	return NULL;
}
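
/*
 * For reference, do_splice_direct() above is the engine behind
 * sendfile(2): do_sendfile() in fs/read_write.c validates the file
 * descriptors and then effectively does (simplified sketch):
 *
 *	ret = do_splice_direct(in_file, &pos, out_file, count, fl);
 *
 * which is why sendfile() costs a single system call where the
 * equivalent splice-through-a-pipe dance from userspace costs two.
 */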

/*
 * Determine where to splice to/from.
 */
static long do_splice(struct file *in, loff_t __user *off_in,
		      struct file *out, loff_t __user *off_out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	loff_t offset, *off;
	long ret;

	pipe = pipe_info(in->f_path.dentry->d_inode);
	if (pipe) {
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			if (out->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &out->f_pos;

		ret = do_splice_from(pipe, out, off, len, flags);

		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	pipe = pipe_info(out->f_path.dentry->d_inode);
	if (pipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			if (in->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &in->f_pos;

		ret = do_splice_to(in, off, pipe, len, flags);

		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	return -EINVAL;
}

/*
 * Map an iov into an array of pages and offset/length tuples. With the
 * partial_page structure, we can map several non-contiguous ranges into
 * our one pages[] map instead of splitting that operation into pieces.
 * Could easily be exported as a generic helper for other users, in which
 * case one would probably want to add a 'max_nr_pages' parameter as well.
 */
static int get_iovec_page_array(const struct iovec __user *iov,
				unsigned int nr_vecs, struct page **pages,
				struct partial_page *partial, int aligned)
{
	int buffers = 0, error = 0;

	/*
	 * It's ok to take the mmap_sem for reading, even
	 * across a "get_user()".
	 */
	down_read(&current->mm->mmap_sem);

	while (nr_vecs) {
		unsigned long off, npages;
		void __user *base;
		size_t len;
		int i;

		/*
		 * Get user address base and length for this iovec.
		 */
		error = get_user(base, &iov->iov_base);
		if (unlikely(error))
			break;
		error = get_user(len, &iov->iov_len);
		if (unlikely(error))
			break;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		if (unlikely(!len))
			break;
		error = -EFAULT;
		if (unlikely(!base))
			break;

		/*
		 * Get this base offset and number of pages, then map
		 * in the user pages.
		 */
		off = (unsigned long) base & ~PAGE_MASK;

		/*
		 * If asked for alignment, the offset must be zero and the
		 * length a multiple of the PAGE_SIZE.
		 */
		error = -EINVAL;
		if (aligned && (off || len & ~PAGE_MASK))
			break;

		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (npages > PIPE_BUFFERS - buffers)
			npages = PIPE_BUFFERS - buffers;

		error = get_user_pages(current, current->mm,
				       (unsigned long) base, npages, 0, 0,
				       &pages[buffers], NULL);

		if (unlikely(error <= 0))
			break;

		/*
		 * Fill this contiguous range into the partial page map.
		 */
		for (i = 0; i < error; i++) {
			const int plen = min_t(size_t, len, PAGE_SIZE - off);

			partial[buffers].offset = off;
			partial[buffers].len = plen;

			off = 0;
			len -= plen;
			buffers++;
		}

		/*
		 * We didn't complete this iov, stop here since it probably
		 * means we have to move some of this into a pipe to
		 * be able to continue.
		 */
		if (len)
			break;

		/*
		 * Don't continue if we mapped fewer pages than we asked for,
		 * or if we mapped the max number of pages that we have
		 * room for.
		 */
		if (error < npages || buffers == PIPE_BUFFERS)
			break;

		nr_vecs--;
		iov++;
	}

	up_read(&current->mm->mmap_sem);

	if (buffers)
		return buffers;

	return error;
}

static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	char *src;
	int ret;

	ret = buf->ops->confirm(pipe, buf);
	if (unlikely(ret))
		return ret;

	/*
	 * See if we can use the atomic maps, by prefaulting in the
	 * pages and doing an atomic copy
	 */
	if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
		src = buf->ops->map(pipe, buf, 1);
		ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
					      sd->len);
		buf->ops->unmap(pipe, buf, src);
		if (!ret) {
			ret = sd->len;
			goto out;
		}
	}

	/*
	 * No dice, use slow non-atomic map and copy
	 */
	src = buf->ops->map(pipe, buf, 0);

	ret = sd->len;
	if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
		ret = -EFAULT;

	/*
	 * Unmap before the 'out' label: the atomic path above has already
	 * unmapped its mapping, so unmapping there again would be a bug.
	 */
	buf->ops->unmap(pipe, buf, src);
out:
	if (ret > 0)
		sd->u.userptr += ret;
	return ret;
}

/*
 * For lack of a better implementation, implement vmsplice() to userspace
 * as a simple copy of the pipe's pages to the user iov.
 */
static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct splice_desc sd;
	ssize_t size;
	int error;
	long ret;

	pipe = pipe_info(file->f_path.dentry->d_inode);
	if (!pipe)
		return -EBADF;

	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	error = ret = 0;
	while (nr_segs) {
		void __user *base;
		size_t len;

		/*
		 * Get user address base and length for this iovec.
		 */
		error = get_user(base, &iov->iov_base);
		if (unlikely(error))
			break;
		error = get_user(len, &iov->iov_len);
		if (unlikely(error))
			break;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		if (unlikely(!len))
			break;
		if (unlikely(!base)) {
			error = -EFAULT;
			break;
		}

		sd.len = 0;
		sd.total_len = len;
		sd.flags = flags;
		sd.u.userptr = base;
		sd.pos = 0;

		size = __splice_from_pipe(pipe, &sd, pipe_to_user);
		if (size < 0) {
			if (!ret)
				ret = size;

			break;
		}

		ret += size;

		if (size < len)
			break;

		nr_segs--;
		iov++;
	}

	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);

	if (!ret)
		ret = error;

	return ret;
}

/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
 */
static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &user_page_pipe_buf_ops,
	};

	pipe = pipe_info(file->f_path.dentry->d_inode);
	if (!pipe)
		return -EBADF;

	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
					    flags & SPLICE_F_GIFT);
	if (spd.nr_pages <= 0)
		return spd.nr_pages;

	return splice_to_pipe(pipe, &spd);
}

/*
 * Note that vmsplice only really supports true splicing _from_ user memory
 * to a pipe, not the other way around. Splicing from user memory is a simple
 * operation that can be supported without any funky alignment restrictions
 * or nasty vm tricks. We simply map in the user memory and fill it into
 * a pipe. The reverse isn't quite as easy, though. There are two possible
 * solutions for that:
 *
 *	- memcpy() the data internally, at which point we might as well just
 *	  do a regular read() on the buffer anyway.
 *	- Lots of nasty vm tricks, that are neither fast nor flexible (they
 *	  impose restrictions on both ends of the pipe).
 *
 * Currently we punt and implement it as a normal copy, see pipe_to_user().
 *
 */
asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct file *file;
	long error;
	int fput;

	if (unlikely(nr_segs > UIO_MAXIOV))
		return -EINVAL;
	else if (unlikely(!nr_segs))
		return 0;

	error = -EBADF;
	file = fget_light(fd, &fput);
	if (file) {
		if (file->f_mode & FMODE_WRITE)
			error = vmsplice_to_pipe(file, iov, nr_segs, flags);
		else if (file->f_mode & FMODE_READ)
			error = vmsplice_to_user(file, iov, nr_segs, flags);

		fput_light(file, fput);
	}

	return error;
}

asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
			   int fd_out, loff_t __user *off_out,
			   size_t len, unsigned int flags)
{
	long error;
	struct file *in, *out;
	int fput_in, fput_out;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fd_in, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			out = fget_light(fd_out, &fput_out);
			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_splice(in, off_in,
							  out, off_out,
							  len, flags);
				fput_light(out, fput_out);
			}
		}

		fput_light(in, fput_in);
	}

	return error;
}
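
/*
 * Userspace sketch (illustrative): vmsplice() gifting user pages into a
 * pipe, then splicing them onwards without another copy:
 *
 *	struct iovec iov = { .iov_base = buf, .iov_len = buflen };
 *
 *	n = vmsplice(pfd[1], &iov, 1, SPLICE_F_GIFT);
 *	splice(pfd[0], NULL, file_fd, NULL, n, SPLICE_F_MOVE);
 *
 * With SPLICE_F_GIFT, buf must be page aligned and buflen a multiple of
 * PAGE_SIZE (see the 'aligned' check in get_iovec_page_array() above),
 * and the caller promises not to touch the memory afterwards, which is
 * what allows the steal path to move the pages instead of copying.
 */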

/*
 * Make sure there's data to read. Wait for input if we can, otherwise
 * return an appropriate error.
 */
static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is ok.
	 */
	if (pipe->nrbufs)
		return 0;

	ret = 0;
	mutex_lock(&pipe->inode->i_mutex);

	while (!pipe->nrbufs) {
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		pipe_wait(pipe);
	}

	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

/*
 * Make sure there's writable room. Wait for room if we can, otherwise
 * return an appropriate error.
 */
static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is ok.
	 */
	if (pipe->nrbufs < PIPE_BUFFERS)
		return 0;

	ret = 0;
	mutex_lock(&pipe->inode->i_mutex);

	while (pipe->nrbufs >= PIPE_BUFFERS) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			ret = -EPIPE;
			break;
		}
		if (flags & SPLICE_F_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

/*
 * Link contents of ipipe to opipe.
 */
static int link_pipe(struct pipe_inode_info *ipipe,
		     struct pipe_inode_info *opipe,
		     size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	int ret = 0, i = 0, nbuf;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by inode address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	inode_double_lock(ipipe->inode, opipe->inode);

	do {
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		/*
		 * If we have iterated all input buffers or ran out of
		 * output room, break.
		 */
		if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
			break;

		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
		nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);

		/*
		 * Get a reference to this pipe buffer,
		 * so we can copy the contents over.
		 */
		ibuf->ops->get(ipipe, ibuf);

		obuf = opipe->bufs + nbuf;
		*obuf = *ibuf;

		/*
		 * Don't inherit the gift flag, we need to
		 * prevent multiple steals of this page.
		 */
		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;

		if (obuf->len > len)
			obuf->len = len;

		opipe->nrbufs++;
		ret += obuf->len;
		len -= obuf->len;
		i++;
	} while (len);

	inode_double_unlock(ipipe->inode, opipe->inode);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0) {
		smp_mb();
		if (waitqueue_active(&opipe->wait))
			wake_up_interruptible(&opipe->wait);
		kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
	}

	return ret;
}

/*
 * This is a tee(1) implementation that works on pipes. It doesn't copy
 * any data, it simply references the 'in' pages on the 'out' pipe.
 * The 'flags' used are the SPLICE_F_* variants, currently the only
 * applicable one is SPLICE_F_NONBLOCK.
 */
static long do_tee(struct file *in, struct file *out, size_t len,
		   unsigned int flags)
{
	struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
	struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
	int ret = -EINVAL;

	/*
	 * Duplicate the contents of ipipe to opipe without actually
	 * copying the data.
	 */
	if (ipipe && opipe && ipipe != opipe) {
		/*
		 * Keep going, unless we encounter an error. The ipipe/opipe
		 * ordering doesn't really matter.
		 */
		ret = link_ipipe_prep(ipipe, flags);
		if (!ret) {
			ret = link_opipe_prep(opipe, flags);
			if (!ret) {
				ret = link_pipe(ipipe, opipe, len, flags);
				if (!ret && (flags & SPLICE_F_NONBLOCK))
					ret = -EAGAIN;
			}
		}
	}

	return ret;
}

asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
{
	struct file *in;
	int error, fput_in;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fdin, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			int fput_out;
			struct file *out = fget_light(fdout, &fput_out);

			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_tee(in, out, len, flags);
				fput_light(out, fput_out);
			}
		}
		fput_light(in, fput_in);
	}

	return error;
}
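
/*
 * Userspace sketch (illustrative): tee() duplicating pipe contents so the
 * same data can be both logged and forwarded. tee() only references the
 * pages, so the subsequent splice() still sees and consumes the original
 * data:
 *
 *	n = tee(pfd_in[0], pfd_log[1], INT_MAX, SPLICE_F_NONBLOCK);
 *	if (n > 0)
 *		splice(pfd_in[0], NULL, sock_fd, NULL, n, SPLICE_F_MOVE);
 */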