/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files, network, direct splicing, etc and
 * fixing lots of bugs.
 *
 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
 *
 */
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/uio.h>
#include <linux/security.h>
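/*
 * For orientation, the userspace view of the "extended pipe" model
 * described above. This is an illustrative sketch only (not part of
 * this file); 'in_fd' is a regular file and 'sock_fd' a connected
 * socket:
 *
 *	int pfd[2];
 *	loff_t off = 0;
 *	ssize_t n;
 *
 *	pipe(pfd);
 *	// file -> pipe, no copy through userspace
 *	n = splice(in_fd, &off, pfd[1], NULL, 65536, SPLICE_F_MORE);
 *	// pipe -> socket, pages are handed to the protocol's sendpage()
 *	if (n > 0)
 *		splice(pfd[0], NULL, sock_fd, NULL, n, SPLICE_F_MORE);
 */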
/*
 * Attempt to steal a page from a pipe buffer. This should perhaps go into
 * a vm helper function, it's already simplified quite a bit by the
 * addition of remove_mapping(). If success is returned, the caller may
 * attempt to reuse this page for another destination.
 */
static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	struct address_space *mapping;

	lock_page(page);

	mapping = page_mapping(page);
	if (mapping) {
		WARN_ON(!PageUptodate(page));

		/*
		 * At least for ext2 with nobh option, we need to wait on
		 * writeback completing on this page, since we'll remove it
		 * from the pagecache. Otherwise truncate won't wait on the
		 * page, allowing the disk blocks to be reused by someone else
		 * before we actually wrote our data to them. fs corruption
		 * ensues.
		 */
		wait_on_page_writeback(page);

		if (PagePrivate(page))
			try_to_release_page(page, GFP_KERNEL);

		/*
		 * If we succeeded in removing the mapping, set LRU flag
		 * and return good.
		 */
		if (remove_mapping(mapping, page)) {
			buf->flags |= PIPE_BUF_FLAG_LRU;
			return 0;
		}
	}

	/*
	 * Raced with truncate or failed to remove page from current
	 * address space, unlock and return failure.
	 */
	unlock_page(page);
	return 1;
}

static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
					struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
	buf->flags &= ~PIPE_BUF_FLAG_LRU;
}

/*
 * Check whether the contents of buf are OK to access. Since the content
 * is a page cache page, IO may be in flight.
 */
static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
				       struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	int err;

	if (!PageUptodate(page)) {
		lock_page(page);

		/*
		 * Page got truncated/unhashed. This will cause a 0-byte
		 * splice, if this is the first page.
		 */
		if (!page->mapping) {
			err = -ENODATA;
			goto error;
		}

		/*
		 * Uh oh, read-error from disk.
		 */
		if (!PageUptodate(page)) {
			err = -EIO;
			goto error;
		}

		/*
		 * Page is ok after all, we are done.
		 */
		unlock_page(page);
	}

	return 0;
error:
	unlock_page(page);
	return err;
}

static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = page_cache_pipe_buf_confirm,
	.release = page_cache_pipe_buf_release,
	.steal = page_cache_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
		return 1;

	buf->flags |= PIPE_BUF_FLAG_LRU;
	return generic_pipe_buf_steal(pipe, buf);
}

static const struct pipe_buf_operations user_page_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = page_cache_pipe_buf_release,
	.steal = user_page_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

/**
 * splice_to_pipe - fill passed data into a pipe
 * @pipe: pipe to fill
 * @spd: data to fill
 *
 * Description:
 *    @spd contains a map of pages and len/offset tuples, along with
 *    the struct pipe_buf_operations associated with these pages. This
 *    function will link that data to the pipe.
 *
 */
ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
		       struct splice_pipe_desc *spd)
{
	unsigned int spd_pages = spd->nr_pages;
	int ret, do_wakeup, page_nr;

	ret = 0;
	do_wakeup = 0;
	page_nr = 0;

	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		if (pipe->nrbufs < PIPE_BUFFERS) {
			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;

			buf->page = spd->pages[page_nr];
			buf->offset = spd->partial[page_nr].offset;
			buf->len = spd->partial[page_nr].len;
			buf->private = spd->partial[page_nr].private;
			buf->ops = spd->ops;
			if (spd->flags & SPLICE_F_GIFT)
				buf->flags |= PIPE_BUF_FLAG_GIFT;

			pipe->nrbufs++;
			page_nr++;
			ret += buf->len;

			if (pipe->inode)
				do_wakeup = 1;

			if (!--spd->nr_pages)
				break;
			if (pipe->nrbufs < PIPE_BUFFERS)
				continue;

			break;
		}

		if (spd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}

		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	if (pipe->inode) {
		mutex_unlock(&pipe->inode->i_mutex);

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		}
	}

	while (page_nr < spd_pages)
		spd->spd_release(spd, page_nr++);

	return ret;
}

static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
{
	page_cache_release(spd->pages[i]);
}
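/*
 * A minimal sketch of how a producer feeds splice_to_pipe(), modeled on
 * __generic_file_splice_read() below. 'nr' and 'flags' are assumed to
 * be supplied by the caller; the arrays must stay valid until the call
 * returns:
 *
 *	struct page *pages[PIPE_BUFFERS];
 *	struct partial_page partial[PIPE_BUFFERS];
 *	struct splice_pipe_desc spd = {
 *		.pages		= pages,
 *		.partial	= partial,
 *		.nr_pages	= nr,
 *		.flags		= flags,
 *		.ops		= &page_cache_pipe_buf_ops,
 *		.spd_release	= spd_release_page,
 *	};
 *
 *	return splice_to_pipe(pipe, &spd);
 */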
static int
__generic_file_splice_read(struct file *in, loff_t *ppos,
			   struct pipe_inode_info *pipe, size_t len,
			   unsigned int flags)
{
	struct address_space *mapping = in->f_mapping;
	unsigned int loff, nr_pages, req_pages;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct page *page;
	pgoff_t index, end_index;
	loff_t isize;
	int error, page_nr;
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &page_cache_pipe_buf_ops,
		.spd_release = spd_release_page,
	};

	index = *ppos >> PAGE_CACHE_SHIFT;
	loff = *ppos & ~PAGE_CACHE_MASK;
	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS);

	/*
	 * Lookup the (hopefully) full range of pages we need.
	 */
	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
	index += spd.nr_pages;

	/*
	 * If find_get_pages_contig() returned fewer pages than we needed,
	 * readahead/allocate the rest and fill in the holes.
	 */
	if (spd.nr_pages < nr_pages)
		page_cache_sync_readahead(mapping, &in->f_ra, in,
				index, req_pages - spd.nr_pages);

	error = 0;
	while (spd.nr_pages < nr_pages) {
		/*
		 * Page could be there, find_get_pages_contig() breaks on
		 * the first hole.
		 */
		page = find_get_page(mapping, index);
		if (!page) {
			/*
			 * page didn't exist, allocate one.
			 */
			page = page_cache_alloc_cold(mapping);
			if (!page)
				break;

			error = add_to_page_cache_lru(page, mapping, index,
					mapping_gfp_mask(mapping));
			if (unlikely(error)) {
				page_cache_release(page);
				if (error == -EEXIST)
					continue;
				break;
			}
			/*
			 * add_to_page_cache() locks the page, unlock it
			 * to avoid convoluting the logic below even more.
			 */
			unlock_page(page);
		}

		pages[spd.nr_pages++] = page;
		index++;
	}

	/*
	 * Now loop over the map and see if we need to start IO on any
	 * pages, fill in the partial map, etc.
	 */
	index = *ppos >> PAGE_CACHE_SHIFT;
	nr_pages = spd.nr_pages;
	spd.nr_pages = 0;
	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
		unsigned int this_len;

		if (!len)
			break;

		/*
		 * this_len is the max we'll use from this page
		 */
		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
		page = pages[page_nr];

		if (PageReadahead(page))
			page_cache_async_readahead(mapping, &in->f_ra, in,
					page, index, req_pages - page_nr);

		/*
		 * If the page isn't uptodate, we may need to start io on it
		 */
		if (!PageUptodate(page)) {
			/*
			 * If in nonblock mode then don't block on waiting
			 * for an in-flight io page
			 */
			if (flags & SPLICE_F_NONBLOCK) {
				if (TestSetPageLocked(page)) {
					error = -EAGAIN;
					break;
				}
			} else
				lock_page(page);

			/*
			 * page was truncated, stop here. if this isn't the
			 * first page, we'll just complete what we already
			 * added
			 */
			if (!page->mapping) {
				unlock_page(page);
				break;
			}
			/*
			 * page was already under io and is now done, great
			 */
			if (PageUptodate(page)) {
				unlock_page(page);
				goto fill_it;
			}

			/*
			 * need to read in the page
			 */
			error = mapping->a_ops->readpage(in, page);
			if (unlikely(error)) {
				/*
				 * We really should re-lookup the page here,
				 * but it complicates things a lot. Instead
				 * let's just do what we already stored, and
				 * we'll get it the next time we are called.
				 */
				if (error == AOP_TRUNCATED_PAGE)
					error = 0;

				break;
			}
		}
fill_it:
		/*
		 * i_size must be checked after PageUptodate.
		 */
		isize = i_size_read(mapping->host);
		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
		if (unlikely(!isize || index > end_index))
			break;

		/*
		 * if this is the last page, see if we need to shrink
		 * the length and stop
		 */
		if (end_index == index) {
			unsigned int plen;

			/*
			 * max good bytes in this page
			 */
			plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (plen <= loff)
				break;

			/*
			 * force quit after adding this page
			 */
			this_len = min(this_len, plen - loff);
			len = this_len;
		}

		partial[page_nr].offset = loff;
		partial[page_nr].len = this_len;
		len -= this_len;
		loff = 0;
		spd.nr_pages++;
		index++;
	}

	/*
	 * Release any pages at the end, if we quit early. 'page_nr' is how far
	 * we got, 'nr_pages' is how many pages are in the map.
	 */
	while (page_nr < nr_pages)
		page_cache_release(pages[page_nr++]);
	in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;

	if (spd.nr_pages)
		return splice_to_pipe(pipe, &spd);

	return error;
}
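/*
 * Worked example for the page-range arithmetic at the top of
 * __generic_file_splice_read() above, assuming 4k pages: for
 * *ppos = 5000 and len = 10000,
 *
 *	index     = 5000 >> 12 = 1
 *	loff      = 5000 & 4095 = 904
 *	req_pages = (10000 + 904 + 4095) >> 12 = 3
 *
 * i.e. the request starts 904 bytes into page 1 and touches three
 * pages in total.
 */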
/**
 * generic_file_splice_read - splice data from file to a pipe
 * @in: file to splice from
 * @ppos: position in @in
 * @pipe: pipe to splice to
 * @len: number of bytes to splice
 * @flags: splice modifier flags
 *
 * Description:
 *    Will read pages from given file and fill them into a pipe. Can be
 *    used as long as the address_space operations for the source implement
 *    a readpage() hook.
 *
 */
ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
				 struct pipe_inode_info *pipe, size_t len,
				 unsigned int flags)
{
	loff_t isize, left;
	int ret;

	isize = i_size_read(in->f_mapping->host);
	if (unlikely(*ppos >= isize))
		return 0;

	left = isize - *ppos;
	if (unlikely(left < len))
		len = left;

	ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
	if (ret > 0)
		*ppos += ret;

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_read);
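/*
 * Typical use: a filesystem whose address_space operations implement
 * ->readpage() can wire the generic helpers straight into its
 * file_operations. A sketch for a hypothetical filesystem 'foo'
 * (generic_file_splice_write is defined further down in this file):
 *
 *	const struct file_operations foo_file_operations = {
 *		...
 *		.splice_read	= generic_file_splice_read,
 *		.splice_write	= generic_file_splice_write,
 *	};
 */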
/*
 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 * using sendpage(). Return the number of bytes sent.
 */
static int pipe_to_sendpage(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, struct splice_desc *sd)
{
	struct file *file = sd->u.file;
	loff_t pos = sd->pos;
	int ret, more;

	ret = buf->ops->confirm(pipe, buf);
	if (!ret) {
		more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;

		ret = file->f_op->sendpage(file, buf->page, buf->offset,
					   sd->len, &pos, more);
	}

	return ret;
}

/*
 * This is a little more tricky than the file -> pipe splicing. There are
 * basically three cases:
 *
 *	- Destination page already exists in the address space and there
 *	  are users of it. For that case we have no other option than
 *	  copying the data. Tough luck.
 *	- Destination page already exists in the address space, but there
 *	  are no users of it. Make sure it's uptodate, then drop it. Fall
 *	  through to last case.
 *	- Destination page does not exist, we can add the pipe page to
 *	  the page cache and avoid the copy.
 *
 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 * sd->flags), we attempt to migrate pages from the pipe to the output
 * file address space page cache. This is possible if no one else has
 * the pipe page referenced outside of the pipe and page cache. If
 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 * a new page in the output file page cache and fill/dirty that.
 */
static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	struct file *file = sd->u.file;
	struct address_space *mapping = file->f_mapping;
	unsigned int offset, this_len;
	struct page *page;
	void *fsdata;
	int ret;

	/*
	 * make sure the data in this buffer is uptodate
	 */
	ret = buf->ops->confirm(pipe, buf);
	if (unlikely(ret))
		return ret;

	offset = sd->pos & ~PAGE_CACHE_MASK;

	this_len = sd->len;
	if (this_len + offset > PAGE_CACHE_SIZE)
		this_len = PAGE_CACHE_SIZE - offset;

	ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
				AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
	if (unlikely(ret))
		goto out;

	if (buf->page != page) {
		/*
		 * Careful, ->map() uses KM_USER0!
		 */
		char *src = buf->ops->map(pipe, buf, 1);
		char *dst = kmap_atomic(page, KM_USER1);

		memcpy(dst + offset, src + buf->offset, this_len);
		flush_dcache_page(page);
		kunmap_atomic(dst, KM_USER1);
		buf->ops->unmap(pipe, buf, src);
	}
	ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
				page, fsdata);
out:
	return ret;
}
/**
 * __splice_from_pipe - splice data from a pipe to given actor
 * @pipe: pipe to splice from
 * @sd: information to @actor
 * @actor: handler that splices the data
 *
 * Description:
 *    This function does little more than loop over the pipe and call
 *    @actor to do the actual moving of a single struct pipe_buffer to
 *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
 *    pipe_to_user.
 *
 */
ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
			   splice_actor *actor)
{
	int ret, do_wakeup, err;

	ret = 0;
	do_wakeup = 0;

	for (;;) {
		if (pipe->nrbufs) {
			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
			const struct pipe_buf_operations *ops = buf->ops;

			sd->len = buf->len;
			if (sd->len > sd->total_len)
				sd->len = sd->total_len;

			err = actor(pipe, buf, sd);
			if (err <= 0) {
				if (!ret && err != -ENODATA)
					ret = err;

				break;
			}

			ret += err;
			buf->offset += err;
			buf->len -= err;

			sd->len -= err;
			sd->pos += err;
			sd->total_len -= err;
			if (sd->len)
				continue;

			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
				pipe->nrbufs--;
				if (pipe->inode)
					do_wakeup = 1;
			}

			if (!sd->total_len)
				break;
		}

		if (pipe->nrbufs)
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (ret)
				break;
		}

		if (sd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
			do_wakeup = 0;
		}

		pipe_wait(pipe);
	}

	if (do_wakeup) {
		smp_mb();
		if (waitqueue_active(&pipe->wait))
			wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}

	return ret;
}
EXPORT_SYMBOL(__splice_from_pipe);

/**
 * splice_from_pipe - splice data from a pipe to a file
 * @pipe: pipe to splice from
 * @out: file to splice to
 * @ppos: position in @out
 * @len: how many bytes to splice
 * @flags: splice modifier flags
 * @actor: handler that splices the data
 *
 * Description:
 *    See __splice_from_pipe. This function locks the input and output inodes,
 *    otherwise it's identical to __splice_from_pipe().
 *
 */
ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags,
			 splice_actor *actor)
{
	ssize_t ret;
	struct inode *inode = out->f_mapping->host;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};

	/*
	 * The actor worker might be calling ->prepare_write and
	 * ->commit_write. Most of the time, these expect i_mutex to
	 * be held. Since this may result in an ABBA deadlock with
	 * pipe->inode, we have to order lock acquisition here.
	 */
	inode_double_lock(inode, pipe->inode);
	ret = __splice_from_pipe(pipe, &sd, actor);
	inode_double_unlock(inode, pipe->inode);

	return ret;
}
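/*
 * A minimal sketch of a custom splice actor for __splice_from_pipe(),
 * modeled on pipe_to_file() and pipe_to_user() in this file;
 * 'my_consume' is a hypothetical helper. An actor must confirm the
 * buffer before touching its contents, and returns how many bytes it
 * consumed (or a negative error):
 *
 *	static int my_actor(struct pipe_inode_info *pipe,
 *			    struct pipe_buffer *buf, struct splice_desc *sd)
 *	{
 *		int ret = buf->ops->confirm(pipe, buf);
 *		if (unlikely(ret))
 *			return ret;
 *
 *		// consume sd->len bytes from buf->page at buf->offset
 *		return my_consume(buf->page, buf->offset, sd->len);
 *	}
 */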
/**
 * generic_file_splice_write_nolock - generic_file_splice_write without mutexes
 * @pipe: pipe info
 * @out: file to write to
 * @ppos: position in @out
 * @len: number of bytes to splice
 * @flags: splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file. The caller is responsible
 *    for acquiring i_mutex on both inodes.
 *
 */
ssize_t
generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
				 loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};
	ssize_t ret;
	int err;

	err = remove_suid(out->f_path.dentry);
	if (unlikely(err))
		return err;

	ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
	if (ret > 0) {
		unsigned long nr_pages;

		*ppos += ret;
		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);

			if (err)
				ret = err;
		}
		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write_nolock);

/**
 * generic_file_splice_write - splice data from a pipe to a file
 * @pipe: pipe info
 * @out: file to write to
 * @ppos: position in @out
 * @len: number of bytes to splice
 * @flags: splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file.
 *
 */
ssize_t
generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
			  loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	int killsuid, killpriv;
	ssize_t ret;
	int err = 0;

	killpriv = security_inode_need_killpriv(out->f_path.dentry);
	killsuid = should_remove_suid(out->f_path.dentry);
	if (unlikely(killsuid || killpriv)) {
		mutex_lock(&inode->i_mutex);
		if (killpriv)
			err = security_inode_killpriv(out->f_path.dentry);
		if (!err && killsuid)
			err = __remove_suid(out->f_path.dentry, killsuid);
		mutex_unlock(&inode->i_mutex);
		if (err)
			return err;
	}

	ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
	if (ret > 0) {
		unsigned long nr_pages;

		*ppos += ret;
		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			mutex_lock(&inode->i_mutex);
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);
			mutex_unlock(&inode->i_mutex);

			if (err)
				ret = err;
		}
		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write);

/**
 * generic_splice_sendpage - splice data from a pipe to a socket
 * @pipe: pipe to splice from
 * @out: socket to write to
 * @ppos: position in @out
 * @len: number of bytes to splice
 * @flags: splice modifier flags
 *
 * Description:
 *    Will send @len bytes from the pipe to a network socket. No data copying
 *    is involved.
 *
 */
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
				loff_t *ppos, size_t len, unsigned int flags)
{
	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_splice_sendpage);
/*
 * Attempt to initiate a splice from pipe to file.
 */
static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
			   loff_t *ppos, size_t len, unsigned int flags)
{
	int ret;

	if (unlikely(!out->f_op || !out->f_op->splice_write))
		return -EINVAL;

	if (unlikely(!(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	ret = rw_verify_area(WRITE, out, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	return out->f_op->splice_write(pipe, out, ppos, len, flags);
}

/*
 * Attempt to initiate a splice from a file to a pipe.
 */
static long do_splice_to(struct file *in, loff_t *ppos,
			 struct pipe_inode_info *pipe, size_t len,
			 unsigned int flags)
{
	int ret;

	if (unlikely(!in->f_op || !in->f_op->splice_read))
		return -EINVAL;

	if (unlikely(!(in->f_mode & FMODE_READ)))
		return -EBADF;

	ret = rw_verify_area(READ, in, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	return in->f_op->splice_read(in, ppos, pipe, len, flags);
}

/**
 * splice_direct_to_actor - splices data directly between two non-pipes
 * @in: file to splice from
 * @sd: actor information on where to splice to
 * @actor: handles the data splicing
 *
 * Description:
 *    This is a special case helper to splice directly between two
 *    points, without requiring an explicit pipe. Internally an allocated
 *    pipe is cached in the process, and reused during the lifetime of
 *    that process.
 *
 */
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
			       splice_direct_actor *actor)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	umode_t i_mode;
	size_t len;
	int i, flags;

	/*
	 * We require the input to be a regular file, as we don't want to
	 * randomly drop data for eg socket -> socket splicing. Use the
	 * piped splicing for that!
	 */
	i_mode = in->f_path.dentry->d_inode->i_mode;
	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
		return -EINVAL;

	/*
	 * neither in nor out is a pipe, set up an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info(NULL);
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	ret = 0;
	bytes = 0;
	len = sd->total_len;
	flags = sd->flags;

	/*
	 * Don't block on output, we have to drain the direct pipe.
	 */
	sd->flags &= ~SPLICE_F_NONBLOCK;

	while (len) {
		size_t read_len;
		loff_t pos = sd->pos;

		ret = do_splice_to(in, &pos, pipe, len, flags);
		if (unlikely(ret <= 0))
			goto out_release;

		read_len = ret;
		sd->total_len = read_len;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode, as data
		 * could then get stuck in the internal pipe:
		 */
		ret = actor(pipe, sd);
		if (unlikely(ret <= 0))
			goto out_release;

		bytes += ret;
		len -= ret;
		sd->pos = pos;

		if (ret < read_len)
			goto out_release;
	}

done:
	pipe->nrbufs = pipe->curbuf = 0;
	file_accessed(in);
	return bytes;

out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < PIPE_BUFFERS; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;

		if (buf->ops) {
			buf->ops->release(pipe, buf);
			buf->ops = NULL;
		}
	}

	if (!bytes)
		bytes = ret;

	goto done;
}
EXPORT_SYMBOL(splice_direct_to_actor);
static int direct_splice_actor(struct pipe_inode_info *pipe,
			       struct splice_desc *sd)
{
	struct file *file = sd->u.file;

	return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags);
}

/**
 * do_splice_direct - splices data directly between two files
 * @in: file to splice from
 * @ppos: input file offset
 * @out: file to splice to
 * @len: number of bytes to splice
 * @flags: splice modifier flags
 *
 * Description:
 *    For use by do_sendfile(). splice can easily emulate sendfile, but
 *    doing it in the application would incur an extra system call
 *    (splice in + splice out, as compared to just sendfile()). So this helper
 *    can splice directly through a process-private pipe.
 *
 */
long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
		      size_t len, unsigned int flags)
{
	struct splice_desc sd = {
		.len		= len,
		.total_len	= len,
		.flags		= flags,
		.pos		= *ppos,
		.u.file		= out,
	};
	long ret;

	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
	if (ret > 0)
		*ppos += ret;

	return ret;
}
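/*
 * For reference, do_splice_direct() is how do_sendfile() moves data, so
 * the whole sendfile(2) data path reduces to roughly (sketch, error
 * handling omitted):
 *
 *	loff_t pos = ...;	// input file offset
 *	long ret;
 *
 *	ret = do_splice_direct(in_file, &pos, out_file, count, 0);
 *
 * which splices in -> private pipe -> out without the data ever
 * passing through userspace.
 */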
/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
static inline struct pipe_inode_info *pipe_info(struct inode *inode)
{
	if (S_ISFIFO(inode->i_mode))
		return inode->i_pipe;

	return NULL;
}

/*
 * Determine where to splice to/from.
 */
static long do_splice(struct file *in, loff_t __user *off_in,
		      struct file *out, loff_t __user *off_out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	loff_t offset, *off;
	long ret;

	pipe = pipe_info(in->f_path.dentry->d_inode);
	if (pipe) {
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			if (out->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &out->f_pos;

		ret = do_splice_from(pipe, out, off, len, flags);

		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	pipe = pipe_info(out->f_path.dentry->d_inode);
	if (pipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			if (in->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &in->f_pos;

		ret = do_splice_to(in, off, pipe, len, flags);

		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	return -EINVAL;
}

/*
 * Do a copy-from-user while holding the mmap_semaphore for reading, in a
 * manner safe from deadlocking with simultaneous mmap() (grabbing mmap_sem
 * for writing) and page faulting on the user memory pointed to by src.
 * This assumes that we will very rarely hit the partial != 0 path, or this
 * will not be a win.
 */
static int copy_from_user_mmap_sem(void *dst, const void __user *src, size_t n)
{
	int partial;

	if (!access_ok(VERIFY_READ, src, n))
		return -EFAULT;

	pagefault_disable();
	partial = __copy_from_user_inatomic(dst, src, n);
	pagefault_enable();

	/*
	 * Didn't copy everything, drop the mmap_sem and do a faulting copy
	 */
	if (unlikely(partial)) {
		up_read(&current->mm->mmap_sem);
		partial = copy_from_user(dst, src, n);
		down_read(&current->mm->mmap_sem);
	}

	return partial;
}
/*
 * Map an iov into an array of pages and offset/length tuples. With the
 * partial_page structure, we can map several non-contiguous ranges into
 * our one pages[] map instead of splitting that operation into pieces.
 * Could easily be exported as a generic helper for other users, in which
 * case one would probably want to add a 'max_nr_pages' parameter as well.
 */
static int get_iovec_page_array(const struct iovec __user *iov,
				unsigned int nr_vecs, struct page **pages,
				struct partial_page *partial, int aligned)
{
	int buffers = 0, error = 0;

	down_read(&current->mm->mmap_sem);

	while (nr_vecs) {
		unsigned long off, npages;
		struct iovec entry;
		void __user *base;
		size_t len;
		int i;

		error = -EFAULT;
		if (copy_from_user_mmap_sem(&entry, iov, sizeof(entry)))
			break;

		base = entry.iov_base;
		len = entry.iov_len;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		error = 0;
		if (unlikely(!len))
			break;
		error = -EFAULT;
		if (!access_ok(VERIFY_READ, base, len))
			break;

		/*
		 * Get this base offset and number of pages, then map
		 * in the user pages.
		 */
		off = (unsigned long) base & ~PAGE_MASK;

		/*
		 * If asked for alignment, the offset must be zero and the
		 * length a multiple of the PAGE_SIZE.
		 */
		error = -EINVAL;
		if (aligned && (off || len & ~PAGE_MASK))
			break;

		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (npages > PIPE_BUFFERS - buffers)
			npages = PIPE_BUFFERS - buffers;

		error = get_user_pages(current, current->mm,
				       (unsigned long) base, npages, 0, 0,
				       &pages[buffers], NULL);

		if (unlikely(error <= 0))
			break;

		/*
		 * Fill this contiguous range into the partial page map.
		 */
		for (i = 0; i < error; i++) {
			const int plen = min_t(size_t, len, PAGE_SIZE - off);

			partial[buffers].offset = off;
			partial[buffers].len = plen;

			off = 0;
			len -= plen;
			buffers++;
		}

		/*
		 * We didn't complete this iov, stop here since it probably
		 * means we have to move some of this into a pipe to
		 * be able to continue.
		 */
		if (len)
			break;

		/*
		 * Don't continue if we mapped fewer pages than we asked for,
		 * or if we mapped the max number of pages that we have
		 * room for.
		 */
		if (error < npages || buffers == PIPE_BUFFERS)
			break;

		nr_vecs--;
		iov++;
	}

	up_read(&current->mm->mmap_sem);

	if (buffers)
		return buffers;

	return error;
}

static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	char *src;
	int ret;

	ret = buf->ops->confirm(pipe, buf);
	if (unlikely(ret))
		return ret;

	/*
	 * See if we can use the atomic maps, by prefaulting in the
	 * pages and doing an atomic copy
	 */
	if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
		src = buf->ops->map(pipe, buf, 1);
		ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
					      sd->len);
		buf->ops->unmap(pipe, buf, src);
		if (!ret) {
			ret = sd->len;
			goto out;
		}
	}

	/*
	 * No dice, use slow non-atomic map and copy
	 */
	src = buf->ops->map(pipe, buf, 0);

	ret = sd->len;
	if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
		ret = -EFAULT;

	buf->ops->unmap(pipe, buf, src);
out:
	if (ret > 0)
		sd->u.userptr += ret;
	return ret;
}
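/*
 * Worked example for the page math in get_iovec_page_array() above,
 * assuming 4k pages: for a user base address of 0x10234 and
 * len = 8192,
 *
 *	off    = 0x10234 & ~PAGE_MASK = 0x234 (564 bytes)
 *	npages = (564 + 8192 + 4095) >> 12 = 3
 *
 * the misaligned start drags the range across three pages even though
 * len is exactly two pages worth of data.
 */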
/*
 * For lack of a better implementation, implement vmsplice() to userspace
 * as a simple copy of the pipe's pages to the user iov.
 */
static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct splice_desc sd;
	ssize_t size;
	int error;
	long ret;

	pipe = pipe_info(file->f_path.dentry->d_inode);
	if (!pipe)
		return -EBADF;

	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	error = ret = 0;
	while (nr_segs) {
		void __user *base;
		size_t len;

		/*
		 * Get user address base and length for this iovec.
		 */
		error = get_user(base, &iov->iov_base);
		if (unlikely(error))
			break;
		error = get_user(len, &iov->iov_len);
		if (unlikely(error))
			break;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		if (unlikely(!len))
			break;
		if (unlikely(!base)) {
			error = -EFAULT;
			break;
		}

		if (unlikely(!access_ok(VERIFY_WRITE, base, len))) {
			error = -EFAULT;
			break;
		}

		sd.len = 0;
		sd.total_len = len;
		sd.flags = flags;
		sd.u.userptr = base;
		sd.pos = 0;

		size = __splice_from_pipe(pipe, &sd, pipe_to_user);
		if (size < 0) {
			if (!ret)
				ret = size;

			break;
		}

		ret += size;

		if (size < len)
			break;

		nr_segs--;
		iov++;
	}

	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);

	if (!ret)
		ret = error;

	return ret;
}

/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
 */
static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct page *pages[PIPE_BUFFERS];
	struct partial_page partial[PIPE_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.flags = flags,
		.ops = &user_page_pipe_buf_ops,
		.spd_release = spd_release_page,
	};

	pipe = pipe_info(file->f_path.dentry->d_inode);
	if (!pipe)
		return -EBADF;

	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
					    flags & SPLICE_F_GIFT);
	if (spd.nr_pages <= 0)
		return spd.nr_pages;

	return splice_to_pipe(pipe, &spd);
}
/*
 * Note that vmsplice only really supports true splicing _from_ user memory
 * to a pipe, not the other way around. Splicing from user memory is a simple
 * operation that can be supported without any funky alignment restrictions
 * or nasty vm tricks. We simply map in the user memory and fill it into
 * a pipe. The reverse isn't quite as easy, though. There are two possible
 * solutions for that:
 *
 *	- memcpy() the data internally, at which point we might as well just
 *	  do a regular read() on the buffer anyway.
 *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
 *	  has restrictions on both ends of the pipe).
 *
 * Currently we punt and implement it as a normal copy, see pipe_to_user().
 *
 */
asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct file *file;
	long error;
	int fput;

	if (unlikely(nr_segs > UIO_MAXIOV))
		return -EINVAL;
	else if (unlikely(!nr_segs))
		return 0;

	error = -EBADF;
	file = fget_light(fd, &fput);
	if (file) {
		if (file->f_mode & FMODE_WRITE)
			error = vmsplice_to_pipe(file, iov, nr_segs, flags);
		else if (file->f_mode & FMODE_READ)
			error = vmsplice_to_user(file, iov, nr_segs, flags);

		fput_light(file, fput);
	}

	return error;
}

asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
			   int fd_out, loff_t __user *off_out,
			   size_t len, unsigned int flags)
{
	long error;
	struct file *in, *out;
	int fput_in, fput_out;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fd_in, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			out = fget_light(fd_out, &fput_out);
			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_splice(in, off_in,
							  out, off_out,
							  len, flags);
				fput_light(out, fput_out);
			}
		}

		fput_light(in, fput_in);
	}

	return error;
}
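/*
 * The userspace view of the above, as an illustrative sketch: gift user
 * pages into a pipe with vmsplice(2), then splice them onward. 'pfd' is
 * a pipe and 'file_fd' a regular file:
 *
 *	struct iovec iov = { .iov_base = buf, .iov_len = len };
 *	loff_t off = 0;
 *
 *	// user memory -> pipe; gifted pages may later be stolen
 *	vmsplice(pfd[1], &iov, 1, SPLICE_F_GIFT);
 *	// pipe -> file, attempting to move rather than copy
 *	splice(pfd[0], NULL, file_fd, &off, len, SPLICE_F_MOVE);
 *
 * The gift comes with a caveat: the application should not touch the
 * gifted memory again, since the pages may now be owned by the page
 * cache.
 */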
/*
 * Make sure there's data to read. Wait for input if we can, otherwise
 * return an appropriate error.
 */
static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is ok.
	 */
	if (pipe->nrbufs)
		return 0;

	ret = 0;
	mutex_lock(&pipe->inode->i_mutex);

	while (!pipe->nrbufs) {
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		pipe_wait(pipe);
	}

	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

/*
 * Make sure there's writable room. Wait for room if we can, otherwise
 * return an appropriate error.
 */
static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is ok.
	 */
	if (pipe->nrbufs < PIPE_BUFFERS)
		return 0;

	ret = 0;
	mutex_lock(&pipe->inode->i_mutex);

	while (pipe->nrbufs >= PIPE_BUFFERS) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			ret = -EPIPE;
			break;
		}
		if (flags & SPLICE_F_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

/*
 * Link contents of ipipe to opipe.
 */
static int link_pipe(struct pipe_inode_info *ipipe,
		     struct pipe_inode_info *opipe,
		     size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	int ret = 0, i = 0, nbuf;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * acquisition by inode address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	inode_double_lock(ipipe->inode, opipe->inode);

	do {
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		/*
		 * If we have iterated all input buffers or run out of
		 * output room, break.
		 */
		if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
			break;

		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
		nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);

		/*
		 * Get a reference to this pipe buffer,
		 * so we can copy the contents over.
		 */
		ibuf->ops->get(ipipe, ibuf);

		obuf = opipe->bufs + nbuf;
		*obuf = *ibuf;

		/*
		 * Don't inherit the gift flag, we need to
		 * prevent multiple steals of this page.
		 */
		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;

		if (obuf->len > len)
			obuf->len = len;

		opipe->nrbufs++;
		ret += obuf->len;
		len -= obuf->len;
		i++;
	} while (len);

	/*
	 * return EAGAIN if we have the potential of some data in the
	 * future, otherwise just return 0
	 */
	if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
		ret = -EAGAIN;

	inode_double_unlock(ipipe->inode, opipe->inode);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0) {
		smp_mb();
		if (waitqueue_active(&opipe->wait))
			wake_up_interruptible(&opipe->wait);
		kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
	}

	return ret;
}

/*
 * This is a tee(1) implementation that works on pipes. It doesn't copy
 * any data, it simply references the 'in' pages on the 'out' pipe.
 * The 'flags' used are the SPLICE_F_* variants, currently the only
 * applicable one is SPLICE_F_NONBLOCK.
 */
static long do_tee(struct file *in, struct file *out, size_t len,
		   unsigned int flags)
{
	struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
	struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
	int ret = -EINVAL;

	/*
	 * Duplicate the contents of ipipe to opipe without actually
	 * copying the data.
	 */
	if (ipipe && opipe && ipipe != opipe) {
		/*
		 * Keep going, unless we encounter an error. The ipipe/opipe
		 * ordering doesn't really matter.
		 */
		ret = link_ipipe_prep(ipipe, flags);
		if (!ret) {
			ret = link_opipe_prep(opipe, flags);
			if (!ret)
				ret = link_pipe(ipipe, opipe, len, flags);
		}
	}

	return ret;
}

asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
{
	struct file *in;
	int error, fput_in;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fget_light(fdin, &fput_in);
	if (in) {
		if (in->f_mode & FMODE_READ) {
			int fput_out;
			struct file *out = fget_light(fdout, &fput_out);

			if (out) {
				if (out->f_mode & FMODE_WRITE)
					error = do_tee(in, out, len, flags);
				fput_light(out, fput_out);
			}
		}
		fput_light(in, fput_in);
	}

	return error;
}
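/*
 * The userspace view of tee(2), as an illustrative sketch: duplicate
 * data from one pipe to another without consuming it, then drain the
 * input with splice(2). 'log_fd' is a hypothetical log file; stdin and
 * stdout are assumed to be pipes:
 *
 *	ssize_t n = tee(STDIN_FILENO, STDOUT_FILENO, 65536, 0);
 *	if (n > 0)
 *		splice(STDIN_FILENO, NULL, log_fd, NULL, n, 0);
 *
 * This is the pattern a tee(1)-like utility would use to log a
 * pipeline's data while passing it through unmodified.
 */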