1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * "splice": joining two ropes together by interweaving their strands. 4 * 5 * This is the "extended pipe" functionality, where a pipe is used as 6 * an arbitrary in-memory buffer. Think of a pipe as a small kernel 7 * buffer that you can use to transfer data from one end to the other. 8 * 9 * The traditional unix read/write is extended with a "splice()" operation 10 * that transfers data buffers to or from a pipe buffer. 11 * 12 * Named by Larry McVoy, original implementation from Linus, extended by 13 * Jens to support splicing to files, network, direct splicing, etc and 14 * fixing lots of bugs. 15 * 16 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk> 17 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org> 18 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> 19 * 20 */ 21 #include <linux/bvec.h> 22 #include <linux/fs.h> 23 #include <linux/file.h> 24 #include <linux/pagemap.h> 25 #include <linux/splice.h> 26 #include <linux/memcontrol.h> 27 #include <linux/mm_inline.h> 28 #include <linux/swap.h> 29 #include <linux/writeback.h> 30 #include <linux/export.h> 31 #include <linux/syscalls.h> 32 #include <linux/uio.h> 33 #include <linux/fsnotify.h> 34 #include <linux/security.h> 35 #include <linux/gfp.h> 36 #include <linux/socket.h> 37 #include <linux/sched/signal.h> 38 39 #include "internal.h" 40 41 /* 42 * Attempt to steal a page from a pipe buffer. This should perhaps go into 43 * a vm helper function, it's already simplified quite a bit by the 44 * addition of remove_mapping(). If success is returned, the caller may 45 * attempt to reuse this page for another destination. 
*/
static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
		struct pipe_buffer *buf)
{
	struct folio *folio = page_folio(buf->page);
	struct address_space *mapping;

	/* Note: the success path below returns with the folio still locked. */
	folio_lock(folio);

	mapping = folio_mapping(folio);
	if (mapping) {
		WARN_ON(!folio_test_uptodate(folio));

		/*
		 * At least for ext2 with nobh option, we need to wait on
		 * writeback completing on this folio, since we'll remove it
		 * from the pagecache.  Otherwise truncate won't wait on the
		 * folio, allowing the disk blocks to be reused by someone else
		 * before we actually wrote our data to them. fs corruption
		 * ensues.
		 */
		folio_wait_writeback(folio);

		/* Private data that cannot be released blocks the steal. */
		if (folio_has_private(folio) &&
		    !filemap_release_folio(folio, GFP_KERNEL))
			goto out_unlock;

		/*
		 * If we succeeded in removing the mapping, set LRU flag
		 * and return good.
		 */
		if (remove_mapping(mapping, folio)) {
			buf->flags |= PIPE_BUF_FLAG_LRU;
			return true;
		}
	}

	/*
	 * Raced with truncate or failed to remove folio from current
	 * address space, unlock and return failure.
	 */
out_unlock:
	folio_unlock(folio);
	return false;
}

/*
 * Release callback for page cache backed buffers: put the buffer's page
 * and clear PIPE_BUF_FLAG_LRU, which a successful steal may have set.
 */
static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
					struct pipe_buffer *buf)
{
	put_page(buf->page);
	buf->flags &= ~PIPE_BUF_FLAG_LRU;
}

/*
 * Check whether the contents of buf is OK to access. Since the content
 * is a page cache page, IO may be in flight.
 *
 * Returns 0 when the page is uptodate, -ENODATA when the page no longer
 * has a mapping, and -EIO when it is still not uptodate after the page
 * lock has been taken.
 */
static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
				       struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	int err;

	/* Fast path: an uptodate page needs no locking at all. */
	if (!PageUptodate(page)) {
		lock_page(page);

		/*
		 * Page got truncated/unhashed. This will cause a 0-byte
		 * splice, if this is the first page.
		 */
		if (!page->mapping) {
			err = -ENODATA;
			goto error;
		}

		/*
		 * Uh oh, read-error from disk.
		 */
		if (!PageUptodate(page)) {
			err = -EIO;
			goto error;
		}

		/*
		 * Page is ok after all, we are done.
		 */
		unlock_page(page);
	}

	return 0;
error:
	unlock_page(page);
	return err;
}

/* Buffer operations for pipe buffers that reference page cache pages. */
const struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.confirm	= page_cache_pipe_buf_confirm,
	.release	= page_cache_pipe_buf_release,
	.try_steal	= page_cache_pipe_buf_try_steal,
	.get		= generic_pipe_buf_get,
};

/*
 * Steal callback for buffers using user_page_pipe_buf_ops: only buffers
 * flagged PIPE_BUF_FLAG_GIFT may be stolen, anything else is refused.
 */
static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe,
		struct pipe_buffer *buf)
{
	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
		return false;

	/* The LRU flag is set before the generic helper decides the outcome. */
	buf->flags |= PIPE_BUF_FLAG_LRU;
	return generic_pipe_buf_try_steal(pipe, buf);
}

/*
 * Buffer operations used for the buffers built by iter_to_pipe(); there
 * is no ->confirm callback in this table.
 */
static const struct pipe_buf_operations user_page_pipe_buf_ops = {
	.release	= page_cache_pipe_buf_release,
	.try_steal	= user_page_pipe_buf_try_steal,
	.get		= generic_pipe_buf_get,
};

/*
 * Wake anything sleeping on the pipe's read wait queue and send
 * SIGIO/POLL_IN to the registered async readers.
 */
static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
{
	/*
	 * NOTE(review): the full barrier precedes a lockless
	 * waitqueue_active() test; presumably it pairs with a barrier on
	 * the sleeper's side so a concurrent waiter is not missed - confirm.
	 */
	smp_mb();
	if (waitqueue_active(&pipe->rd_wait))
		wake_up_interruptible(&pipe->rd_wait);
	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}

/**
 * splice_to_pipe - fill passed data into a pipe
 * @pipe:	pipe to fill
 * @spd:	data to fill
 *
 * Description:
 *    @spd contains a map of pages and len/offset tuples, along with
 *    the struct pipe_buf_operations associated with these pages. This
 *    function will link that data to the pipe.
181 * 182 */ 183 ssize_t splice_to_pipe(struct pipe_inode_info *pipe, 184 struct splice_pipe_desc *spd) 185 { 186 unsigned int spd_pages = spd->nr_pages; 187 unsigned int tail = pipe->tail; 188 unsigned int head = pipe->head; 189 unsigned int mask = pipe->ring_size - 1; 190 int ret = 0, page_nr = 0; 191 192 if (!spd_pages) 193 return 0; 194 195 if (unlikely(!pipe->readers)) { 196 send_sig(SIGPIPE, current, 0); 197 ret = -EPIPE; 198 goto out; 199 } 200 201 while (!pipe_full(head, tail, pipe->max_usage)) { 202 struct pipe_buffer *buf = &pipe->bufs[head & mask]; 203 204 buf->page = spd->pages[page_nr]; 205 buf->offset = spd->partial[page_nr].offset; 206 buf->len = spd->partial[page_nr].len; 207 buf->private = spd->partial[page_nr].private; 208 buf->ops = spd->ops; 209 buf->flags = 0; 210 211 head++; 212 pipe->head = head; 213 page_nr++; 214 ret += buf->len; 215 216 if (!--spd->nr_pages) 217 break; 218 } 219 220 if (!ret) 221 ret = -EAGAIN; 222 223 out: 224 while (page_nr < spd_pages) 225 spd->spd_release(spd, page_nr++); 226 227 return ret; 228 } 229 EXPORT_SYMBOL_GPL(splice_to_pipe); 230 231 ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) 232 { 233 unsigned int head = pipe->head; 234 unsigned int tail = pipe->tail; 235 unsigned int mask = pipe->ring_size - 1; 236 int ret; 237 238 if (unlikely(!pipe->readers)) { 239 send_sig(SIGPIPE, current, 0); 240 ret = -EPIPE; 241 } else if (pipe_full(head, tail, pipe->max_usage)) { 242 ret = -EAGAIN; 243 } else { 244 pipe->bufs[head & mask] = *buf; 245 pipe->head = head + 1; 246 return buf->len; 247 } 248 pipe_buf_release(pipe, buf); 249 return ret; 250 } 251 EXPORT_SYMBOL(add_to_pipe); 252 253 /* 254 * Check if we need to grow the arrays holding pages and partial page 255 * descriptions. 
*/
int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
{
	unsigned int max_usage = READ_ONCE(pipe->max_usage);

	spd->nr_pages_max = max_usage;
	/* Up to PIPE_DEF_BUFFERS entries: keep the arrays @spd already has. */
	if (max_usage <= PIPE_DEF_BUFFERS)
		return 0;

	spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL);
	spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page),
				     GFP_KERNEL);

	if (spd->pages && spd->partial)
		return 0;

	/* At least one allocation failed; free whichever one succeeded. */
	kfree(spd->pages);
	kfree(spd->partial);
	return -ENOMEM;
}

/*
 * Counterpart of splice_grow_spd(): free the two arrays, but only when
 * nr_pages_max shows that splice_grow_spd() went on to allocate them.
 */
void splice_shrink_spd(struct splice_pipe_desc *spd)
{
	if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
		return;

	kfree(spd->pages);
	kfree(spd->partial);
}

/*
 * Splice data from an O_DIRECT file into pages and then add them to the output
 * pipe.
 *
 * Returns what ->read_iter() returned, except that -EFAULT is turned into
 * -EAGAIN; -ENOMEM is returned if the bookkeeping array or the pages
 * cannot be allocated.  *@ppos is only advanced when data was read.
 */
ssize_t direct_splice_read(struct file *in, loff_t *ppos,
			   struct pipe_inode_info *pipe,
			   size_t len, unsigned int flags)
{
	struct iov_iter to;
	struct bio_vec *bv;
	struct kiocb kiocb;
	struct page **pages;
	ssize_t ret;
	size_t used, npages, chunk, remain, reclaim;
	int i;

	/* Work out how much data we can actually add into the pipe */
	used = pipe_occupancy(pipe->head, pipe->tail);
	npages = max_t(ssize_t, pipe->max_usage - used, 0);
	len = min_t(size_t, len, npages * PAGE_SIZE);
	npages = DIV_ROUND_UP(len, PAGE_SIZE);

	/*
	 * One allocation holds both arrays: npages bio_vecs followed
	 * directly by npages page pointers.
	 */
	bv = kzalloc(array_size(npages, sizeof(bv[0])) +
		     array_size(npages, sizeof(struct page *)), GFP_KERNEL);
	if (!bv)
		return -ENOMEM;

	pages = (void *)(bv + npages);
	/*
	 * From here on npages is whatever the bulk allocator returned.
	 * NOTE(review): presumably that is the number of pages actually
	 * obtained, which may be fewer than requested - confirm.
	 */
	npages = alloc_pages_bulk_array(GFP_USER, npages, pages);
	if (!npages) {
		kfree(bv);
		return -ENOMEM;
	}

	/* Shrink the request to what the pages we did get can hold. */
	remain = len = min_t(size_t, len, npages * PAGE_SIZE);

	/* Describe each page as one bio_vec; only the last may be partial. */
	for (i = 0; i < npages; i++) {
		chunk = min_t(size_t, PAGE_SIZE, remain);
		bv[i].bv_page = pages[i];
		bv[i].bv_offset = 0;
		bv[i].bv_len = chunk;
		remain -= chunk;
	}

	/* Do the I/O */
	iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
	init_sync_kiocb(&kiocb, in);
	kiocb.ki_pos = *ppos;
	ret = call_read_iter(in, &kiocb, &to);

	/*
	 * reclaim starts as the byte capacity of all pages; on a successful
	 * read the bytes actually filled are subtracted, so after the
	 * division below it counts the pages that received no data at all.
	 */
	reclaim = npages * PAGE_SIZE;
	remain = 0;
	if (ret > 0) {
		reclaim -= ret;
		remain = ret;
		*ppos = kiocb.ki_pos;
		file_accessed(in);
	} else if (ret < 0) {
		/*
		 * callers of ->splice_read() expect -EAGAIN on
		 * "can't put anything in there", rather than -EFAULT.
		 */
		if (ret == -EFAULT)
			ret = -EAGAIN;
	}

	/* Free any pages that didn't get touched at all. */
	reclaim /= PAGE_SIZE;
	if (reclaim) {
		npages -= reclaim;
		release_pages(pages + npages, reclaim);
	}

	/*
	 * Push the remaining pages into the pipe.  The pipe is not
	 * re-checked for space here; npages was bounded by the free slots
	 * computed at the top of the function.
	 */
	for (i = 0; i < npages; i++) {
		struct pipe_buffer *buf = pipe_head_buf(pipe);

		chunk = min_t(size_t, remain, PAGE_SIZE);
		*buf = (struct pipe_buffer) {
			.ops	= &default_pipe_buf_ops,
			.page	= bv[i].bv_page,
			.offset	= 0,
			.len	= chunk,
		};
		pipe->head++;
		remain -= chunk;
	}

	kfree(bv);
	return ret;
}
EXPORT_SYMBOL(direct_splice_read);

/**
 * generic_file_splice_read - splice data from file to a pipe
 * @in:		file to splice from
 * @ppos:	position in @in
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will read pages from given file and fill them into a pipe. Can be
 *    used as long as it has more or less sane ->read_iter().
390 * 391 */ 392 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, 393 struct pipe_inode_info *pipe, size_t len, 394 unsigned int flags) 395 { 396 struct iov_iter to; 397 struct kiocb kiocb; 398 int ret; 399 400 iov_iter_pipe(&to, ITER_DEST, pipe, len); 401 init_sync_kiocb(&kiocb, in); 402 kiocb.ki_pos = *ppos; 403 ret = call_read_iter(in, &kiocb, &to); 404 if (ret > 0) { 405 *ppos = kiocb.ki_pos; 406 file_accessed(in); 407 } else if (ret < 0) { 408 /* free what was emitted */ 409 pipe_discard_from(pipe, to.start_head); 410 /* 411 * callers of ->splice_read() expect -EAGAIN on 412 * "can't put anything in there", rather than -EFAULT. 413 */ 414 if (ret == -EFAULT) 415 ret = -EAGAIN; 416 } 417 418 return ret; 419 } 420 EXPORT_SYMBOL(generic_file_splice_read); 421 422 const struct pipe_buf_operations default_pipe_buf_ops = { 423 .release = generic_pipe_buf_release, 424 .try_steal = generic_pipe_buf_try_steal, 425 .get = generic_pipe_buf_get, 426 }; 427 428 /* Pipe buffer operations for a socket and similar. */ 429 const struct pipe_buf_operations nosteal_pipe_buf_ops = { 430 .release = generic_pipe_buf_release, 431 .get = generic_pipe_buf_get, 432 }; 433 EXPORT_SYMBOL(nosteal_pipe_buf_ops); 434 435 /* 436 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' 437 * using sendpage(). Return the number of bytes sent. 438 */ 439 static int pipe_to_sendpage(struct pipe_inode_info *pipe, 440 struct pipe_buffer *buf, struct splice_desc *sd) 441 { 442 struct file *file = sd->u.file; 443 loff_t pos = sd->pos; 444 int more; 445 446 if (!likely(file->f_op->sendpage)) 447 return -EINVAL; 448 449 more = (sd->flags & SPLICE_F_MORE) ? 
MSG_MORE : 0; 450 451 if (sd->len < sd->total_len && 452 pipe_occupancy(pipe->head, pipe->tail) > 1) 453 more |= MSG_SENDPAGE_NOTLAST; 454 455 return file->f_op->sendpage(file, buf->page, buf->offset, 456 sd->len, &pos, more); 457 } 458 459 static void wakeup_pipe_writers(struct pipe_inode_info *pipe) 460 { 461 smp_mb(); 462 if (waitqueue_active(&pipe->wr_wait)) 463 wake_up_interruptible(&pipe->wr_wait); 464 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 465 } 466 467 /** 468 * splice_from_pipe_feed - feed available data from a pipe to a file 469 * @pipe: pipe to splice from 470 * @sd: information to @actor 471 * @actor: handler that splices the data 472 * 473 * Description: 474 * This function loops over the pipe and calls @actor to do the 475 * actual moving of a single struct pipe_buffer to the desired 476 * destination. It returns when there's no more buffers left in 477 * the pipe or if the requested number of bytes (@sd->total_len) 478 * have been copied. It returns a positive number (one) if the 479 * pipe needs to be filled with more data, zero if the required 480 * number of bytes have been copied and -errno on error. 481 * 482 * This, together with splice_from_pipe_{begin,end,next}, may be 483 * used to implement the functionality of __splice_from_pipe() when 484 * locking is required around copying the pipe buffers to the 485 * destination. 
486 */ 487 static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, 488 splice_actor *actor) 489 { 490 unsigned int head = pipe->head; 491 unsigned int tail = pipe->tail; 492 unsigned int mask = pipe->ring_size - 1; 493 int ret; 494 495 while (!pipe_empty(head, tail)) { 496 struct pipe_buffer *buf = &pipe->bufs[tail & mask]; 497 498 sd->len = buf->len; 499 if (sd->len > sd->total_len) 500 sd->len = sd->total_len; 501 502 ret = pipe_buf_confirm(pipe, buf); 503 if (unlikely(ret)) { 504 if (ret == -ENODATA) 505 ret = 0; 506 return ret; 507 } 508 509 ret = actor(pipe, buf, sd); 510 if (ret <= 0) 511 return ret; 512 513 buf->offset += ret; 514 buf->len -= ret; 515 516 sd->num_spliced += ret; 517 sd->len -= ret; 518 sd->pos += ret; 519 sd->total_len -= ret; 520 521 if (!buf->len) { 522 pipe_buf_release(pipe, buf); 523 tail++; 524 pipe->tail = tail; 525 if (pipe->files) 526 sd->need_wakeup = true; 527 } 528 529 if (!sd->total_len) 530 return 0; 531 } 532 533 return 1; 534 } 535 536 /* We know we have a pipe buffer, but maybe it's empty? */ 537 static inline bool eat_empty_buffer(struct pipe_inode_info *pipe) 538 { 539 unsigned int tail = pipe->tail; 540 unsigned int mask = pipe->ring_size - 1; 541 struct pipe_buffer *buf = &pipe->bufs[tail & mask]; 542 543 if (unlikely(!buf->len)) { 544 pipe_buf_release(pipe, buf); 545 pipe->tail = tail+1; 546 return true; 547 } 548 549 return false; 550 } 551 552 /** 553 * splice_from_pipe_next - wait for some data to splice from 554 * @pipe: pipe to splice from 555 * @sd: information about the splice operation 556 * 557 * Description: 558 * This function will wait for some data and return a positive 559 * value (one) if pipe buffers are available. It will return zero 560 * or -errno if no more data needs to be spliced. 
561 */ 562 static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) 563 { 564 /* 565 * Check for signal early to make process killable when there are 566 * always buffers available 567 */ 568 if (signal_pending(current)) 569 return -ERESTARTSYS; 570 571 repeat: 572 while (pipe_empty(pipe->head, pipe->tail)) { 573 if (!pipe->writers) 574 return 0; 575 576 if (sd->num_spliced) 577 return 0; 578 579 if (sd->flags & SPLICE_F_NONBLOCK) 580 return -EAGAIN; 581 582 if (signal_pending(current)) 583 return -ERESTARTSYS; 584 585 if (sd->need_wakeup) { 586 wakeup_pipe_writers(pipe); 587 sd->need_wakeup = false; 588 } 589 590 pipe_wait_readable(pipe); 591 } 592 593 if (eat_empty_buffer(pipe)) 594 goto repeat; 595 596 return 1; 597 } 598 599 /** 600 * splice_from_pipe_begin - start splicing from pipe 601 * @sd: information about the splice operation 602 * 603 * Description: 604 * This function should be called before a loop containing 605 * splice_from_pipe_next() and splice_from_pipe_feed() to 606 * initialize the necessary fields of @sd. 607 */ 608 static void splice_from_pipe_begin(struct splice_desc *sd) 609 { 610 sd->num_spliced = 0; 611 sd->need_wakeup = false; 612 } 613 614 /** 615 * splice_from_pipe_end - finish splicing from pipe 616 * @pipe: pipe to splice from 617 * @sd: information about the splice operation 618 * 619 * Description: 620 * This function will wake up pipe writers if necessary. It should 621 * be called after a loop containing splice_from_pipe_next() and 622 * splice_from_pipe_feed(). 
623 */ 624 static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) 625 { 626 if (sd->need_wakeup) 627 wakeup_pipe_writers(pipe); 628 } 629 630 /** 631 * __splice_from_pipe - splice data from a pipe to given actor 632 * @pipe: pipe to splice from 633 * @sd: information to @actor 634 * @actor: handler that splices the data 635 * 636 * Description: 637 * This function does little more than loop over the pipe and call 638 * @actor to do the actual moving of a single struct pipe_buffer to 639 * the desired destination. See pipe_to_file, pipe_to_sendpage, or 640 * pipe_to_user. 641 * 642 */ 643 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, 644 splice_actor *actor) 645 { 646 int ret; 647 648 splice_from_pipe_begin(sd); 649 do { 650 cond_resched(); 651 ret = splice_from_pipe_next(pipe, sd); 652 if (ret > 0) 653 ret = splice_from_pipe_feed(pipe, sd, actor); 654 } while (ret > 0); 655 splice_from_pipe_end(pipe, sd); 656 657 return sd->num_spliced ? sd->num_spliced : ret; 658 } 659 EXPORT_SYMBOL(__splice_from_pipe); 660 661 /** 662 * splice_from_pipe - splice data from a pipe to a file 663 * @pipe: pipe to splice from 664 * @out: file to splice to 665 * @ppos: position in @out 666 * @len: how many bytes to splice 667 * @flags: splice modifier flags 668 * @actor: handler that splices the data 669 * 670 * Description: 671 * See __splice_from_pipe. This function locks the pipe inode, 672 * otherwise it's identical to __splice_from_pipe(). 
673 * 674 */ 675 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, 676 loff_t *ppos, size_t len, unsigned int flags, 677 splice_actor *actor) 678 { 679 ssize_t ret; 680 struct splice_desc sd = { 681 .total_len = len, 682 .flags = flags, 683 .pos = *ppos, 684 .u.file = out, 685 }; 686 687 pipe_lock(pipe); 688 ret = __splice_from_pipe(pipe, &sd, actor); 689 pipe_unlock(pipe); 690 691 return ret; 692 } 693 694 /** 695 * iter_file_splice_write - splice data from a pipe to a file 696 * @pipe: pipe info 697 * @out: file to write to 698 * @ppos: position in @out 699 * @len: number of bytes to splice 700 * @flags: splice modifier flags 701 * 702 * Description: 703 * Will either move or copy pages (determined by @flags options) from 704 * the given pipe inode to the given file. 705 * This one is ->write_iter-based. 706 * 707 */ 708 ssize_t 709 iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, 710 loff_t *ppos, size_t len, unsigned int flags) 711 { 712 struct splice_desc sd = { 713 .total_len = len, 714 .flags = flags, 715 .pos = *ppos, 716 .u.file = out, 717 }; 718 int nbufs = pipe->max_usage; 719 struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec), 720 GFP_KERNEL); 721 ssize_t ret; 722 723 if (unlikely(!array)) 724 return -ENOMEM; 725 726 pipe_lock(pipe); 727 728 splice_from_pipe_begin(&sd); 729 while (sd.total_len) { 730 struct iov_iter from; 731 unsigned int head, tail, mask; 732 size_t left; 733 int n; 734 735 ret = splice_from_pipe_next(pipe, &sd); 736 if (ret <= 0) 737 break; 738 739 if (unlikely(nbufs < pipe->max_usage)) { 740 kfree(array); 741 nbufs = pipe->max_usage; 742 array = kcalloc(nbufs, sizeof(struct bio_vec), 743 GFP_KERNEL); 744 if (!array) { 745 ret = -ENOMEM; 746 break; 747 } 748 } 749 750 head = pipe->head; 751 tail = pipe->tail; 752 mask = pipe->ring_size - 1; 753 754 /* build the vector */ 755 left = sd.total_len; 756 for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) { 757 struct 
pipe_buffer *buf = &pipe->bufs[tail & mask]; 758 size_t this_len = buf->len; 759 760 /* zero-length bvecs are not supported, skip them */ 761 if (!this_len) 762 continue; 763 this_len = min(this_len, left); 764 765 ret = pipe_buf_confirm(pipe, buf); 766 if (unlikely(ret)) { 767 if (ret == -ENODATA) 768 ret = 0; 769 goto done; 770 } 771 772 bvec_set_page(&array[n], buf->page, this_len, 773 buf->offset); 774 left -= this_len; 775 n++; 776 } 777 778 iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left); 779 ret = vfs_iter_write(out, &from, &sd.pos, 0); 780 if (ret <= 0) 781 break; 782 783 sd.num_spliced += ret; 784 sd.total_len -= ret; 785 *ppos = sd.pos; 786 787 /* dismiss the fully eaten buffers, adjust the partial one */ 788 tail = pipe->tail; 789 while (ret) { 790 struct pipe_buffer *buf = &pipe->bufs[tail & mask]; 791 if (ret >= buf->len) { 792 ret -= buf->len; 793 buf->len = 0; 794 pipe_buf_release(pipe, buf); 795 tail++; 796 pipe->tail = tail; 797 if (pipe->files) 798 sd.need_wakeup = true; 799 } else { 800 buf->offset += ret; 801 buf->len -= ret; 802 ret = 0; 803 } 804 } 805 } 806 done: 807 kfree(array); 808 splice_from_pipe_end(pipe, &sd); 809 810 pipe_unlock(pipe); 811 812 if (sd.num_spliced) 813 ret = sd.num_spliced; 814 815 return ret; 816 } 817 818 EXPORT_SYMBOL(iter_file_splice_write); 819 820 /** 821 * generic_splice_sendpage - splice data from a pipe to a socket 822 * @pipe: pipe to splice from 823 * @out: socket to write to 824 * @ppos: position in @out 825 * @len: number of bytes to splice 826 * @flags: splice modifier flags 827 * 828 * Description: 829 * Will send @len bytes from the pipe to a network socket. No data copying 830 * is involved. 
*/
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
				loff_t *ppos, size_t len, unsigned int flags)
{
	/* pipe_to_sendpage() is the actor that handles each buffer. */
	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_splice_sendpage);

/*
 * Emit a rate-limited debug message naming the operation, the file and
 * the calling task, then return -EINVAL for the caller to pass on.
 */
static int warn_unsupported(struct file *file, const char *op)
{
	pr_debug_ratelimited(
		"splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
		op, file, current->pid, current->comm);
	return -EINVAL;
}

/*
 * Attempt to initiate a splice from pipe to file.
 *
 * Returns -EINVAL (via warn_unsupported()) if @out has no ->splice_write,
 * otherwise whatever that method returns.
 */
static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
			   loff_t *ppos, size_t len, unsigned int flags)
{
	if (unlikely(!out->f_op->splice_write))
		return warn_unsupported(out, "write");
	return out->f_op->splice_write(pipe, out, ppos, len, flags);
}

/*
 * Attempt to initiate a splice from a file to a pipe.
 *
 * Returns -EBADF if @in is not open for reading, the rw_verify_area()
 * error if that check fails, -EINVAL (via warn_unsupported()) if @in has
 * no ->splice_read, otherwise whatever that method returns.
 */
static long do_splice_to(struct file *in, loff_t *ppos,
			 struct pipe_inode_info *pipe, size_t len,
			 unsigned int flags)
{
	unsigned int p_space;
	int ret;

	if (unlikely(!(in->f_mode & FMODE_READ)))
		return -EBADF;

	/* Don't try to read more than the pipe has space for. */
	p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail);
	len = min_t(size_t, len, p_space << PAGE_SHIFT);

	ret = rw_verify_area(READ, in, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	if (unlikely(len > MAX_RW_COUNT))
		len = MAX_RW_COUNT;

	if (unlikely(!in->f_op->splice_read))
		return warn_unsupported(in, "read");
	return in->f_op->splice_read(in, ppos, pipe, len, flags);
}

/**
 * splice_direct_to_actor - splices data directly between two non-pipes
 * @in:		file to splice from
 * @sd:		actor information on where to splice to
 * @actor:	handles the data splicing
 *
 * Description:
 *    This is a special case helper to splice directly between two
 *    points, without requiring an explicit pipe. Internally an allocated
 *    pipe is cached in the process, and reused during the lifetime of
 *    that process.
 *
 *    Returns the number of bytes moved if any were, otherwise zero or
 *    the first error seen.  @sd->pos is left at the input position of
 *    the last byte actually consumed by @actor.
 *
 */
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
			       splice_direct_actor *actor)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	size_t len;
	int i, flags, more;

	/*
	 * We require the input to be seekable, as we don't want to randomly
	 * drop data for eg socket -> socket splicing. Use the piped splicing
	 * for that!
	 */
	if (unlikely(!(in->f_mode & FMODE_LSEEK)))
		return -EINVAL;

	/*
	 * neither in nor out is a pipe, setup an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	bytes = 0;
	len = sd->total_len;
	/*
	 * 'flags' keeps the caller's original bits (SPLICE_F_NONBLOCK
	 * included) and is what the input side is called with.
	 */
	flags = sd->flags;

	/*
	 * Don't block on output, we have to drain the direct pipe.
	 */
	sd->flags &= ~SPLICE_F_NONBLOCK;
	more = sd->flags & SPLICE_F_MORE;

	WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));

	while (len) {
		size_t read_len;
		loff_t pos = sd->pos, prev_pos = pos;

		ret = do_splice_to(in, &pos, pipe, len, flags);
		if (unlikely(ret <= 0))
			goto out_release;

		/* The actor is asked to move exactly what was just read. */
		read_len = ret;
		sd->total_len = read_len;

		/*
		 * If more data is pending, set SPLICE_F_MORE
		 * If this is the last data and SPLICE_F_MORE was not set
		 * initially, clears it.
		 */
		if (read_len < len)
			sd->flags |= SPLICE_F_MORE;
		else if (!more)
			sd->flags &= ~SPLICE_F_MORE;
		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = actor(pipe, sd);
		if (unlikely(ret <= 0)) {
			/* Nothing went out: rewind the input position. */
			sd->pos = prev_pos;
			goto out_release;
		}

		bytes += ret;
		len -= ret;
		sd->pos = pos;

		/* Short write: only count the input that really left. */
		if (ret < read_len) {
			sd->pos = prev_pos + ret;
			goto out_release;
		}
	}

done:
	/* Leave the cached pipe empty for the next user. */
	pipe->tail = pipe->head = 0;
	file_accessed(in);
	return bytes;

out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < pipe->ring_size; i++) {
		struct pipe_buffer *buf = &pipe->bufs[i];

		if (buf->ops)
			pipe_buf_release(pipe, buf);
	}

	/* Report the error only if nothing at all was transferred. */
	if (!bytes)
		bytes = ret;

	goto done;
}
EXPORT_SYMBOL(splice_direct_to_actor);

/*
 * Actor for do_splice_direct(): splice what is in the internal pipe to
 * the output file in 'sd->u.file' at the offset 'sd->opos' points to.
 */
static int direct_splice_actor(struct pipe_inode_info *pipe,
			       struct splice_desc *sd)
{
	struct file *file = sd->u.file;

	return do_splice_from(pipe, file, sd->opos, sd->total_len,
			      sd->flags);
}

/**
 * do_splice_direct
- splices data directly between two files
 * @in:		file to splice from
 * @ppos:	input file offset
 * @out:	file to splice to
 * @opos:	output file offset
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    For use by do_sendfile(). splice can easily emulate sendfile, but
 *    doing it in the application would incur an extra system call
 *    (splice in + splice out, as compared to just sendfile()). So this helper
 *    can splice directly through a process-private pipe.
 *
 *    Returns -EBADF if @out is not open for writing and -EINVAL if it
 *    was opened with O_APPEND; otherwise the rw_verify_area() error or
 *    the result of splice_direct_to_actor().  *@ppos is only updated
 *    when a positive byte count is returned.
 *
 */
long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
		      loff_t *opos, size_t len, unsigned int flags)
{
	struct splice_desc sd = {
		.len		= len,
		.total_len	= len,
		.flags		= flags,
		.pos		= *ppos,
		.u.file		= out,
		.opos		= opos,
	};
	long ret;

	if (unlikely(!(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	if (unlikely(out->f_flags & O_APPEND))
		return -EINVAL;

	ret = rw_verify_area(WRITE, out, opos, len);
	if (unlikely(ret < 0))
		return ret;

	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
	if (ret > 0)
		*ppos = sd.pos;

	return ret;
}
EXPORT_SYMBOL(do_splice_direct);

/*
 * Loop until the pipe has room for at least one more buffer.
 *
 * Returns 0 once there is room, -EPIPE (after raising SIGPIPE) if the
 * pipe has no readers, -EAGAIN if SPLICE_F_NONBLOCK is set and the pipe
 * is full, and -ERESTARTSYS if a signal is pending.
 */
static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
{
	for (;;) {
		if (unlikely(!pipe->readers)) {
			send_sig(SIGPIPE, current, 0);
			return -EPIPE;
		}
		if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
			return 0;
		if (flags & SPLICE_F_NONBLOCK)
			return -EAGAIN;
		if (signal_pending(current))
			return -ERESTARTSYS;
		pipe_wait_writable(pipe);
	}
}

static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
			       struct pipe_inode_info *opipe,
			       size_t len, unsigned int flags);

/*
 * Splice from @in at *@offset into @opipe.  The pipe lock is held while
 * waiting for space and while reading; readers are woken, after the
 * lock has been dropped, only if some data was added.
 */
long splice_file_to_pipe(struct file *in,
			 struct pipe_inode_info *opipe,
			 loff_t *offset,
			 size_t len, unsigned int flags)
{
	long ret;

	pipe_lock(opipe);
	ret = wait_for_space(opipe, flags);
	if (!ret)
		ret = do_splice_to(in, offset, opipe, len, flags);
	pipe_unlock(opipe);
	if (ret > 0)
		wakeup_pipe_readers(opipe);
	return ret;
}

/*
 * Determine where to splice to/from.
 *
 * Dispatches on which of @in and @out is a pipe: pipe to pipe, pipe to
 * file, or file to pipe.  If neither is a pipe, -EINVAL is returned.
 * An offset pointer supplied for a pipe end yields -ESPIPE.
 */
long do_splice(struct file *in, loff_t *off_in, struct file *out,
	       loff_t *off_out, size_t len, unsigned int flags)
{
	struct pipe_inode_info *ipipe;
	struct pipe_inode_info *opipe;
	loff_t offset;
	long ret;

	if (unlikely(!(in->f_mode & FMODE_READ) ||
		     !(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	ipipe = get_pipe_info(in, true);
	opipe = get_pipe_info(out, true);

	if (ipipe && opipe) {
		if (off_in || off_out)
			return -ESPIPE;

		/* Splicing to self would be fun, but... */
		if (ipipe == opipe)
			return -EINVAL;

		/* O_NONBLOCK on either end makes the whole splice nonblocking. */
		if ((in->f_flags | out->f_flags) & O_NONBLOCK)
			flags |= SPLICE_F_NONBLOCK;

		return splice_pipe_to_pipe(ipipe, opipe, len, flags);
	}

	if (ipipe) {
		if (off_in)
			return -ESPIPE;
		/* An explicit offset requires FMODE_PWRITE on the output. */
		if (off_out) {
			if (!(out->f_mode & FMODE_PWRITE))
				return -EINVAL;
			offset = *off_out;
		} else {
			offset = out->f_pos;
		}

		if (unlikely(out->f_flags & O_APPEND))
			return -EINVAL;

		ret = rw_verify_area(WRITE, out, &offset, len);
		if (unlikely(ret < 0))
			return ret;

		if (in->f_flags & O_NONBLOCK)
			flags |= SPLICE_F_NONBLOCK;

		file_start_write(out);
		ret = do_splice_from(ipipe, out, &offset, len, flags);
		file_end_write(out);

		if (ret > 0)
			fsnotify_modify(out);

		/* Store the offset back whatever the outcome of the splice. */
		if (!off_out)
			out->f_pos = offset;
		else
			*off_out = offset;

		return ret;
	}

	if (opipe) {
		if (off_out)
			return -ESPIPE;
		/* An explicit offset requires FMODE_PREAD on the input. */
		if (off_in) {
			if (!(in->f_mode & FMODE_PREAD))
				return -EINVAL;
			offset = *off_in;
		} else {
			offset = in->f_pos;
		}

		if (out->f_flags & O_NONBLOCK)
			flags |= SPLICE_F_NONBLOCK;

		ret = splice_file_to_pipe(in, opipe, &offset, len, flags);

		if (ret > 0)
			fsnotify_access(in);

		/* Store the offset back whatever the outcome of the splice. */
		if (!off_in)
			in->f_pos = offset;
		else
			*off_in = offset;

		return ret;
	}

	return -EINVAL;
}

/*
 * Wrapper around do_splice() for offsets that live in user memory: copy
 * the offset in, run the splice, and copy the updated offset back out
 * unless do_splice() failed.
 */
static long __do_splice(struct file *in, loff_t __user *off_in,
			struct file *out, loff_t __user *off_out,
			size_t len, unsigned int flags)
{
	struct pipe_inode_info *ipipe;
	struct pipe_inode_info *opipe;
	loff_t offset, *__off_in = NULL, *__off_out = NULL;
	long ret;

	ipipe = get_pipe_info(in, true);
	opipe = get_pipe_info(out, true);

	if (ipipe && off_in)
		return -ESPIPE;
	if (opipe && off_out)
		return -ESPIPE;

	/*
	 * Both directions share the one on-stack 'offset'.  If both user
	 * pointers are given, neither end is a pipe (see the checks above)
	 * and do_splice() then fails with -EINVAL, so the value is never
	 * used for two different files.
	 */
	if (off_out) {
		if (copy_from_user(&offset, off_out, sizeof(loff_t)))
			return -EFAULT;
		__off_out = &offset;
	}
	if (off_in) {
		if (copy_from_user(&offset, off_in, sizeof(loff_t)))
			return -EFAULT;
		__off_in = &offset;
	}

	ret = do_splice(in, __off_in, out, __off_out, len, flags);
	if (ret < 0)
		return ret;

	if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t)))
		return -EFAULT;
	if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t)))
		return -EFAULT;

	return ret;
}

/*
 * Take the pages behind @from, up to 16 at a time, and add each one to
 * @pipe as a buffer using user_page_pipe_buf_ops with @flags as the
 * buffer flags.
 *
 * Returns the number of bytes added if any were, otherwise zero or the
 * first error.
 */
static int iter_to_pipe(struct iov_iter *from,
			struct pipe_inode_info *pipe,
			unsigned flags)
{
	struct pipe_buffer buf = {
		.ops = &user_page_pipe_buf_ops,
		.flags = flags
	};
	size_t total = 0;
	int ret = 0;

	while (iov_iter_count(from)) {
		struct page *pages[16];
		ssize_t left;
		size_t start;
		int i, n;

		/*
		 * 'left' is the byte count covered by this batch and 'start'
		 * the offset into its first page.
		 */
		left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start);
		if (left <= 0) {
			ret = left;
			break;
		}

		n = DIV_ROUND_UP(left + start, PAGE_SIZE);
		for (i = 0; i < n; i++) {
			int size = min_t(int, left, PAGE_SIZE - start);

			buf.page = pages[i];
			buf.offset = start;
			buf.len = size;
			ret = add_to_pipe(pipe, &buf);
			if (unlikely(ret < 0)) {
				/* Give back the bytes that were not queued. */
				iov_iter_revert(from, left);
				// this one got dropped by add_to_pipe()
				while (++i < n)
					put_page(pages[i]);
				goto out;
			}
			total += ret;
			left -= size;
			/* Only the first page of a batch starts mid-page. */
			start = 0;
		}
	}
out:
	return total ? total : ret;
}

/*
 * Splice actor that copies 'sd->len' bytes of @buf into the iov_iter in
 * 'sd->u.data'.  A short copy is reported as -EFAULT.
 */
static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
	return n == sd->len ? n : -EFAULT;
}

/*
 * For lack of a better implementation, implement vmsplice() to userspace
 * as a simple copy of the pipes pages to the user iov.
 *
 * Returns -EBADF if @file is not a pipe and 0 for an empty @iter,
 * otherwise the result of __splice_from_pipe().
 */
static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
			     unsigned int flags)
{
	struct pipe_inode_info *pipe = get_pipe_info(file, true);
	struct splice_desc sd = {
		.total_len = iov_iter_count(iter),
		.flags = flags,
		.u.data = iter
	};
	long ret = 0;

	if (!pipe)
		return -EBADF;

	if (sd.total_len) {
		pipe_lock(pipe);
		ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
		pipe_unlock(pipe);
	}

	return ret;
}

/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
1335 */ 1336 static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter, 1337 unsigned int flags) 1338 { 1339 struct pipe_inode_info *pipe; 1340 long ret = 0; 1341 unsigned buf_flag = 0; 1342 1343 if (flags & SPLICE_F_GIFT) 1344 buf_flag = PIPE_BUF_FLAG_GIFT; 1345 1346 pipe = get_pipe_info(file, true); 1347 if (!pipe) 1348 return -EBADF; 1349 1350 pipe_lock(pipe); 1351 ret = wait_for_space(pipe, flags); 1352 if (!ret) 1353 ret = iter_to_pipe(iter, pipe, buf_flag); 1354 pipe_unlock(pipe); 1355 if (ret > 0) 1356 wakeup_pipe_readers(pipe); 1357 return ret; 1358 } 1359 1360 static int vmsplice_type(struct fd f, int *type) 1361 { 1362 if (!f.file) 1363 return -EBADF; 1364 if (f.file->f_mode & FMODE_WRITE) { 1365 *type = ITER_SOURCE; 1366 } else if (f.file->f_mode & FMODE_READ) { 1367 *type = ITER_DEST; 1368 } else { 1369 fdput(f); 1370 return -EBADF; 1371 } 1372 return 0; 1373 } 1374 1375 /* 1376 * Note that vmsplice only really supports true splicing _from_ user memory 1377 * to a pipe, not the other way around. Splicing from user memory is a simple 1378 * operation that can be supported without any funky alignment restrictions 1379 * or nasty vm tricks. We simply map in the user memory and fill them into 1380 * a pipe. The reverse isn't quite as easy, though. There are two possible 1381 * solutions for that: 1382 * 1383 * - memcpy() the data internally, at which point we might as well just 1384 * do a regular read() on the buffer anyway. 1385 * - Lots of nasty vm tricks, that are neither fast nor flexible (it 1386 * has restriction limitations on both ends of the pipe). 1387 * 1388 * Currently we punt and implement it as a normal copy, see pipe_to_user(). 
1389 * 1390 */ 1391 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, 1392 unsigned long, nr_segs, unsigned int, flags) 1393 { 1394 struct iovec iovstack[UIO_FASTIOV]; 1395 struct iovec *iov = iovstack; 1396 struct iov_iter iter; 1397 ssize_t error; 1398 struct fd f; 1399 int type; 1400 1401 if (unlikely(flags & ~SPLICE_F_ALL)) 1402 return -EINVAL; 1403 1404 f = fdget(fd); 1405 error = vmsplice_type(f, &type); 1406 if (error) 1407 return error; 1408 1409 error = import_iovec(type, uiov, nr_segs, 1410 ARRAY_SIZE(iovstack), &iov, &iter); 1411 if (error < 0) 1412 goto out_fdput; 1413 1414 if (!iov_iter_count(&iter)) 1415 error = 0; 1416 else if (type == ITER_SOURCE) 1417 error = vmsplice_to_pipe(f.file, &iter, flags); 1418 else 1419 error = vmsplice_to_user(f.file, &iter, flags); 1420 1421 kfree(iov); 1422 out_fdput: 1423 fdput(f); 1424 return error; 1425 } 1426 1427 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, 1428 int, fd_out, loff_t __user *, off_out, 1429 size_t, len, unsigned int, flags) 1430 { 1431 struct fd in, out; 1432 long error; 1433 1434 if (unlikely(!len)) 1435 return 0; 1436 1437 if (unlikely(flags & ~SPLICE_F_ALL)) 1438 return -EINVAL; 1439 1440 error = -EBADF; 1441 in = fdget(fd_in); 1442 if (in.file) { 1443 out = fdget(fd_out); 1444 if (out.file) { 1445 error = __do_splice(in.file, off_in, out.file, off_out, 1446 len, flags); 1447 fdput(out); 1448 } 1449 fdput(in); 1450 } 1451 return error; 1452 } 1453 1454 /* 1455 * Make sure there's data to read. Wait for input if we can, otherwise 1456 * return an appropriate error. 1457 */ 1458 static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) 1459 { 1460 int ret; 1461 1462 /* 1463 * Check the pipe occupancy without the inode lock first. This function 1464 * is speculative anyways, so missing one is ok. 
1465 */ 1466 if (!pipe_empty(pipe->head, pipe->tail)) 1467 return 0; 1468 1469 ret = 0; 1470 pipe_lock(pipe); 1471 1472 while (pipe_empty(pipe->head, pipe->tail)) { 1473 if (signal_pending(current)) { 1474 ret = -ERESTARTSYS; 1475 break; 1476 } 1477 if (!pipe->writers) 1478 break; 1479 if (flags & SPLICE_F_NONBLOCK) { 1480 ret = -EAGAIN; 1481 break; 1482 } 1483 pipe_wait_readable(pipe); 1484 } 1485 1486 pipe_unlock(pipe); 1487 return ret; 1488 } 1489 1490 /* 1491 * Make sure there's writeable room. Wait for room if we can, otherwise 1492 * return an appropriate error. 1493 */ 1494 static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) 1495 { 1496 int ret; 1497 1498 /* 1499 * Check pipe occupancy without the inode lock first. This function 1500 * is speculative anyways, so missing one is ok. 1501 */ 1502 if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) 1503 return 0; 1504 1505 ret = 0; 1506 pipe_lock(pipe); 1507 1508 while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { 1509 if (!pipe->readers) { 1510 send_sig(SIGPIPE, current, 0); 1511 ret = -EPIPE; 1512 break; 1513 } 1514 if (flags & SPLICE_F_NONBLOCK) { 1515 ret = -EAGAIN; 1516 break; 1517 } 1518 if (signal_pending(current)) { 1519 ret = -ERESTARTSYS; 1520 break; 1521 } 1522 pipe_wait_writable(pipe); 1523 } 1524 1525 pipe_unlock(pipe); 1526 return ret; 1527 } 1528 1529 /* 1530 * Splice contents of ipipe to opipe. 
1531 */ 1532 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, 1533 struct pipe_inode_info *opipe, 1534 size_t len, unsigned int flags) 1535 { 1536 struct pipe_buffer *ibuf, *obuf; 1537 unsigned int i_head, o_head; 1538 unsigned int i_tail, o_tail; 1539 unsigned int i_mask, o_mask; 1540 int ret = 0; 1541 bool input_wakeup = false; 1542 1543 1544 retry: 1545 ret = ipipe_prep(ipipe, flags); 1546 if (ret) 1547 return ret; 1548 1549 ret = opipe_prep(opipe, flags); 1550 if (ret) 1551 return ret; 1552 1553 /* 1554 * Potential ABBA deadlock, work around it by ordering lock 1555 * grabbing by pipe info address. Otherwise two different processes 1556 * could deadlock (one doing tee from A -> B, the other from B -> A). 1557 */ 1558 pipe_double_lock(ipipe, opipe); 1559 1560 i_tail = ipipe->tail; 1561 i_mask = ipipe->ring_size - 1; 1562 o_head = opipe->head; 1563 o_mask = opipe->ring_size - 1; 1564 1565 do { 1566 size_t o_len; 1567 1568 if (!opipe->readers) { 1569 send_sig(SIGPIPE, current, 0); 1570 if (!ret) 1571 ret = -EPIPE; 1572 break; 1573 } 1574 1575 i_head = ipipe->head; 1576 o_tail = opipe->tail; 1577 1578 if (pipe_empty(i_head, i_tail) && !ipipe->writers) 1579 break; 1580 1581 /* 1582 * Cannot make any progress, because either the input 1583 * pipe is empty or the output pipe is full. 1584 */ 1585 if (pipe_empty(i_head, i_tail) || 1586 pipe_full(o_head, o_tail, opipe->max_usage)) { 1587 /* Already processed some buffers, break */ 1588 if (ret) 1589 break; 1590 1591 if (flags & SPLICE_F_NONBLOCK) { 1592 ret = -EAGAIN; 1593 break; 1594 } 1595 1596 /* 1597 * We raced with another reader/writer and haven't 1598 * managed to process any buffers. A zero return 1599 * value means EOF, so retry instead. 
1600 */ 1601 pipe_unlock(ipipe); 1602 pipe_unlock(opipe); 1603 goto retry; 1604 } 1605 1606 ibuf = &ipipe->bufs[i_tail & i_mask]; 1607 obuf = &opipe->bufs[o_head & o_mask]; 1608 1609 if (len >= ibuf->len) { 1610 /* 1611 * Simply move the whole buffer from ipipe to opipe 1612 */ 1613 *obuf = *ibuf; 1614 ibuf->ops = NULL; 1615 i_tail++; 1616 ipipe->tail = i_tail; 1617 input_wakeup = true; 1618 o_len = obuf->len; 1619 o_head++; 1620 opipe->head = o_head; 1621 } else { 1622 /* 1623 * Get a reference to this pipe buffer, 1624 * so we can copy the contents over. 1625 */ 1626 if (!pipe_buf_get(ipipe, ibuf)) { 1627 if (ret == 0) 1628 ret = -EFAULT; 1629 break; 1630 } 1631 *obuf = *ibuf; 1632 1633 /* 1634 * Don't inherit the gift and merge flags, we need to 1635 * prevent multiple steals of this page. 1636 */ 1637 obuf->flags &= ~PIPE_BUF_FLAG_GIFT; 1638 obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE; 1639 1640 obuf->len = len; 1641 ibuf->offset += len; 1642 ibuf->len -= len; 1643 o_len = len; 1644 o_head++; 1645 opipe->head = o_head; 1646 } 1647 ret += o_len; 1648 len -= o_len; 1649 } while (len); 1650 1651 pipe_unlock(ipipe); 1652 pipe_unlock(opipe); 1653 1654 /* 1655 * If we put data in the output pipe, wakeup any potential readers. 1656 */ 1657 if (ret > 0) 1658 wakeup_pipe_readers(opipe); 1659 1660 if (input_wakeup) 1661 wakeup_pipe_writers(ipipe); 1662 1663 return ret; 1664 } 1665 1666 /* 1667 * Link contents of ipipe to opipe. 1668 */ 1669 static int link_pipe(struct pipe_inode_info *ipipe, 1670 struct pipe_inode_info *opipe, 1671 size_t len, unsigned int flags) 1672 { 1673 struct pipe_buffer *ibuf, *obuf; 1674 unsigned int i_head, o_head; 1675 unsigned int i_tail, o_tail; 1676 unsigned int i_mask, o_mask; 1677 int ret = 0; 1678 1679 /* 1680 * Potential ABBA deadlock, work around it by ordering lock 1681 * grabbing by pipe info address. Otherwise two different processes 1682 * could deadlock (one doing tee from A -> B, the other from B -> A). 
1683 */ 1684 pipe_double_lock(ipipe, opipe); 1685 1686 i_tail = ipipe->tail; 1687 i_mask = ipipe->ring_size - 1; 1688 o_head = opipe->head; 1689 o_mask = opipe->ring_size - 1; 1690 1691 do { 1692 if (!opipe->readers) { 1693 send_sig(SIGPIPE, current, 0); 1694 if (!ret) 1695 ret = -EPIPE; 1696 break; 1697 } 1698 1699 i_head = ipipe->head; 1700 o_tail = opipe->tail; 1701 1702 /* 1703 * If we have iterated all input buffers or run out of 1704 * output room, break. 1705 */ 1706 if (pipe_empty(i_head, i_tail) || 1707 pipe_full(o_head, o_tail, opipe->max_usage)) 1708 break; 1709 1710 ibuf = &ipipe->bufs[i_tail & i_mask]; 1711 obuf = &opipe->bufs[o_head & o_mask]; 1712 1713 /* 1714 * Get a reference to this pipe buffer, 1715 * so we can copy the contents over. 1716 */ 1717 if (!pipe_buf_get(ipipe, ibuf)) { 1718 if (ret == 0) 1719 ret = -EFAULT; 1720 break; 1721 } 1722 1723 *obuf = *ibuf; 1724 1725 /* 1726 * Don't inherit the gift and merge flag, we need to prevent 1727 * multiple steals of this page. 1728 */ 1729 obuf->flags &= ~PIPE_BUF_FLAG_GIFT; 1730 obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE; 1731 1732 if (obuf->len > len) 1733 obuf->len = len; 1734 ret += obuf->len; 1735 len -= obuf->len; 1736 1737 o_head++; 1738 opipe->head = o_head; 1739 i_tail++; 1740 } while (len); 1741 1742 pipe_unlock(ipipe); 1743 pipe_unlock(opipe); 1744 1745 /* 1746 * If we put data in the output pipe, wakeup any potential readers. 1747 */ 1748 if (ret > 0) 1749 wakeup_pipe_readers(opipe); 1750 1751 return ret; 1752 } 1753 1754 /* 1755 * This is a tee(1) implementation that works on pipes. It doesn't copy 1756 * any data, it simply references the 'in' pages on the 'out' pipe. 1757 * The 'flags' used are the SPLICE_F_* variants, currently the only 1758 * applicable one is SPLICE_F_NONBLOCK. 
1759 */ 1760 long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) 1761 { 1762 struct pipe_inode_info *ipipe = get_pipe_info(in, true); 1763 struct pipe_inode_info *opipe = get_pipe_info(out, true); 1764 int ret = -EINVAL; 1765 1766 if (unlikely(!(in->f_mode & FMODE_READ) || 1767 !(out->f_mode & FMODE_WRITE))) 1768 return -EBADF; 1769 1770 /* 1771 * Duplicate the contents of ipipe to opipe without actually 1772 * copying the data. 1773 */ 1774 if (ipipe && opipe && ipipe != opipe) { 1775 if ((in->f_flags | out->f_flags) & O_NONBLOCK) 1776 flags |= SPLICE_F_NONBLOCK; 1777 1778 /* 1779 * Keep going, unless we encounter an error. The ipipe/opipe 1780 * ordering doesn't really matter. 1781 */ 1782 ret = ipipe_prep(ipipe, flags); 1783 if (!ret) { 1784 ret = opipe_prep(opipe, flags); 1785 if (!ret) 1786 ret = link_pipe(ipipe, opipe, len, flags); 1787 } 1788 } 1789 1790 return ret; 1791 } 1792 1793 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) 1794 { 1795 struct fd in, out; 1796 int error; 1797 1798 if (unlikely(flags & ~SPLICE_F_ALL)) 1799 return -EINVAL; 1800 1801 if (unlikely(!len)) 1802 return 0; 1803 1804 error = -EBADF; 1805 in = fdget(fdin); 1806 if (in.file) { 1807 out = fdget(fdout); 1808 if (out.file) { 1809 error = do_tee(in.file, out.file, len, flags); 1810 fdput(out); 1811 } 1812 fdput(in); 1813 } 1814 1815 return error; 1816 } 1817