1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * "splice": joining two ropes together by interweaving their strands. 4 * 5 * This is the "extended pipe" functionality, where a pipe is used as 6 * an arbitrary in-memory buffer. Think of a pipe as a small kernel 7 * buffer that you can use to transfer data from one end to the other. 8 * 9 * The traditional unix read/write is extended with a "splice()" operation 10 * that transfers data buffers to or from a pipe buffer. 11 * 12 * Named by Larry McVoy, original implementation from Linus, extended by 13 * Jens to support splicing to files, network, direct splicing, etc and 14 * fixing lots of bugs. 15 * 16 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk> 17 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org> 18 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> 19 * 20 */ 21 #include <linux/bvec.h> 22 #include <linux/fs.h> 23 #include <linux/file.h> 24 #include <linux/pagemap.h> 25 #include <linux/splice.h> 26 #include <linux/memcontrol.h> 27 #include <linux/mm_inline.h> 28 #include <linux/swap.h> 29 #include <linux/writeback.h> 30 #include <linux/export.h> 31 #include <linux/syscalls.h> 32 #include <linux/uio.h> 33 #include <linux/security.h> 34 #include <linux/gfp.h> 35 #include <linux/socket.h> 36 #include <linux/sched/signal.h> 37 38 #include "internal.h" 39 40 /* 41 * Attempt to steal a page from a pipe buffer. This should perhaps go into 42 * a vm helper function, it's already simplified quite a bit by the 43 * addition of remove_mapping(). If success is returned, the caller may 44 * attempt to reuse this page for another destination. 45 */ 46 static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe, 47 struct pipe_buffer *buf) 48 { 49 struct folio *folio = page_folio(buf->page); 50 struct address_space *mapping; 51 52 folio_lock(folio); 53 54 mapping = folio_mapping(folio); 55 if (mapping) { 56 WARN_ON(!folio_test_uptodate(folio)); 57 58 /* 59 * At least for ext2 with nobh option, we need to wait on 60 * writeback completing on this folio, since we'll remove it 61 * from the pagecache. Otherwise truncate wont wait on the 62 * folio, allowing the disk blocks to be reused by someone else 63 * before we actually wrote our data to them. fs corruption 64 * ensues. 65 */ 66 folio_wait_writeback(folio); 67 68 if (folio_has_private(folio) && 69 !filemap_release_folio(folio, GFP_KERNEL)) 70 goto out_unlock; 71 72 /* 73 * If we succeeded in removing the mapping, set LRU flag 74 * and return good. 75 */ 76 if (remove_mapping(mapping, folio)) { 77 buf->flags |= PIPE_BUF_FLAG_LRU; 78 return true; 79 } 80 } 81 82 /* 83 * Raced with truncate or failed to remove folio from current 84 * address space, unlock and return failure. 85 */ 86 out_unlock: 87 folio_unlock(folio); 88 return false; 89 } 90 91 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, 92 struct pipe_buffer *buf) 93 { 94 put_page(buf->page); 95 buf->flags &= ~PIPE_BUF_FLAG_LRU; 96 } 97 98 /* 99 * Check whether the contents of buf is OK to access. Since the content 100 * is a page cache page, IO may be in flight. 101 */ 102 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, 103 struct pipe_buffer *buf) 104 { 105 struct page *page = buf->page; 106 int err; 107 108 if (!PageUptodate(page)) { 109 lock_page(page); 110 111 /* 112 * Page got truncated/unhashed. This will cause a 0-byte 113 * splice, if this is the first page. 114 */ 115 if (!page->mapping) { 116 err = -ENODATA; 117 goto error; 118 } 119 120 /* 121 * Uh oh, read-error from disk. 122 */ 123 if (!PageUptodate(page)) { 124 err = -EIO; 125 goto error; 126 } 127 128 /* 129 * Page is ok afterall, we are done. 130 */ 131 unlock_page(page); 132 } 133 134 return 0; 135 error: 136 unlock_page(page); 137 return err; 138 } 139 140 const struct pipe_buf_operations page_cache_pipe_buf_ops = { 141 .confirm = page_cache_pipe_buf_confirm, 142 .release = page_cache_pipe_buf_release, 143 .try_steal = page_cache_pipe_buf_try_steal, 144 .get = generic_pipe_buf_get, 145 }; 146 147 static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe, 148 struct pipe_buffer *buf) 149 { 150 if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) 151 return false; 152 153 buf->flags |= PIPE_BUF_FLAG_LRU; 154 return generic_pipe_buf_try_steal(pipe, buf); 155 } 156 157 static const struct pipe_buf_operations user_page_pipe_buf_ops = { 158 .release = page_cache_pipe_buf_release, 159 .try_steal = user_page_pipe_buf_try_steal, 160 .get = generic_pipe_buf_get, 161 }; 162 163 static void wakeup_pipe_readers(struct pipe_inode_info *pipe) 164 { 165 smp_mb(); 166 if (waitqueue_active(&pipe->rd_wait)) 167 wake_up_interruptible(&pipe->rd_wait); 168 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 169 } 170 171 /** 172 * splice_to_pipe - fill passed data into a pipe 173 * @pipe: pipe to fill 174 * @spd: data to fill 175 * 176 * Description: 177 * @spd contains a map of pages and len/offset tuples, along with 178 * the struct pipe_buf_operations associated with these pages. This 179 * function will link that data to the pipe. 180 * 181 */ 182 ssize_t splice_to_pipe(struct pipe_inode_info *pipe, 183 struct splice_pipe_desc *spd) 184 { 185 unsigned int spd_pages = spd->nr_pages; 186 unsigned int tail = pipe->tail; 187 unsigned int head = pipe->head; 188 unsigned int mask = pipe->ring_size - 1; 189 int ret = 0, page_nr = 0; 190 191 if (!spd_pages) 192 return 0; 193 194 if (unlikely(!pipe->readers)) { 195 send_sig(SIGPIPE, current, 0); 196 ret = -EPIPE; 197 goto out; 198 } 199 200 while (!pipe_full(head, tail, pipe->max_usage)) { 201 struct pipe_buffer *buf = &pipe->bufs[head & mask]; 202 203 buf->page = spd->pages[page_nr]; 204 buf->offset = spd->partial[page_nr].offset; 205 buf->len = spd->partial[page_nr].len; 206 buf->private = spd->partial[page_nr].private; 207 buf->ops = spd->ops; 208 buf->flags = 0; 209 210 head++; 211 pipe->head = head; 212 page_nr++; 213 ret += buf->len; 214 215 if (!--spd->nr_pages) 216 break; 217 } 218 219 if (!ret) 220 ret = -EAGAIN; 221 222 out: 223 while (page_nr < spd_pages) 224 spd->spd_release(spd, page_nr++); 225 226 return ret; 227 } 228 EXPORT_SYMBOL_GPL(splice_to_pipe); 229 230 ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) 231 { 232 unsigned int head = pipe->head; 233 unsigned int tail = pipe->tail; 234 unsigned int mask = pipe->ring_size - 1; 235 int ret; 236 237 if (unlikely(!pipe->readers)) { 238 send_sig(SIGPIPE, current, 0); 239 ret = -EPIPE; 240 } else if (pipe_full(head, tail, pipe->max_usage)) { 241 ret = -EAGAIN; 242 } else { 243 pipe->bufs[head & mask] = *buf; 244 pipe->head = head + 1; 245 return buf->len; 246 } 247 pipe_buf_release(pipe, buf); 248 return ret; 249 } 250 EXPORT_SYMBOL(add_to_pipe); 251 252 /* 253 * Check if we need to grow the arrays holding pages and partial page 254 * descriptions. 255 */ 256 int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) 257 { 258 unsigned int max_usage = READ_ONCE(pipe->max_usage); 259 260 spd->nr_pages_max = max_usage; 261 if (max_usage <= PIPE_DEF_BUFFERS) 262 return 0; 263 264 spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL); 265 spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page), 266 GFP_KERNEL); 267 268 if (spd->pages && spd->partial) 269 return 0; 270 271 kfree(spd->pages); 272 kfree(spd->partial); 273 return -ENOMEM; 274 } 275 276 void splice_shrink_spd(struct splice_pipe_desc *spd) 277 { 278 if (spd->nr_pages_max <= PIPE_DEF_BUFFERS) 279 return; 280 281 kfree(spd->pages); 282 kfree(spd->partial); 283 } 284 285 /* 286 * Splice data from an O_DIRECT file into pages and then add them to the output 287 * pipe. 288 */ 289 ssize_t direct_splice_read(struct file *in, loff_t *ppos, 290 struct pipe_inode_info *pipe, 291 size_t len, unsigned int flags) 292 { 293 struct iov_iter to; 294 struct bio_vec *bv; 295 struct kiocb kiocb; 296 struct page **pages; 297 ssize_t ret; 298 size_t used, npages, chunk, remain, reclaim; 299 int i; 300 301 /* Work out how much data we can actually add into the pipe */ 302 used = pipe_occupancy(pipe->head, pipe->tail); 303 npages = max_t(ssize_t, pipe->max_usage - used, 0); 304 len = min_t(size_t, len, npages * PAGE_SIZE); 305 npages = DIV_ROUND_UP(len, PAGE_SIZE); 306 307 bv = kzalloc(array_size(npages, sizeof(bv[0])) + 308 array_size(npages, sizeof(struct page *)), GFP_KERNEL); 309 if (!bv) 310 return -ENOMEM; 311 312 pages = (void *)(bv + npages); 313 npages = alloc_pages_bulk_array(GFP_USER, npages, pages); 314 if (!npages) { 315 kfree(bv); 316 return -ENOMEM; 317 } 318 319 remain = len = min_t(size_t, len, npages * PAGE_SIZE); 320 321 for (i = 0; i < npages; i++) { 322 chunk = min_t(size_t, PAGE_SIZE, remain); 323 bv[i].bv_page = pages[i]; 324 bv[i].bv_offset = 0; 325 bv[i].bv_len = chunk; 326 remain -= chunk; 327 } 328 329 /* Do the I/O */ 330 iov_iter_bvec(&to, ITER_DEST, bv, npages, len); 331 init_sync_kiocb(&kiocb, in); 332 kiocb.ki_pos = *ppos; 333 ret = call_read_iter(in, &kiocb, &to); 334 335 reclaim = npages * PAGE_SIZE; 336 remain = 0; 337 if (ret > 0) { 338 reclaim -= ret; 339 remain = ret; 340 *ppos = kiocb.ki_pos; 341 file_accessed(in); 342 } else if (ret < 0) { 343 /* 344 * callers of ->splice_read() expect -EAGAIN on 345 * "can't put anything in there", rather than -EFAULT. 346 */ 347 if (ret == -EFAULT) 348 ret = -EAGAIN; 349 } 350 351 /* Free any pages that didn't get touched at all. */ 352 reclaim /= PAGE_SIZE; 353 if (reclaim) { 354 npages -= reclaim; 355 release_pages(pages + npages, reclaim); 356 } 357 358 /* Push the remaining pages into the pipe. */ 359 for (i = 0; i < npages; i++) { 360 struct pipe_buffer *buf = pipe_head_buf(pipe); 361 362 chunk = min_t(size_t, remain, PAGE_SIZE); 363 *buf = (struct pipe_buffer) { 364 .ops = &default_pipe_buf_ops, 365 .page = bv[i].bv_page, 366 .offset = 0, 367 .len = chunk, 368 }; 369 pipe->head++; 370 remain -= chunk; 371 } 372 373 kfree(bv); 374 return ret; 375 } 376 EXPORT_SYMBOL(direct_splice_read); 377 378 /** 379 * generic_file_splice_read - splice data from file to a pipe 380 * @in: file to splice from 381 * @ppos: position in @in 382 * @pipe: pipe to splice to 383 * @len: number of bytes to splice 384 * @flags: splice modifier flags 385 * 386 * Description: 387 * Will read pages from given file and fill them into a pipe. Can be 388 * used as long as it has more or less sane ->read_iter(). 389 * 390 */ 391 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, 392 struct pipe_inode_info *pipe, size_t len, 393 unsigned int flags) 394 { 395 struct iov_iter to; 396 struct kiocb kiocb; 397 int ret; 398 399 iov_iter_pipe(&to, ITER_DEST, pipe, len); 400 init_sync_kiocb(&kiocb, in); 401 kiocb.ki_pos = *ppos; 402 ret = call_read_iter(in, &kiocb, &to); 403 if (ret > 0) { 404 *ppos = kiocb.ki_pos; 405 file_accessed(in); 406 } else if (ret < 0) { 407 /* free what was emitted */ 408 pipe_discard_from(pipe, to.start_head); 409 /* 410 * callers of ->splice_read() expect -EAGAIN on 411 * "can't put anything in there", rather than -EFAULT. 412 */ 413 if (ret == -EFAULT) 414 ret = -EAGAIN; 415 } 416 417 return ret; 418 } 419 EXPORT_SYMBOL(generic_file_splice_read); 420 421 const struct pipe_buf_operations default_pipe_buf_ops = { 422 .release = generic_pipe_buf_release, 423 .try_steal = generic_pipe_buf_try_steal, 424 .get = generic_pipe_buf_get, 425 }; 426 427 /* Pipe buffer operations for a socket and similar. */ 428 const struct pipe_buf_operations nosteal_pipe_buf_ops = { 429 .release = generic_pipe_buf_release, 430 .get = generic_pipe_buf_get, 431 }; 432 EXPORT_SYMBOL(nosteal_pipe_buf_ops); 433 434 /* 435 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' 436 * using sendpage(). Return the number of bytes sent. 437 */ 438 static int pipe_to_sendpage(struct pipe_inode_info *pipe, 439 struct pipe_buffer *buf, struct splice_desc *sd) 440 { 441 struct file *file = sd->u.file; 442 loff_t pos = sd->pos; 443 int more; 444 445 if (!likely(file->f_op->sendpage)) 446 return -EINVAL; 447 448 more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0; 449 450 if (sd->len < sd->total_len && 451 pipe_occupancy(pipe->head, pipe->tail) > 1) 452 more |= MSG_SENDPAGE_NOTLAST; 453 454 return file->f_op->sendpage(file, buf->page, buf->offset, 455 sd->len, &pos, more); 456 } 457 458 static void wakeup_pipe_writers(struct pipe_inode_info *pipe) 459 { 460 smp_mb(); 461 if (waitqueue_active(&pipe->wr_wait)) 462 wake_up_interruptible(&pipe->wr_wait); 463 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 464 } 465 466 /** 467 * splice_from_pipe_feed - feed available data from a pipe to a file 468 * @pipe: pipe to splice from 469 * @sd: information to @actor 470 * @actor: handler that splices the data 471 * 472 * Description: 473 * This function loops over the pipe and calls @actor to do the 474 * actual moving of a single struct pipe_buffer to the desired 475 * destination. It returns when there's no more buffers left in 476 * the pipe or if the requested number of bytes (@sd->total_len) 477 * have been copied. It returns a positive number (one) if the 478 * pipe needs to be filled with more data, zero if the required 479 * number of bytes have been copied and -errno on error. 480 * 481 * This, together with splice_from_pipe_{begin,end,next}, may be 482 * used to implement the functionality of __splice_from_pipe() when 483 * locking is required around copying the pipe buffers to the 484 * destination. 485 */ 486 static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, 487 splice_actor *actor) 488 { 489 unsigned int head = pipe->head; 490 unsigned int tail = pipe->tail; 491 unsigned int mask = pipe->ring_size - 1; 492 int ret; 493 494 while (!pipe_empty(head, tail)) { 495 struct pipe_buffer *buf = &pipe->bufs[tail & mask]; 496 497 sd->len = buf->len; 498 if (sd->len > sd->total_len) 499 sd->len = sd->total_len; 500 501 ret = pipe_buf_confirm(pipe, buf); 502 if (unlikely(ret)) { 503 if (ret == -ENODATA) 504 ret = 0; 505 return ret; 506 } 507 508 ret = actor(pipe, buf, sd); 509 if (ret <= 0) 510 return ret; 511 512 buf->offset += ret; 513 buf->len -= ret; 514 515 sd->num_spliced += ret; 516 sd->len -= ret; 517 sd->pos += ret; 518 sd->total_len -= ret; 519 520 if (!buf->len) { 521 pipe_buf_release(pipe, buf); 522 tail++; 523 pipe->tail = tail; 524 if (pipe->files) 525 sd->need_wakeup = true; 526 } 527 528 if (!sd->total_len) 529 return 0; 530 } 531 532 return 1; 533 } 534 535 /* We know we have a pipe buffer, but maybe it's empty? */ 536 static inline bool eat_empty_buffer(struct pipe_inode_info *pipe) 537 { 538 unsigned int tail = pipe->tail; 539 unsigned int mask = pipe->ring_size - 1; 540 struct pipe_buffer *buf = &pipe->bufs[tail & mask]; 541 542 if (unlikely(!buf->len)) { 543 pipe_buf_release(pipe, buf); 544 pipe->tail = tail+1; 545 return true; 546 } 547 548 return false; 549 } 550 551 /** 552 * splice_from_pipe_next - wait for some data to splice from 553 * @pipe: pipe to splice from 554 * @sd: information about the splice operation 555 * 556 * Description: 557 * This function will wait for some data and return a positive 558 * value (one) if pipe buffers are available. It will return zero 559 * or -errno if no more data needs to be spliced. 560 */ 561 static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) 562 { 563 /* 564 * Check for signal early to make process killable when there are 565 * always buffers available 566 */ 567 if (signal_pending(current)) 568 return -ERESTARTSYS; 569 570 repeat: 571 while (pipe_empty(pipe->head, pipe->tail)) { 572 if (!pipe->writers) 573 return 0; 574 575 if (sd->num_spliced) 576 return 0; 577 578 if (sd->flags & SPLICE_F_NONBLOCK) 579 return -EAGAIN; 580 581 if (signal_pending(current)) 582 return -ERESTARTSYS; 583 584 if (sd->need_wakeup) { 585 wakeup_pipe_writers(pipe); 586 sd->need_wakeup = false; 587 } 588 589 pipe_wait_readable(pipe); 590 } 591 592 if (eat_empty_buffer(pipe)) 593 goto repeat; 594 595 return 1; 596 } 597 598 /** 599 * splice_from_pipe_begin - start splicing from pipe 600 * @sd: information about the splice operation 601 * 602 * Description: 603 * This function should be called before a loop containing 604 * splice_from_pipe_next() and splice_from_pipe_feed() to 605 * initialize the necessary fields of @sd. 606 */ 607 static void splice_from_pipe_begin(struct splice_desc *sd) 608 { 609 sd->num_spliced = 0; 610 sd->need_wakeup = false; 611 } 612 613 /** 614 * splice_from_pipe_end - finish splicing from pipe 615 * @pipe: pipe to splice from 616 * @sd: information about the splice operation 617 * 618 * Description: 619 * This function will wake up pipe writers if necessary. It should 620 * be called after a loop containing splice_from_pipe_next() and 621 * splice_from_pipe_feed(). 622 */ 623 static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) 624 { 625 if (sd->need_wakeup) 626 wakeup_pipe_writers(pipe); 627 } 628 629 /** 630 * __splice_from_pipe - splice data from a pipe to given actor 631 * @pipe: pipe to splice from 632 * @sd: information to @actor 633 * @actor: handler that splices the data 634 * 635 * Description: 636 * This function does little more than loop over the pipe and call 637 * @actor to do the actual moving of a single struct pipe_buffer to 638 * the desired destination. See pipe_to_file, pipe_to_sendpage, or 639 * pipe_to_user. 640 * 641 */ 642 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, 643 splice_actor *actor) 644 { 645 int ret; 646 647 splice_from_pipe_begin(sd); 648 do { 649 cond_resched(); 650 ret = splice_from_pipe_next(pipe, sd); 651 if (ret > 0) 652 ret = splice_from_pipe_feed(pipe, sd, actor); 653 } while (ret > 0); 654 splice_from_pipe_end(pipe, sd); 655 656 return sd->num_spliced ? sd->num_spliced : ret; 657 } 658 EXPORT_SYMBOL(__splice_from_pipe); 659 660 /** 661 * splice_from_pipe - splice data from a pipe to a file 662 * @pipe: pipe to splice from 663 * @out: file to splice to 664 * @ppos: position in @out 665 * @len: how many bytes to splice 666 * @flags: splice modifier flags 667 * @actor: handler that splices the data 668 * 669 * Description: 670 * See __splice_from_pipe. This function locks the pipe inode, 671 * otherwise it's identical to __splice_from_pipe(). 672 * 673 */ 674 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, 675 loff_t *ppos, size_t len, unsigned int flags, 676 splice_actor *actor) 677 { 678 ssize_t ret; 679 struct splice_desc sd = { 680 .total_len = len, 681 .flags = flags, 682 .pos = *ppos, 683 .u.file = out, 684 }; 685 686 pipe_lock(pipe); 687 ret = __splice_from_pipe(pipe, &sd, actor); 688 pipe_unlock(pipe); 689 690 return ret; 691 } 692 693 /** 694 * iter_file_splice_write - splice data from a pipe to a file 695 * @pipe: pipe info 696 * @out: file to write to 697 * @ppos: position in @out 698 * @len: number of bytes to splice 699 * @flags: splice modifier flags 700 * 701 * Description: 702 * Will either move or copy pages (determined by @flags options) from 703 * the given pipe inode to the given file. 704 * This one is ->write_iter-based. 705 * 706 */ 707 ssize_t 708 iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, 709 loff_t *ppos, size_t len, unsigned int flags) 710 { 711 struct splice_desc sd = { 712 .total_len = len, 713 .flags = flags, 714 .pos = *ppos, 715 .u.file = out, 716 }; 717 int nbufs = pipe->max_usage; 718 struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec), 719 GFP_KERNEL); 720 ssize_t ret; 721 722 if (unlikely(!array)) 723 return -ENOMEM; 724 725 pipe_lock(pipe); 726 727 splice_from_pipe_begin(&sd); 728 while (sd.total_len) { 729 struct iov_iter from; 730 unsigned int head, tail, mask; 731 size_t left; 732 int n; 733 734 ret = splice_from_pipe_next(pipe, &sd); 735 if (ret <= 0) 736 break; 737 738 if (unlikely(nbufs < pipe->max_usage)) { 739 kfree(array); 740 nbufs = pipe->max_usage; 741 array = kcalloc(nbufs, sizeof(struct bio_vec), 742 GFP_KERNEL); 743 if (!array) { 744 ret = -ENOMEM; 745 break; 746 } 747 } 748 749 head = pipe->head; 750 tail = pipe->tail; 751 mask = pipe->ring_size - 1; 752 753 /* build the vector */ 754 left = sd.total_len; 755 for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) { 756 struct pipe_buffer *buf = &pipe->bufs[tail & mask]; 757 size_t this_len = buf->len; 758 759 /* zero-length bvecs are not supported, skip them */ 760 if (!this_len) 761 continue; 762 this_len = min(this_len, left); 763 764 ret = pipe_buf_confirm(pipe, buf); 765 if (unlikely(ret)) { 766 if (ret == -ENODATA) 767 ret = 0; 768 goto done; 769 } 770 771 bvec_set_page(&array[n], buf->page, this_len, 772 buf->offset); 773 left -= this_len; 774 n++; 775 } 776 777 iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left); 778 ret = vfs_iter_write(out, &from, &sd.pos, 0); 779 if (ret <= 0) 780 break; 781 782 sd.num_spliced += ret; 783 sd.total_len -= ret; 784 *ppos = sd.pos; 785 786 /* dismiss the fully eaten buffers, adjust the partial one */ 787 tail = pipe->tail; 788 while (ret) { 789 struct pipe_buffer *buf = &pipe->bufs[tail & mask]; 790 if (ret >= buf->len) { 791 ret -= buf->len; 792 buf->len = 0; 793 pipe_buf_release(pipe, buf); 794 tail++; 795 pipe->tail = tail; 796 if (pipe->files) 797 sd.need_wakeup = true; 798 } else { 799 buf->offset += ret; 800 buf->len -= ret; 801 ret = 0; 802 } 803 } 804 } 805 done: 806 kfree(array); 807 splice_from_pipe_end(pipe, &sd); 808 809 pipe_unlock(pipe); 810 811 if (sd.num_spliced) 812 ret = sd.num_spliced; 813 814 return ret; 815 } 816 817 EXPORT_SYMBOL(iter_file_splice_write); 818 819 /** 820 * generic_splice_sendpage - splice data from a pipe to a socket 821 * @pipe: pipe to splice from 822 * @out: socket to write to 823 * @ppos: position in @out 824 * @len: number of bytes to splice 825 * @flags: splice modifier flags 826 * 827 * Description: 828 * Will send @len bytes from the pipe to a network socket. No data copying 829 * is involved. 830 * 831 */ 832 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, 833 loff_t *ppos, size_t len, unsigned int flags) 834 { 835 return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); 836 } 837 838 EXPORT_SYMBOL(generic_splice_sendpage); 839 840 static int warn_unsupported(struct file *file, const char *op) 841 { 842 pr_debug_ratelimited( 843 "splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n", 844 op, file, current->pid, current->comm); 845 return -EINVAL; 846 } 847 848 /* 849 * Attempt to initiate a splice from pipe to file. 850 */ 851 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, 852 loff_t *ppos, size_t len, unsigned int flags) 853 { 854 if (unlikely(!out->f_op->splice_write)) 855 return warn_unsupported(out, "write"); 856 return out->f_op->splice_write(pipe, out, ppos, len, flags); 857 } 858 859 /* 860 * Attempt to initiate a splice from a file to a pipe. 861 */ 862 static long do_splice_to(struct file *in, loff_t *ppos, 863 struct pipe_inode_info *pipe, size_t len, 864 unsigned int flags) 865 { 866 unsigned int p_space; 867 int ret; 868 869 if (unlikely(!(in->f_mode & FMODE_READ))) 870 return -EBADF; 871 872 /* Don't try to read more the pipe has space for. */ 873 p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail); 874 len = min_t(size_t, len, p_space << PAGE_SHIFT); 875 876 ret = rw_verify_area(READ, in, ppos, len); 877 if (unlikely(ret < 0)) 878 return ret; 879 880 if (unlikely(len > MAX_RW_COUNT)) 881 len = MAX_RW_COUNT; 882 883 if (unlikely(!in->f_op->splice_read)) 884 return warn_unsupported(in, "read"); 885 return in->f_op->splice_read(in, ppos, pipe, len, flags); 886 } 887 888 /** 889 * splice_direct_to_actor - splices data directly between two non-pipes 890 * @in: file to splice from 891 * @sd: actor information on where to splice to 892 * @actor: handles the data splicing 893 * 894 * Description: 895 * This is a special case helper to splice directly between two 896 * points, without requiring an explicit pipe. Internally an allocated 897 * pipe is cached in the process, and reused during the lifetime of 898 * that process. 899 * 900 */ 901 ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, 902 splice_direct_actor *actor) 903 { 904 struct pipe_inode_info *pipe; 905 long ret, bytes; 906 size_t len; 907 int i, flags, more; 908 909 /* 910 * We require the input to be seekable, as we don't want to randomly 911 * drop data for eg socket -> socket splicing. Use the piped splicing 912 * for that! 913 */ 914 if (unlikely(!(in->f_mode & FMODE_LSEEK))) 915 return -EINVAL; 916 917 /* 918 * neither in nor out is a pipe, setup an internal pipe attached to 919 * 'out' and transfer the wanted data from 'in' to 'out' through that 920 */ 921 pipe = current->splice_pipe; 922 if (unlikely(!pipe)) { 923 pipe = alloc_pipe_info(); 924 if (!pipe) 925 return -ENOMEM; 926 927 /* 928 * We don't have an immediate reader, but we'll read the stuff 929 * out of the pipe right after the splice_to_pipe(). So set 930 * PIPE_READERS appropriately. 931 */ 932 pipe->readers = 1; 933 934 current->splice_pipe = pipe; 935 } 936 937 /* 938 * Do the splice. 939 */ 940 ret = 0; 941 bytes = 0; 942 len = sd->total_len; 943 flags = sd->flags; 944 945 /* 946 * Don't block on output, we have to drain the direct pipe. 947 */ 948 sd->flags &= ~SPLICE_F_NONBLOCK; 949 more = sd->flags & SPLICE_F_MORE; 950 951 WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail)); 952 953 while (len) { 954 size_t read_len; 955 loff_t pos = sd->pos, prev_pos = pos; 956 957 ret = do_splice_to(in, &pos, pipe, len, flags); 958 if (unlikely(ret <= 0)) 959 goto out_release; 960 961 read_len = ret; 962 sd->total_len = read_len; 963 964 /* 965 * If more data is pending, set SPLICE_F_MORE 966 * If this is the last data and SPLICE_F_MORE was not set 967 * initially, clears it. 968 */ 969 if (read_len < len) 970 sd->flags |= SPLICE_F_MORE; 971 else if (!more) 972 sd->flags &= ~SPLICE_F_MORE; 973 /* 974 * NOTE: nonblocking mode only applies to the input. We 975 * must not do the output in nonblocking mode as then we 976 * could get stuck data in the internal pipe: 977 */ 978 ret = actor(pipe, sd); 979 if (unlikely(ret <= 0)) { 980 sd->pos = prev_pos; 981 goto out_release; 982 } 983 984 bytes += ret; 985 len -= ret; 986 sd->pos = pos; 987 988 if (ret < read_len) { 989 sd->pos = prev_pos + ret; 990 goto out_release; 991 } 992 } 993 994 done: 995 pipe->tail = pipe->head = 0; 996 file_accessed(in); 997 return bytes; 998 999 out_release: 1000 /* 1001 * If we did an incomplete transfer we must release 1002 * the pipe buffers in question: 1003 */ 1004 for (i = 0; i < pipe->ring_size; i++) { 1005 struct pipe_buffer *buf = &pipe->bufs[i]; 1006 1007 if (buf->ops) 1008 pipe_buf_release(pipe, buf); 1009 } 1010 1011 if (!bytes) 1012 bytes = ret; 1013 1014 goto done; 1015 } 1016 EXPORT_SYMBOL(splice_direct_to_actor); 1017 1018 static int direct_splice_actor(struct pipe_inode_info *pipe, 1019 struct splice_desc *sd) 1020 { 1021 struct file *file = sd->u.file; 1022 1023 return do_splice_from(pipe, file, sd->opos, sd->total_len, 1024 sd->flags); 1025 } 1026 1027 /** 1028 * do_splice_direct - splices data directly between two files 1029 * @in: file to splice from 1030 * @ppos: input file offset 1031 * @out: file to splice to 1032 * @opos: output file offset 1033 * @len: number of bytes to splice 1034 * @flags: splice modifier flags 1035 * 1036 * Description: 1037 * For use by do_sendfile(). splice can easily emulate sendfile, but 1038 * doing it in the application would incur an extra system call 1039 * (splice in + splice out, as compared to just sendfile()). So this helper 1040 * can splice directly through a process-private pipe. 1041 * 1042 */ 1043 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, 1044 loff_t *opos, size_t len, unsigned int flags) 1045 { 1046 struct splice_desc sd = { 1047 .len = len, 1048 .total_len = len, 1049 .flags = flags, 1050 .pos = *ppos, 1051 .u.file = out, 1052 .opos = opos, 1053 }; 1054 long ret; 1055 1056 if (unlikely(!(out->f_mode & FMODE_WRITE))) 1057 return -EBADF; 1058 1059 if (unlikely(out->f_flags & O_APPEND)) 1060 return -EINVAL; 1061 1062 ret = rw_verify_area(WRITE, out, opos, len); 1063 if (unlikely(ret < 0)) 1064 return ret; 1065 1066 ret = splice_direct_to_actor(in, &sd, direct_splice_actor); 1067 if (ret > 0) 1068 *ppos = sd.pos; 1069 1070 return ret; 1071 } 1072 EXPORT_SYMBOL(do_splice_direct); 1073 1074 static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) 1075 { 1076 for (;;) { 1077 if (unlikely(!pipe->readers)) { 1078 send_sig(SIGPIPE, current, 0); 1079 return -EPIPE; 1080 } 1081 if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) 1082 return 0; 1083 if (flags & SPLICE_F_NONBLOCK) 1084 return -EAGAIN; 1085 if (signal_pending(current)) 1086 return -ERESTARTSYS; 1087 pipe_wait_writable(pipe); 1088 } 1089 } 1090 1091 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, 1092 struct pipe_inode_info *opipe, 1093 size_t len, unsigned int flags); 1094 1095 long splice_file_to_pipe(struct file *in, 1096 struct pipe_inode_info *opipe, 1097 loff_t *offset, 1098 size_t len, unsigned int flags) 1099 { 1100 long ret; 1101 1102 pipe_lock(opipe); 1103 ret = wait_for_space(opipe, flags); 1104 if (!ret) 1105 ret = do_splice_to(in, offset, opipe, len, flags); 1106 pipe_unlock(opipe); 1107 if (ret > 0) 1108 wakeup_pipe_readers(opipe); 1109 return ret; 1110 } 1111 1112 /* 1113 * Determine where to splice to/from. 1114 */ 1115 long do_splice(struct file *in, loff_t *off_in, struct file *out, 1116 loff_t *off_out, size_t len, unsigned int flags) 1117 { 1118 struct pipe_inode_info *ipipe; 1119 struct pipe_inode_info *opipe; 1120 loff_t offset; 1121 long ret; 1122 1123 if (unlikely(!(in->f_mode & FMODE_READ) || 1124 !(out->f_mode & FMODE_WRITE))) 1125 return -EBADF; 1126 1127 ipipe = get_pipe_info(in, true); 1128 opipe = get_pipe_info(out, true); 1129 1130 if (ipipe && opipe) { 1131 if (off_in || off_out) 1132 return -ESPIPE; 1133 1134 /* Splicing to self would be fun, but... */ 1135 if (ipipe == opipe) 1136 return -EINVAL; 1137 1138 if ((in->f_flags | out->f_flags) & O_NONBLOCK) 1139 flags |= SPLICE_F_NONBLOCK; 1140 1141 return splice_pipe_to_pipe(ipipe, opipe, len, flags); 1142 } 1143 1144 if (ipipe) { 1145 if (off_in) 1146 return -ESPIPE; 1147 if (off_out) { 1148 if (!(out->f_mode & FMODE_PWRITE)) 1149 return -EINVAL; 1150 offset = *off_out; 1151 } else { 1152 offset = out->f_pos; 1153 } 1154 1155 if (unlikely(out->f_flags & O_APPEND)) 1156 return -EINVAL; 1157 1158 ret = rw_verify_area(WRITE, out, &offset, len); 1159 if (unlikely(ret < 0)) 1160 return ret; 1161 1162 if (in->f_flags & O_NONBLOCK) 1163 flags |= SPLICE_F_NONBLOCK; 1164 1165 file_start_write(out); 1166 ret = do_splice_from(ipipe, out, &offset, len, flags); 1167 file_end_write(out); 1168 1169 if (!off_out) 1170 out->f_pos = offset; 1171 else 1172 *off_out = offset; 1173 1174 return ret; 1175 } 1176 1177 if (opipe) { 1178 if (off_out) 1179 return -ESPIPE; 1180 if (off_in) { 1181 if (!(in->f_mode & FMODE_PREAD)) 1182 return -EINVAL; 1183 offset = *off_in; 1184 } else { 1185 offset = in->f_pos; 1186 } 1187 1188 if (out->f_flags & O_NONBLOCK) 1189 flags |= SPLICE_F_NONBLOCK; 1190 1191 ret = splice_file_to_pipe(in, opipe, &offset, len, flags); 1192 if (!off_in) 1193 in->f_pos = offset; 1194 else 1195 *off_in = offset; 1196 1197 return ret; 1198 } 1199 1200 return -EINVAL; 1201 } 1202 1203 static long __do_splice(struct file *in, loff_t __user *off_in, 1204 struct file *out, loff_t __user *off_out, 1205 size_t len, unsigned int flags) 1206 { 1207 struct pipe_inode_info *ipipe; 1208 struct pipe_inode_info *opipe; 1209 loff_t offset, *__off_in = NULL, *__off_out = NULL; 1210 long ret; 1211 1212 ipipe = get_pipe_info(in, true); 1213 opipe = get_pipe_info(out, true); 1214 1215 if (ipipe && off_in) 1216 return -ESPIPE; 1217 if (opipe && off_out) 1218 return -ESPIPE; 1219 1220 if (off_out) { 1221 if (copy_from_user(&offset, off_out, sizeof(loff_t))) 1222 return -EFAULT; 1223 __off_out = &offset; 1224 } 1225 if (off_in) { 1226 if (copy_from_user(&offset, off_in, sizeof(loff_t))) 1227 return -EFAULT; 1228 __off_in = &offset; 1229 } 1230 1231 ret = do_splice(in, __off_in, out, __off_out, len, flags); 1232 if (ret < 0) 1233 return ret; 1234 1235 if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t))) 1236 return -EFAULT; 1237 if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t))) 1238 return -EFAULT; 1239 1240 return ret; 1241 } 1242 1243 static int iter_to_pipe(struct iov_iter *from, 1244 struct pipe_inode_info *pipe, 1245 unsigned flags) 1246 { 1247 struct pipe_buffer buf = { 1248 .ops = &user_page_pipe_buf_ops, 1249 .flags = flags 1250 }; 1251 size_t total = 0; 1252 int ret = 0; 1253 1254 while (iov_iter_count(from)) { 1255 struct page *pages[16]; 1256 ssize_t left; 1257 size_t start; 1258 int i, n; 1259 1260 left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start); 1261 if (left <= 0) { 1262 ret = left; 1263 break; 1264 } 1265 1266 n = DIV_ROUND_UP(left + start, PAGE_SIZE); 1267 for (i = 0; i < n; i++) { 1268 int size = min_t(int, left, PAGE_SIZE - start); 1269 1270 buf.page = pages[i]; 1271 buf.offset = start; 1272 buf.len = size; 1273 ret = add_to_pipe(pipe, &buf); 1274 if (unlikely(ret < 0)) { 1275 iov_iter_revert(from, left); 1276 // this one got dropped by add_to_pipe() 1277 while (++i < n) 1278 put_page(pages[i]); 1279 goto out; 1280 } 1281 total += ret; 1282 left -= size; 1283 start = 0; 1284 } 1285 } 1286 out: 1287 return total ? total : ret; 1288 } 1289 1290 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 1291 struct splice_desc *sd) 1292 { 1293 int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data); 1294 return n == sd->len ? n : -EFAULT; 1295 } 1296 1297 /* 1298 * For lack of a better implementation, implement vmsplice() to userspace 1299 * as a simple copy of the pipes pages to the user iov. 1300 */ 1301 static long vmsplice_to_user(struct file *file, struct iov_iter *iter, 1302 unsigned int flags) 1303 { 1304 struct pipe_inode_info *pipe = get_pipe_info(file, true); 1305 struct splice_desc sd = { 1306 .total_len = iov_iter_count(iter), 1307 .flags = flags, 1308 .u.data = iter 1309 }; 1310 long ret = 0; 1311 1312 if (!pipe) 1313 return -EBADF; 1314 1315 if (sd.total_len) { 1316 pipe_lock(pipe); 1317 ret = __splice_from_pipe(pipe, &sd, pipe_to_user); 1318 pipe_unlock(pipe); 1319 } 1320 1321 return ret; 1322 } 1323 1324 /* 1325 * vmsplice splices a user address range into a pipe. It can be thought of 1326 * as splice-from-memory, where the regular splice is splice-from-file (or 1327 * to file). In both cases the output is a pipe, naturally. 1328 */ 1329 static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter, 1330 unsigned int flags) 1331 { 1332 struct pipe_inode_info *pipe; 1333 long ret = 0; 1334 unsigned buf_flag = 0; 1335 1336 if (flags & SPLICE_F_GIFT) 1337 buf_flag = PIPE_BUF_FLAG_GIFT; 1338 1339 pipe = get_pipe_info(file, true); 1340 if (!pipe) 1341 return -EBADF; 1342 1343 pipe_lock(pipe); 1344 ret = wait_for_space(pipe, flags); 1345 if (!ret) 1346 ret = iter_to_pipe(iter, pipe, buf_flag); 1347 pipe_unlock(pipe); 1348 if (ret > 0) 1349 wakeup_pipe_readers(pipe); 1350 return ret; 1351 } 1352 1353 static int vmsplice_type(struct fd f, int *type) 1354 { 1355 if (!f.file) 1356 return -EBADF; 1357 if (f.file->f_mode & FMODE_WRITE) { 1358 *type = ITER_SOURCE; 1359 } else if (f.file->f_mode & FMODE_READ) { 1360 *type = ITER_DEST; 1361 } else { 1362 fdput(f); 1363 return -EBADF; 1364 } 1365 return 0; 1366 } 1367 1368 /* 1369 * Note that vmsplice only really supports true splicing _from_ user memory 1370 * to a pipe, not the other way around. Splicing from user memory is a simple 1371 * operation that can be supported without any funky alignment restrictions 1372 * or nasty vm tricks. We simply map in the user memory and fill them into 1373 * a pipe. The reverse isn't quite as easy, though. There are two possible 1374 * solutions for that: 1375 * 1376 * - memcpy() the data internally, at which point we might as well just 1377 * do a regular read() on the buffer anyway. 1378 * - Lots of nasty vm tricks, that are neither fast nor flexible (it 1379 * has restriction limitations on both ends of the pipe). 1380 * 1381 * Currently we punt and implement it as a normal copy, see pipe_to_user(). 1382 * 1383 */ 1384 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, 1385 unsigned long, nr_segs, unsigned int, flags) 1386 { 1387 struct iovec iovstack[UIO_FASTIOV]; 1388 struct iovec *iov = iovstack; 1389 struct iov_iter iter; 1390 ssize_t error; 1391 struct fd f; 1392 int type; 1393 1394 if (unlikely(flags & ~SPLICE_F_ALL)) 1395 return -EINVAL; 1396 1397 f = fdget(fd); 1398 error = vmsplice_type(f, &type); 1399 if (error) 1400 return error; 1401 1402 error = import_iovec(type, uiov, nr_segs, 1403 ARRAY_SIZE(iovstack), &iov, &iter); 1404 if (error < 0) 1405 goto out_fdput; 1406 1407 if (!iov_iter_count(&iter)) 1408 error = 0; 1409 else if (type == ITER_SOURCE) 1410 error = vmsplice_to_pipe(f.file, &iter, flags); 1411 else 1412 error = vmsplice_to_user(f.file, &iter, flags); 1413 1414 kfree(iov); 1415 out_fdput: 1416 fdput(f); 1417 return error; 1418 } 1419 1420 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, 1421 int, fd_out, loff_t __user *, off_out, 1422 size_t, len, unsigned int, flags) 1423 { 1424 struct fd in, out; 1425 long error; 1426 1427 if (unlikely(!len)) 1428 return 0; 1429 1430 if (unlikely(flags & ~SPLICE_F_ALL)) 1431 return -EINVAL; 1432 1433 error = -EBADF; 1434 in = fdget(fd_in); 1435 if (in.file) { 1436 out = fdget(fd_out); 1437 if (out.file) { 1438 error = __do_splice(in.file, off_in, out.file, off_out, 1439 len, flags); 1440 fdput(out); 1441 } 1442 fdput(in); 1443 } 1444 return error; 1445 } 1446 1447 /* 1448 * Make sure there's data to read. Wait for input if we can, otherwise 1449 * return an appropriate error. 1450 */ 1451 static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) 1452 { 1453 int ret; 1454 1455 /* 1456 * Check the pipe occupancy without the inode lock first. This function 1457 * is speculative anyways, so missing one is ok. 1458 */ 1459 if (!pipe_empty(pipe->head, pipe->tail)) 1460 return 0; 1461 1462 ret = 0; 1463 pipe_lock(pipe); 1464 1465 while (pipe_empty(pipe->head, pipe->tail)) { 1466 if (signal_pending(current)) { 1467 ret = -ERESTARTSYS; 1468 break; 1469 } 1470 if (!pipe->writers) 1471 break; 1472 if (flags & SPLICE_F_NONBLOCK) { 1473 ret = -EAGAIN; 1474 break; 1475 } 1476 pipe_wait_readable(pipe); 1477 } 1478 1479 pipe_unlock(pipe); 1480 return ret; 1481 } 1482 1483 /* 1484 * Make sure there's writeable room. Wait for room if we can, otherwise 1485 * return an appropriate error. 1486 */ 1487 static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) 1488 { 1489 int ret; 1490 1491 /* 1492 * Check pipe occupancy without the inode lock first. This function 1493 * is speculative anyways, so missing one is ok. 1494 */ 1495 if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) 1496 return 0; 1497 1498 ret = 0; 1499 pipe_lock(pipe); 1500 1501 while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { 1502 if (!pipe->readers) { 1503 send_sig(SIGPIPE, current, 0); 1504 ret = -EPIPE; 1505 break; 1506 } 1507 if (flags & SPLICE_F_NONBLOCK) { 1508 ret = -EAGAIN; 1509 break; 1510 } 1511 if (signal_pending(current)) { 1512 ret = -ERESTARTSYS; 1513 break; 1514 } 1515 pipe_wait_writable(pipe); 1516 } 1517 1518 pipe_unlock(pipe); 1519 return ret; 1520 } 1521 1522 /* 1523 * Splice contents of ipipe to opipe. 1524 */ 1525 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, 1526 struct pipe_inode_info *opipe, 1527 size_t len, unsigned int flags) 1528 { 1529 struct pipe_buffer *ibuf, *obuf; 1530 unsigned int i_head, o_head; 1531 unsigned int i_tail, o_tail; 1532 unsigned int i_mask, o_mask; 1533 int ret = 0; 1534 bool input_wakeup = false; 1535 1536 1537 retry: 1538 ret = ipipe_prep(ipipe, flags); 1539 if (ret) 1540 return ret; 1541 1542 ret = opipe_prep(opipe, flags); 1543 if (ret) 1544 return ret; 1545 1546 /* 1547 * Potential ABBA deadlock, work around it by ordering lock 1548 * grabbing by pipe info address. Otherwise two different processes 1549 * could deadlock (one doing tee from A -> B, the other from B -> A). 1550 */ 1551 pipe_double_lock(ipipe, opipe); 1552 1553 i_tail = ipipe->tail; 1554 i_mask = ipipe->ring_size - 1; 1555 o_head = opipe->head; 1556 o_mask = opipe->ring_size - 1; 1557 1558 do { 1559 size_t o_len; 1560 1561 if (!opipe->readers) { 1562 send_sig(SIGPIPE, current, 0); 1563 if (!ret) 1564 ret = -EPIPE; 1565 break; 1566 } 1567 1568 i_head = ipipe->head; 1569 o_tail = opipe->tail; 1570 1571 if (pipe_empty(i_head, i_tail) && !ipipe->writers) 1572 break; 1573 1574 /* 1575 * Cannot make any progress, because either the input 1576 * pipe is empty or the output pipe is full. 1577 */ 1578 if (pipe_empty(i_head, i_tail) || 1579 pipe_full(o_head, o_tail, opipe->max_usage)) { 1580 /* Already processed some buffers, break */ 1581 if (ret) 1582 break; 1583 1584 if (flags & SPLICE_F_NONBLOCK) { 1585 ret = -EAGAIN; 1586 break; 1587 } 1588 1589 /* 1590 * We raced with another reader/writer and haven't 1591 * managed to process any buffers. A zero return 1592 * value means EOF, so retry instead. 1593 */ 1594 pipe_unlock(ipipe); 1595 pipe_unlock(opipe); 1596 goto retry; 1597 } 1598 1599 ibuf = &ipipe->bufs[i_tail & i_mask]; 1600 obuf = &opipe->bufs[o_head & o_mask]; 1601 1602 if (len >= ibuf->len) { 1603 /* 1604 * Simply move the whole buffer from ipipe to opipe 1605 */ 1606 *obuf = *ibuf; 1607 ibuf->ops = NULL; 1608 i_tail++; 1609 ipipe->tail = i_tail; 1610 input_wakeup = true; 1611 o_len = obuf->len; 1612 o_head++; 1613 opipe->head = o_head; 1614 } else { 1615 /* 1616 * Get a reference to this pipe buffer, 1617 * so we can copy the contents over. 1618 */ 1619 if (!pipe_buf_get(ipipe, ibuf)) { 1620 if (ret == 0) 1621 ret = -EFAULT; 1622 break; 1623 } 1624 *obuf = *ibuf; 1625 1626 /* 1627 * Don't inherit the gift and merge flags, we need to 1628 * prevent multiple steals of this page. 1629 */ 1630 obuf->flags &= ~PIPE_BUF_FLAG_GIFT; 1631 obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE; 1632 1633 obuf->len = len; 1634 ibuf->offset += len; 1635 ibuf->len -= len; 1636 o_len = len; 1637 o_head++; 1638 opipe->head = o_head; 1639 } 1640 ret += o_len; 1641 len -= o_len; 1642 } while (len); 1643 1644 pipe_unlock(ipipe); 1645 pipe_unlock(opipe); 1646 1647 /* 1648 * If we put data in the output pipe, wakeup any potential readers. 1649 */ 1650 if (ret > 0) 1651 wakeup_pipe_readers(opipe); 1652 1653 if (input_wakeup) 1654 wakeup_pipe_writers(ipipe); 1655 1656 return ret; 1657 } 1658 1659 /* 1660 * Link contents of ipipe to opipe. 1661 */ 1662 static int link_pipe(struct pipe_inode_info *ipipe, 1663 struct pipe_inode_info *opipe, 1664 size_t len, unsigned int flags) 1665 { 1666 struct pipe_buffer *ibuf, *obuf; 1667 unsigned int i_head, o_head; 1668 unsigned int i_tail, o_tail; 1669 unsigned int i_mask, o_mask; 1670 int ret = 0; 1671 1672 /* 1673 * Potential ABBA deadlock, work around it by ordering lock 1674 * grabbing by pipe info address. Otherwise two different processes 1675 * could deadlock (one doing tee from A -> B, the other from B -> A). 1676 */ 1677 pipe_double_lock(ipipe, opipe); 1678 1679 i_tail = ipipe->tail; 1680 i_mask = ipipe->ring_size - 1; 1681 o_head = opipe->head; 1682 o_mask = opipe->ring_size - 1; 1683 1684 do { 1685 if (!opipe->readers) { 1686 send_sig(SIGPIPE, current, 0); 1687 if (!ret) 1688 ret = -EPIPE; 1689 break; 1690 } 1691 1692 i_head = ipipe->head; 1693 o_tail = opipe->tail; 1694 1695 /* 1696 * If we have iterated all input buffers or run out of 1697 * output room, break. 1698 */ 1699 if (pipe_empty(i_head, i_tail) || 1700 pipe_full(o_head, o_tail, opipe->max_usage)) 1701 break; 1702 1703 ibuf = &ipipe->bufs[i_tail & i_mask]; 1704 obuf = &opipe->bufs[o_head & o_mask]; 1705 1706 /* 1707 * Get a reference to this pipe buffer, 1708 * so we can copy the contents over. 1709 */ 1710 if (!pipe_buf_get(ipipe, ibuf)) { 1711 if (ret == 0) 1712 ret = -EFAULT; 1713 break; 1714 } 1715 1716 *obuf = *ibuf; 1717 1718 /* 1719 * Don't inherit the gift and merge flag, we need to prevent 1720 * multiple steals of this page. 1721 */ 1722 obuf->flags &= ~PIPE_BUF_FLAG_GIFT; 1723 obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE; 1724 1725 if (obuf->len > len) 1726 obuf->len = len; 1727 ret += obuf->len; 1728 len -= obuf->len; 1729 1730 o_head++; 1731 opipe->head = o_head; 1732 i_tail++; 1733 } while (len); 1734 1735 pipe_unlock(ipipe); 1736 pipe_unlock(opipe); 1737 1738 /* 1739 * If we put data in the output pipe, wakeup any potential readers. 1740 */ 1741 if (ret > 0) 1742 wakeup_pipe_readers(opipe); 1743 1744 return ret; 1745 } 1746 1747 /* 1748 * This is a tee(1) implementation that works on pipes. It doesn't copy 1749 * any data, it simply references the 'in' pages on the 'out' pipe. 1750 * The 'flags' used are the SPLICE_F_* variants, currently the only 1751 * applicable one is SPLICE_F_NONBLOCK. 1752 */ 1753 long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) 1754 { 1755 struct pipe_inode_info *ipipe = get_pipe_info(in, true); 1756 struct pipe_inode_info *opipe = get_pipe_info(out, true); 1757 int ret = -EINVAL; 1758 1759 if (unlikely(!(in->f_mode & FMODE_READ) || 1760 !(out->f_mode & FMODE_WRITE))) 1761 return -EBADF; 1762 1763 /* 1764 * Duplicate the contents of ipipe to opipe without actually 1765 * copying the data. 1766 */ 1767 if (ipipe && opipe && ipipe != opipe) { 1768 if ((in->f_flags | out->f_flags) & O_NONBLOCK) 1769 flags |= SPLICE_F_NONBLOCK; 1770 1771 /* 1772 * Keep going, unless we encounter an error. The ipipe/opipe 1773 * ordering doesn't really matter. 1774 */ 1775 ret = ipipe_prep(ipipe, flags); 1776 if (!ret) { 1777 ret = opipe_prep(opipe, flags); 1778 if (!ret) 1779 ret = link_pipe(ipipe, opipe, len, flags); 1780 } 1781 } 1782 1783 return ret; 1784 } 1785 1786 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) 1787 { 1788 struct fd in, out; 1789 int error; 1790 1791 if (unlikely(flags & ~SPLICE_F_ALL)) 1792 return -EINVAL; 1793 1794 if (unlikely(!len)) 1795 return 0; 1796 1797 error = -EBADF; 1798 in = fdget(fdin); 1799 if (in.file) { 1800 out = fdget(fdout); 1801 if (out.file) { 1802 error = do_tee(in.file, out.file, len, flags); 1803 fdput(out); 1804 } 1805 fdput(in); 1806 } 1807 1808 return error; 1809 } 1810