1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * "splice": joining two ropes together by interweaving their strands. 4 * 5 * This is the "extended pipe" functionality, where a pipe is used as 6 * an arbitrary in-memory buffer. Think of a pipe as a small kernel 7 * buffer that you can use to transfer data from one end to the other. 8 * 9 * The traditional unix read/write is extended with a "splice()" operation 10 * that transfers data buffers to or from a pipe buffer. 11 * 12 * Named by Larry McVoy, original implementation from Linus, extended by 13 * Jens to support splicing to files, network, direct splicing, etc and 14 * fixing lots of bugs. 15 * 16 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk> 17 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org> 18 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> 19 * 20 */ 21 #include <linux/bvec.h> 22 #include <linux/fs.h> 23 #include <linux/file.h> 24 #include <linux/pagemap.h> 25 #include <linux/splice.h> 26 #include <linux/memcontrol.h> 27 #include <linux/mm_inline.h> 28 #include <linux/swap.h> 29 #include <linux/writeback.h> 30 #include <linux/export.h> 31 #include <linux/syscalls.h> 32 #include <linux/uio.h> 33 #include <linux/security.h> 34 #include <linux/gfp.h> 35 #include <linux/socket.h> 36 #include <linux/sched/signal.h> 37 38 #include "internal.h" 39 40 /* 41 * Attempt to steal a page from a pipe buffer. This should perhaps go into 42 * a vm helper function, it's already simplified quite a bit by the 43 * addition of remove_mapping(). If success is returned, the caller may 44 * attempt to reuse this page for another destination. 
45 */ 46 static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe, 47 struct pipe_buffer *buf) 48 { 49 struct page *page = buf->page; 50 struct address_space *mapping; 51 52 lock_page(page); 53 54 mapping = page_mapping(page); 55 if (mapping) { 56 WARN_ON(!PageUptodate(page)); 57 58 /* 59 * At least for ext2 with nobh option, we need to wait on 60 * writeback completing on this page, since we'll remove it 61 * from the pagecache. Otherwise truncate wont wait on the 62 * page, allowing the disk blocks to be reused by someone else 63 * before we actually wrote our data to them. fs corruption 64 * ensues. 65 */ 66 wait_on_page_writeback(page); 67 68 if (page_has_private(page) && 69 !try_to_release_page(page, GFP_KERNEL)) 70 goto out_unlock; 71 72 /* 73 * If we succeeded in removing the mapping, set LRU flag 74 * and return good. 75 */ 76 if (remove_mapping(mapping, page)) { 77 buf->flags |= PIPE_BUF_FLAG_LRU; 78 return true; 79 } 80 } 81 82 /* 83 * Raced with truncate or failed to remove page from current 84 * address space, unlock and return failure. 85 */ 86 out_unlock: 87 unlock_page(page); 88 return false; 89 } 90 91 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, 92 struct pipe_buffer *buf) 93 { 94 put_page(buf->page); 95 buf->flags &= ~PIPE_BUF_FLAG_LRU; 96 } 97 98 /* 99 * Check whether the contents of buf is OK to access. Since the content 100 * is a page cache page, IO may be in flight. 101 */ 102 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, 103 struct pipe_buffer *buf) 104 { 105 struct page *page = buf->page; 106 int err; 107 108 if (!PageUptodate(page)) { 109 lock_page(page); 110 111 /* 112 * Page got truncated/unhashed. This will cause a 0-byte 113 * splice, if this is the first page. 114 */ 115 if (!page->mapping) { 116 err = -ENODATA; 117 goto error; 118 } 119 120 /* 121 * Uh oh, read-error from disk. 
122 */ 123 if (!PageUptodate(page)) { 124 err = -EIO; 125 goto error; 126 } 127 128 /* 129 * Page is ok afterall, we are done. 130 */ 131 unlock_page(page); 132 } 133 134 return 0; 135 error: 136 unlock_page(page); 137 return err; 138 } 139 140 const struct pipe_buf_operations page_cache_pipe_buf_ops = { 141 .confirm = page_cache_pipe_buf_confirm, 142 .release = page_cache_pipe_buf_release, 143 .try_steal = page_cache_pipe_buf_try_steal, 144 .get = generic_pipe_buf_get, 145 }; 146 147 static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe, 148 struct pipe_buffer *buf) 149 { 150 if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) 151 return false; 152 153 buf->flags |= PIPE_BUF_FLAG_LRU; 154 return generic_pipe_buf_try_steal(pipe, buf); 155 } 156 157 static const struct pipe_buf_operations user_page_pipe_buf_ops = { 158 .release = page_cache_pipe_buf_release, 159 .try_steal = user_page_pipe_buf_try_steal, 160 .get = generic_pipe_buf_get, 161 }; 162 163 static void wakeup_pipe_readers(struct pipe_inode_info *pipe) 164 { 165 smp_mb(); 166 if (waitqueue_active(&pipe->rd_wait)) 167 wake_up_interruptible(&pipe->rd_wait); 168 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 169 } 170 171 /** 172 * splice_to_pipe - fill passed data into a pipe 173 * @pipe: pipe to fill 174 * @spd: data to fill 175 * 176 * Description: 177 * @spd contains a map of pages and len/offset tuples, along with 178 * the struct pipe_buf_operations associated with these pages. This 179 * function will link that data to the pipe. 
180 * 181 */ 182 ssize_t splice_to_pipe(struct pipe_inode_info *pipe, 183 struct splice_pipe_desc *spd) 184 { 185 unsigned int spd_pages = spd->nr_pages; 186 unsigned int tail = pipe->tail; 187 unsigned int head = pipe->head; 188 unsigned int mask = pipe->ring_size - 1; 189 int ret = 0, page_nr = 0; 190 191 if (!spd_pages) 192 return 0; 193 194 if (unlikely(!pipe->readers)) { 195 send_sig(SIGPIPE, current, 0); 196 ret = -EPIPE; 197 goto out; 198 } 199 200 while (!pipe_full(head, tail, pipe->max_usage)) { 201 struct pipe_buffer *buf = &pipe->bufs[head & mask]; 202 203 buf->page = spd->pages[page_nr]; 204 buf->offset = spd->partial[page_nr].offset; 205 buf->len = spd->partial[page_nr].len; 206 buf->private = spd->partial[page_nr].private; 207 buf->ops = spd->ops; 208 buf->flags = 0; 209 210 head++; 211 pipe->head = head; 212 page_nr++; 213 ret += buf->len; 214 215 if (!--spd->nr_pages) 216 break; 217 } 218 219 if (!ret) 220 ret = -EAGAIN; 221 222 out: 223 while (page_nr < spd_pages) 224 spd->spd_release(spd, page_nr++); 225 226 return ret; 227 } 228 EXPORT_SYMBOL_GPL(splice_to_pipe); 229 230 ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) 231 { 232 unsigned int head = pipe->head; 233 unsigned int tail = pipe->tail; 234 unsigned int mask = pipe->ring_size - 1; 235 int ret; 236 237 if (unlikely(!pipe->readers)) { 238 send_sig(SIGPIPE, current, 0); 239 ret = -EPIPE; 240 } else if (pipe_full(head, tail, pipe->max_usage)) { 241 ret = -EAGAIN; 242 } else { 243 pipe->bufs[head & mask] = *buf; 244 pipe->head = head + 1; 245 return buf->len; 246 } 247 pipe_buf_release(pipe, buf); 248 return ret; 249 } 250 EXPORT_SYMBOL(add_to_pipe); 251 252 /* 253 * Check if we need to grow the arrays holding pages and partial page 254 * descriptions. 
255 */ 256 int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) 257 { 258 unsigned int max_usage = READ_ONCE(pipe->max_usage); 259 260 spd->nr_pages_max = max_usage; 261 if (max_usage <= PIPE_DEF_BUFFERS) 262 return 0; 263 264 spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL); 265 spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page), 266 GFP_KERNEL); 267 268 if (spd->pages && spd->partial) 269 return 0; 270 271 kfree(spd->pages); 272 kfree(spd->partial); 273 return -ENOMEM; 274 } 275 276 void splice_shrink_spd(struct splice_pipe_desc *spd) 277 { 278 if (spd->nr_pages_max <= PIPE_DEF_BUFFERS) 279 return; 280 281 kfree(spd->pages); 282 kfree(spd->partial); 283 } 284 285 /** 286 * generic_file_splice_read - splice data from file to a pipe 287 * @in: file to splice from 288 * @ppos: position in @in 289 * @pipe: pipe to splice to 290 * @len: number of bytes to splice 291 * @flags: splice modifier flags 292 * 293 * Description: 294 * Will read pages from given file and fill them into a pipe. Can be 295 * used as long as it has more or less sane ->read_iter(). 296 * 297 */ 298 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, 299 struct pipe_inode_info *pipe, size_t len, 300 unsigned int flags) 301 { 302 struct iov_iter to; 303 struct kiocb kiocb; 304 unsigned int i_head; 305 int ret; 306 307 iov_iter_pipe(&to, READ, pipe, len); 308 i_head = to.head; 309 init_sync_kiocb(&kiocb, in); 310 kiocb.ki_pos = *ppos; 311 ret = call_read_iter(in, &kiocb, &to); 312 if (ret > 0) { 313 *ppos = kiocb.ki_pos; 314 file_accessed(in); 315 } else if (ret < 0) { 316 to.head = i_head; 317 to.iov_offset = 0; 318 iov_iter_advance(&to, 0); /* to free what was emitted */ 319 /* 320 * callers of ->splice_read() expect -EAGAIN on 321 * "can't put anything in there", rather than -EFAULT. 
322 */ 323 if (ret == -EFAULT) 324 ret = -EAGAIN; 325 } 326 327 return ret; 328 } 329 EXPORT_SYMBOL(generic_file_splice_read); 330 331 const struct pipe_buf_operations default_pipe_buf_ops = { 332 .release = generic_pipe_buf_release, 333 .try_steal = generic_pipe_buf_try_steal, 334 .get = generic_pipe_buf_get, 335 }; 336 337 /* Pipe buffer operations for a socket and similar. */ 338 const struct pipe_buf_operations nosteal_pipe_buf_ops = { 339 .release = generic_pipe_buf_release, 340 .get = generic_pipe_buf_get, 341 }; 342 EXPORT_SYMBOL(nosteal_pipe_buf_ops); 343 344 static ssize_t kernel_readv(struct file *file, const struct kvec *vec, 345 unsigned long vlen, loff_t offset) 346 { 347 mm_segment_t old_fs; 348 loff_t pos = offset; 349 ssize_t res; 350 351 old_fs = get_fs(); 352 set_fs(KERNEL_DS); 353 /* The cast to a user pointer is valid due to the set_fs() */ 354 res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0); 355 set_fs(old_fs); 356 357 return res; 358 } 359 360 static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, 361 struct pipe_inode_info *pipe, size_t len, 362 unsigned int flags) 363 { 364 struct kvec *vec, __vec[PIPE_DEF_BUFFERS]; 365 struct iov_iter to; 366 struct page **pages; 367 unsigned int nr_pages; 368 unsigned int mask; 369 size_t offset, base, copied = 0; 370 ssize_t res; 371 int i; 372 373 if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) 374 return -EAGAIN; 375 376 /* 377 * Try to keep page boundaries matching to source pagecache ones - 378 * it probably won't be much help, but... 
379 */ 380 offset = *ppos & ~PAGE_MASK; 381 382 iov_iter_pipe(&to, READ, pipe, len + offset); 383 384 res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &base); 385 if (res <= 0) 386 return -ENOMEM; 387 388 nr_pages = DIV_ROUND_UP(res + base, PAGE_SIZE); 389 390 vec = __vec; 391 if (nr_pages > PIPE_DEF_BUFFERS) { 392 vec = kmalloc_array(nr_pages, sizeof(struct kvec), GFP_KERNEL); 393 if (unlikely(!vec)) { 394 res = -ENOMEM; 395 goto out; 396 } 397 } 398 399 mask = pipe->ring_size - 1; 400 pipe->bufs[to.head & mask].offset = offset; 401 pipe->bufs[to.head & mask].len -= offset; 402 403 for (i = 0; i < nr_pages; i++) { 404 size_t this_len = min_t(size_t, len, PAGE_SIZE - offset); 405 vec[i].iov_base = page_address(pages[i]) + offset; 406 vec[i].iov_len = this_len; 407 len -= this_len; 408 offset = 0; 409 } 410 411 res = kernel_readv(in, vec, nr_pages, *ppos); 412 if (res > 0) { 413 copied = res; 414 *ppos += res; 415 } 416 417 if (vec != __vec) 418 kfree(vec); 419 out: 420 for (i = 0; i < nr_pages; i++) 421 put_page(pages[i]); 422 kvfree(pages); 423 iov_iter_advance(&to, copied); /* truncates and discards */ 424 return res; 425 } 426 427 /* 428 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' 429 * using sendpage(). Return the number of bytes sent. 430 */ 431 static int pipe_to_sendpage(struct pipe_inode_info *pipe, 432 struct pipe_buffer *buf, struct splice_desc *sd) 433 { 434 struct file *file = sd->u.file; 435 loff_t pos = sd->pos; 436 int more; 437 438 if (!likely(file->f_op->sendpage)) 439 return -EINVAL; 440 441 more = (sd->flags & SPLICE_F_MORE) ? 
MSG_MORE : 0; 442 443 if (sd->len < sd->total_len && 444 pipe_occupancy(pipe->head, pipe->tail) > 1) 445 more |= MSG_SENDPAGE_NOTLAST; 446 447 return file->f_op->sendpage(file, buf->page, buf->offset, 448 sd->len, &pos, more); 449 } 450 451 static void wakeup_pipe_writers(struct pipe_inode_info *pipe) 452 { 453 smp_mb(); 454 if (waitqueue_active(&pipe->wr_wait)) 455 wake_up_interruptible(&pipe->wr_wait); 456 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 457 } 458 459 /** 460 * splice_from_pipe_feed - feed available data from a pipe to a file 461 * @pipe: pipe to splice from 462 * @sd: information to @actor 463 * @actor: handler that splices the data 464 * 465 * Description: 466 * This function loops over the pipe and calls @actor to do the 467 * actual moving of a single struct pipe_buffer to the desired 468 * destination. It returns when there's no more buffers left in 469 * the pipe or if the requested number of bytes (@sd->total_len) 470 * have been copied. It returns a positive number (one) if the 471 * pipe needs to be filled with more data, zero if the required 472 * number of bytes have been copied and -errno on error. 473 * 474 * This, together with splice_from_pipe_{begin,end,next}, may be 475 * used to implement the functionality of __splice_from_pipe() when 476 * locking is required around copying the pipe buffers to the 477 * destination. 
478 */ 479 static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, 480 splice_actor *actor) 481 { 482 unsigned int head = pipe->head; 483 unsigned int tail = pipe->tail; 484 unsigned int mask = pipe->ring_size - 1; 485 int ret; 486 487 while (!pipe_empty(head, tail)) { 488 struct pipe_buffer *buf = &pipe->bufs[tail & mask]; 489 490 sd->len = buf->len; 491 if (sd->len > sd->total_len) 492 sd->len = sd->total_len; 493 494 ret = pipe_buf_confirm(pipe, buf); 495 if (unlikely(ret)) { 496 if (ret == -ENODATA) 497 ret = 0; 498 return ret; 499 } 500 501 ret = actor(pipe, buf, sd); 502 if (ret <= 0) 503 return ret; 504 505 buf->offset += ret; 506 buf->len -= ret; 507 508 sd->num_spliced += ret; 509 sd->len -= ret; 510 sd->pos += ret; 511 sd->total_len -= ret; 512 513 if (!buf->len) { 514 pipe_buf_release(pipe, buf); 515 tail++; 516 pipe->tail = tail; 517 if (pipe->files) 518 sd->need_wakeup = true; 519 } 520 521 if (!sd->total_len) 522 return 0; 523 } 524 525 return 1; 526 } 527 528 /* We know we have a pipe buffer, but maybe it's empty? */ 529 static inline bool eat_empty_buffer(struct pipe_inode_info *pipe) 530 { 531 unsigned int tail = pipe->tail; 532 unsigned int mask = pipe->ring_size - 1; 533 struct pipe_buffer *buf = &pipe->bufs[tail & mask]; 534 535 if (unlikely(!buf->len)) { 536 pipe_buf_release(pipe, buf); 537 pipe->tail = tail+1; 538 return true; 539 } 540 541 return false; 542 } 543 544 /** 545 * splice_from_pipe_next - wait for some data to splice from 546 * @pipe: pipe to splice from 547 * @sd: information about the splice operation 548 * 549 * Description: 550 * This function will wait for some data and return a positive 551 * value (one) if pipe buffers are available. It will return zero 552 * or -errno if no more data needs to be spliced. 
553 */ 554 static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) 555 { 556 /* 557 * Check for signal early to make process killable when there are 558 * always buffers available 559 */ 560 if (signal_pending(current)) 561 return -ERESTARTSYS; 562 563 repeat: 564 while (pipe_empty(pipe->head, pipe->tail)) { 565 if (!pipe->writers) 566 return 0; 567 568 if (sd->num_spliced) 569 return 0; 570 571 if (sd->flags & SPLICE_F_NONBLOCK) 572 return -EAGAIN; 573 574 if (signal_pending(current)) 575 return -ERESTARTSYS; 576 577 if (sd->need_wakeup) { 578 wakeup_pipe_writers(pipe); 579 sd->need_wakeup = false; 580 } 581 582 pipe_wait_readable(pipe); 583 } 584 585 if (eat_empty_buffer(pipe)) 586 goto repeat; 587 588 return 1; 589 } 590 591 /** 592 * splice_from_pipe_begin - start splicing from pipe 593 * @sd: information about the splice operation 594 * 595 * Description: 596 * This function should be called before a loop containing 597 * splice_from_pipe_next() and splice_from_pipe_feed() to 598 * initialize the necessary fields of @sd. 599 */ 600 static void splice_from_pipe_begin(struct splice_desc *sd) 601 { 602 sd->num_spliced = 0; 603 sd->need_wakeup = false; 604 } 605 606 /** 607 * splice_from_pipe_end - finish splicing from pipe 608 * @pipe: pipe to splice from 609 * @sd: information about the splice operation 610 * 611 * Description: 612 * This function will wake up pipe writers if necessary. It should 613 * be called after a loop containing splice_from_pipe_next() and 614 * splice_from_pipe_feed(). 
615 */ 616 static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) 617 { 618 if (sd->need_wakeup) 619 wakeup_pipe_writers(pipe); 620 } 621 622 /** 623 * __splice_from_pipe - splice data from a pipe to given actor 624 * @pipe: pipe to splice from 625 * @sd: information to @actor 626 * @actor: handler that splices the data 627 * 628 * Description: 629 * This function does little more than loop over the pipe and call 630 * @actor to do the actual moving of a single struct pipe_buffer to 631 * the desired destination. See pipe_to_file, pipe_to_sendpage, or 632 * pipe_to_user. 633 * 634 */ 635 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, 636 splice_actor *actor) 637 { 638 int ret; 639 640 splice_from_pipe_begin(sd); 641 do { 642 cond_resched(); 643 ret = splice_from_pipe_next(pipe, sd); 644 if (ret > 0) 645 ret = splice_from_pipe_feed(pipe, sd, actor); 646 } while (ret > 0); 647 splice_from_pipe_end(pipe, sd); 648 649 return sd->num_spliced ? sd->num_spliced : ret; 650 } 651 EXPORT_SYMBOL(__splice_from_pipe); 652 653 /** 654 * splice_from_pipe - splice data from a pipe to a file 655 * @pipe: pipe to splice from 656 * @out: file to splice to 657 * @ppos: position in @out 658 * @len: how many bytes to splice 659 * @flags: splice modifier flags 660 * @actor: handler that splices the data 661 * 662 * Description: 663 * See __splice_from_pipe. This function locks the pipe inode, 664 * otherwise it's identical to __splice_from_pipe(). 
665 * 666 */ 667 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, 668 loff_t *ppos, size_t len, unsigned int flags, 669 splice_actor *actor) 670 { 671 ssize_t ret; 672 struct splice_desc sd = { 673 .total_len = len, 674 .flags = flags, 675 .pos = *ppos, 676 .u.file = out, 677 }; 678 679 pipe_lock(pipe); 680 ret = __splice_from_pipe(pipe, &sd, actor); 681 pipe_unlock(pipe); 682 683 return ret; 684 } 685 686 /** 687 * iter_file_splice_write - splice data from a pipe to a file 688 * @pipe: pipe info 689 * @out: file to write to 690 * @ppos: position in @out 691 * @len: number of bytes to splice 692 * @flags: splice modifier flags 693 * 694 * Description: 695 * Will either move or copy pages (determined by @flags options) from 696 * the given pipe inode to the given file. 697 * This one is ->write_iter-based. 698 * 699 */ 700 ssize_t 701 iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, 702 loff_t *ppos, size_t len, unsigned int flags) 703 { 704 struct splice_desc sd = { 705 .total_len = len, 706 .flags = flags, 707 .pos = *ppos, 708 .u.file = out, 709 }; 710 int nbufs = pipe->max_usage; 711 struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec), 712 GFP_KERNEL); 713 ssize_t ret; 714 715 if (unlikely(!array)) 716 return -ENOMEM; 717 718 pipe_lock(pipe); 719 720 splice_from_pipe_begin(&sd); 721 while (sd.total_len) { 722 struct iov_iter from; 723 unsigned int head, tail, mask; 724 size_t left; 725 int n; 726 727 ret = splice_from_pipe_next(pipe, &sd); 728 if (ret <= 0) 729 break; 730 731 if (unlikely(nbufs < pipe->max_usage)) { 732 kfree(array); 733 nbufs = pipe->max_usage; 734 array = kcalloc(nbufs, sizeof(struct bio_vec), 735 GFP_KERNEL); 736 if (!array) { 737 ret = -ENOMEM; 738 break; 739 } 740 } 741 742 head = pipe->head; 743 tail = pipe->tail; 744 mask = pipe->ring_size - 1; 745 746 /* build the vector */ 747 left = sd.total_len; 748 for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++, n++) { 749 
struct pipe_buffer *buf = &pipe->bufs[tail & mask]; 750 size_t this_len = buf->len; 751 752 if (this_len > left) 753 this_len = left; 754 755 ret = pipe_buf_confirm(pipe, buf); 756 if (unlikely(ret)) { 757 if (ret == -ENODATA) 758 ret = 0; 759 goto done; 760 } 761 762 array[n].bv_page = buf->page; 763 array[n].bv_len = this_len; 764 array[n].bv_offset = buf->offset; 765 left -= this_len; 766 } 767 768 iov_iter_bvec(&from, WRITE, array, n, sd.total_len - left); 769 ret = vfs_iter_write(out, &from, &sd.pos, 0); 770 if (ret <= 0) 771 break; 772 773 sd.num_spliced += ret; 774 sd.total_len -= ret; 775 *ppos = sd.pos; 776 777 /* dismiss the fully eaten buffers, adjust the partial one */ 778 tail = pipe->tail; 779 while (ret) { 780 struct pipe_buffer *buf = &pipe->bufs[tail & mask]; 781 if (ret >= buf->len) { 782 ret -= buf->len; 783 buf->len = 0; 784 pipe_buf_release(pipe, buf); 785 tail++; 786 pipe->tail = tail; 787 if (pipe->files) 788 sd.need_wakeup = true; 789 } else { 790 buf->offset += ret; 791 buf->len -= ret; 792 ret = 0; 793 } 794 } 795 } 796 done: 797 kfree(array); 798 splice_from_pipe_end(pipe, &sd); 799 800 pipe_unlock(pipe); 801 802 if (sd.num_spliced) 803 ret = sd.num_spliced; 804 805 return ret; 806 } 807 808 EXPORT_SYMBOL(iter_file_splice_write); 809 810 static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 811 struct splice_desc *sd) 812 { 813 int ret; 814 void *data; 815 loff_t tmp = sd->pos; 816 817 data = kmap(buf->page); 818 ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp); 819 kunmap(buf->page); 820 821 return ret; 822 } 823 824 static ssize_t default_file_splice_write(struct pipe_inode_info *pipe, 825 struct file *out, loff_t *ppos, 826 size_t len, unsigned int flags) 827 { 828 ssize_t ret; 829 830 ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf); 831 if (ret > 0) 832 *ppos += ret; 833 834 return ret; 835 } 836 837 /** 838 * generic_splice_sendpage - splice data from a pipe to a 
socket 839 * @pipe: pipe to splice from 840 * @out: socket to write to 841 * @ppos: position in @out 842 * @len: number of bytes to splice 843 * @flags: splice modifier flags 844 * 845 * Description: 846 * Will send @len bytes from the pipe to a network socket. No data copying 847 * is involved. 848 * 849 */ 850 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, 851 loff_t *ppos, size_t len, unsigned int flags) 852 { 853 return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); 854 } 855 856 EXPORT_SYMBOL(generic_splice_sendpage); 857 858 /* 859 * Attempt to initiate a splice from pipe to file. 860 */ 861 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, 862 loff_t *ppos, size_t len, unsigned int flags) 863 { 864 if (out->f_op->splice_write) 865 return out->f_op->splice_write(pipe, out, ppos, len, flags); 866 return default_file_splice_write(pipe, out, ppos, len, flags); 867 } 868 869 /* 870 * Attempt to initiate a splice from a file to a pipe. 871 */ 872 static long do_splice_to(struct file *in, loff_t *ppos, 873 struct pipe_inode_info *pipe, size_t len, 874 unsigned int flags) 875 { 876 int ret; 877 878 if (unlikely(!(in->f_mode & FMODE_READ))) 879 return -EBADF; 880 881 ret = rw_verify_area(READ, in, ppos, len); 882 if (unlikely(ret < 0)) 883 return ret; 884 885 if (unlikely(len > MAX_RW_COUNT)) 886 len = MAX_RW_COUNT; 887 888 if (in->f_op->splice_read) 889 return in->f_op->splice_read(in, ppos, pipe, len, flags); 890 return default_file_splice_read(in, ppos, pipe, len, flags); 891 } 892 893 /** 894 * splice_direct_to_actor - splices data directly between two non-pipes 895 * @in: file to splice from 896 * @sd: actor information on where to splice to 897 * @actor: handles the data splicing 898 * 899 * Description: 900 * This is a special case helper to splice directly between two 901 * points, without requiring an explicit pipe. 
Internally an allocated 902 * pipe is cached in the process, and reused during the lifetime of 903 * that process. 904 * 905 */ 906 ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, 907 splice_direct_actor *actor) 908 { 909 struct pipe_inode_info *pipe; 910 long ret, bytes; 911 umode_t i_mode; 912 size_t len; 913 int i, flags, more; 914 915 /* 916 * We require the input being a regular file, as we don't want to 917 * randomly drop data for eg socket -> socket splicing. Use the 918 * piped splicing for that! 919 */ 920 i_mode = file_inode(in)->i_mode; 921 if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode))) 922 return -EINVAL; 923 924 /* 925 * neither in nor out is a pipe, setup an internal pipe attached to 926 * 'out' and transfer the wanted data from 'in' to 'out' through that 927 */ 928 pipe = current->splice_pipe; 929 if (unlikely(!pipe)) { 930 pipe = alloc_pipe_info(); 931 if (!pipe) 932 return -ENOMEM; 933 934 /* 935 * We don't have an immediate reader, but we'll read the stuff 936 * out of the pipe right after the splice_to_pipe(). So set 937 * PIPE_READERS appropriately. 938 */ 939 pipe->readers = 1; 940 941 current->splice_pipe = pipe; 942 } 943 944 /* 945 * Do the splice. 946 */ 947 ret = 0; 948 bytes = 0; 949 len = sd->total_len; 950 flags = sd->flags; 951 952 /* 953 * Don't block on output, we have to drain the direct pipe. 954 */ 955 sd->flags &= ~SPLICE_F_NONBLOCK; 956 more = sd->flags & SPLICE_F_MORE; 957 958 WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail)); 959 960 while (len) { 961 unsigned int p_space; 962 size_t read_len; 963 loff_t pos = sd->pos, prev_pos = pos; 964 965 /* Don't try to read more the pipe has space for. 
*/ 966 p_space = pipe->max_usage - 967 pipe_occupancy(pipe->head, pipe->tail); 968 read_len = min_t(size_t, len, p_space << PAGE_SHIFT); 969 ret = do_splice_to(in, &pos, pipe, read_len, flags); 970 if (unlikely(ret <= 0)) 971 goto out_release; 972 973 read_len = ret; 974 sd->total_len = read_len; 975 976 /* 977 * If more data is pending, set SPLICE_F_MORE 978 * If this is the last data and SPLICE_F_MORE was not set 979 * initially, clears it. 980 */ 981 if (read_len < len) 982 sd->flags |= SPLICE_F_MORE; 983 else if (!more) 984 sd->flags &= ~SPLICE_F_MORE; 985 /* 986 * NOTE: nonblocking mode only applies to the input. We 987 * must not do the output in nonblocking mode as then we 988 * could get stuck data in the internal pipe: 989 */ 990 ret = actor(pipe, sd); 991 if (unlikely(ret <= 0)) { 992 sd->pos = prev_pos; 993 goto out_release; 994 } 995 996 bytes += ret; 997 len -= ret; 998 sd->pos = pos; 999 1000 if (ret < read_len) { 1001 sd->pos = prev_pos + ret; 1002 goto out_release; 1003 } 1004 } 1005 1006 done: 1007 pipe->tail = pipe->head = 0; 1008 file_accessed(in); 1009 return bytes; 1010 1011 out_release: 1012 /* 1013 * If we did an incomplete transfer we must release 1014 * the pipe buffers in question: 1015 */ 1016 for (i = 0; i < pipe->ring_size; i++) { 1017 struct pipe_buffer *buf = &pipe->bufs[i]; 1018 1019 if (buf->ops) 1020 pipe_buf_release(pipe, buf); 1021 } 1022 1023 if (!bytes) 1024 bytes = ret; 1025 1026 goto done; 1027 } 1028 EXPORT_SYMBOL(splice_direct_to_actor); 1029 1030 static int direct_splice_actor(struct pipe_inode_info *pipe, 1031 struct splice_desc *sd) 1032 { 1033 struct file *file = sd->u.file; 1034 1035 return do_splice_from(pipe, file, sd->opos, sd->total_len, 1036 sd->flags); 1037 } 1038 1039 /** 1040 * do_splice_direct - splices data directly between two files 1041 * @in: file to splice from 1042 * @ppos: input file offset 1043 * @out: file to splice to 1044 * @opos: output file offset 1045 * @len: number of bytes to splice 1046 * 
@flags: splice modifier flags 1047 * 1048 * Description: 1049 * For use by do_sendfile(). splice can easily emulate sendfile, but 1050 * doing it in the application would incur an extra system call 1051 * (splice in + splice out, as compared to just sendfile()). So this helper 1052 * can splice directly through a process-private pipe. 1053 * 1054 */ 1055 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, 1056 loff_t *opos, size_t len, unsigned int flags) 1057 { 1058 struct splice_desc sd = { 1059 .len = len, 1060 .total_len = len, 1061 .flags = flags, 1062 .pos = *ppos, 1063 .u.file = out, 1064 .opos = opos, 1065 }; 1066 long ret; 1067 1068 if (unlikely(!(out->f_mode & FMODE_WRITE))) 1069 return -EBADF; 1070 1071 if (unlikely(out->f_flags & O_APPEND)) 1072 return -EINVAL; 1073 1074 ret = rw_verify_area(WRITE, out, opos, len); 1075 if (unlikely(ret < 0)) 1076 return ret; 1077 1078 ret = splice_direct_to_actor(in, &sd, direct_splice_actor); 1079 if (ret > 0) 1080 *ppos = sd.pos; 1081 1082 return ret; 1083 } 1084 EXPORT_SYMBOL(do_splice_direct); 1085 1086 static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) 1087 { 1088 for (;;) { 1089 if (unlikely(!pipe->readers)) { 1090 send_sig(SIGPIPE, current, 0); 1091 return -EPIPE; 1092 } 1093 if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) 1094 return 0; 1095 if (flags & SPLICE_F_NONBLOCK) 1096 return -EAGAIN; 1097 if (signal_pending(current)) 1098 return -ERESTARTSYS; 1099 pipe_wait_writable(pipe); 1100 } 1101 } 1102 1103 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, 1104 struct pipe_inode_info *opipe, 1105 size_t len, unsigned int flags); 1106 1107 /* 1108 * Determine where to splice to/from. 
*/
long do_splice(struct file *in, loff_t __user *off_in,
	       struct file *out, loff_t __user *off_out,
	       size_t len, unsigned int flags)
{
	struct pipe_inode_info *ipipe;
	struct pipe_inode_info *opipe;
	loff_t offset;
	long ret;

	/* Source must be open for reading, destination for writing. */
	if (unlikely(!(in->f_mode & FMODE_READ) ||
		     !(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	/*
	 * NOTE(review): presumably NULL when the file is not a pipe usable
	 * for splicing; the three cases below dispatch on which is non-NULL.
	 */
	ipipe = get_pipe_info(in, true);
	opipe = get_pipe_info(out, true);

	/* Case 1: pipe -> pipe. No offsets are accepted on either side. */
	if (ipipe && opipe) {
		if (off_in || off_out)
			return -ESPIPE;

		/* Splicing to self would be fun, but... */
		if (ipipe == opipe)
			return -EINVAL;

		/* O_NONBLOCK on either end makes the splice non-blocking. */
		if ((in->f_flags | out->f_flags) & O_NONBLOCK)
			flags |= SPLICE_F_NONBLOCK;

		return splice_pipe_to_pipe(ipipe, opipe, len, flags);
	}

	/* Case 2: pipe -> non-pipe. Only an output offset may be given. */
	if (ipipe) {
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			if (!(out->f_mode & FMODE_PWRITE))
				return -EINVAL;
			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
				return -EFAULT;
		} else {
			/* No explicit offset: use the file's own position. */
			offset = out->f_pos;
		}

		if (unlikely(out->f_flags & O_APPEND))
			return -EINVAL;

		ret = rw_verify_area(WRITE, out, &offset, len);
		if (unlikely(ret < 0))
			return ret;

		if (in->f_flags & O_NONBLOCK)
			flags |= SPLICE_F_NONBLOCK;

		file_start_write(out);
		ret = do_splice_from(ipipe, out, &offset, len, flags);
		file_end_write(out);

		/*
		 * The offset is written back whatever 'ret' is; a failed
		 * copy to user space replaces the result with -EFAULT.
		 */
		if (!off_out)
			out->f_pos = offset;
		else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	/* Case 3: non-pipe -> pipe. Only an input offset may be given. */
	if (opipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			if (!(in->f_mode & FMODE_PREAD))
				return -EINVAL;
			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
				return -EFAULT;
		} else {
			/* No explicit offset: use the file's own position. */
			offset = in->f_pos;
		}

		if (out->f_flags & O_NONBLOCK)
			flags |= SPLICE_F_NONBLOCK;

		pipe_lock(opipe);
		ret = wait_for_space(opipe, flags);
		if (!ret) {
			unsigned int p_space;

			/* Don't try to read more than the pipe has space for. */
			p_space = opipe->max_usage - pipe_occupancy(opipe->head, opipe->tail);
			len = min_t(size_t, len, p_space << PAGE_SHIFT);

			ret = do_splice_to(in, &offset, opipe, len, flags);
		}
		pipe_unlock(opipe);
		/* Readers are woken after the lock is dropped, if data arrived. */
		if (ret > 0)
			wakeup_pipe_readers(opipe);
		/* As in case 2, the offset is written back unconditionally. */
		if (!off_in)
			in->f_pos = offset;
		else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	/* Neither end is a pipe. */
	return -EINVAL;
}

/*
 * Feed the pages behind @from into @pipe, one pipe buffer per page.
 *
 * Every buffer uses user_page_pipe_buf_ops and carries @flags as its buffer
 * flags. Returns the number of bytes added to the pipe if that is non-zero;
 * otherwise the last result of iov_iter_get_pages() or add_to_pipe().
 */
static int iter_to_pipe(struct iov_iter *from,
			struct pipe_inode_info *pipe,
			unsigned flags)
{
	struct pipe_buffer buf = {
		.ops = &user_page_pipe_buf_ops,
		.flags = flags
	};
	size_t total = 0;
	int ret = 0;
	bool failed = false;

	while (iov_iter_count(from) && !failed) {
		struct page *pages[16];
		ssize_t copied;
		size_t start;
		int n;

		/*
		 * Grab the next batch of pages (the array holds 16).
		 * NOTE(review): ~0UL is presumably "no byte limit" and
		 * 'start' the offset into the first page - confirm against
		 * iov_iter_get_pages().
		 */
		copied = iov_iter_get_pages(from, pages, ~0UL, 16, &start);
		if (copied <= 0) {
			ret = copied;
			break;
		}

		/* Only the first page of a batch starts at a non-zero offset. */
		for (n = 0; copied; n++, start = 0) {
			int size = min_t(int, copied, PAGE_SIZE - start);
			if (!failed) {
				buf.page = pages[n];
				buf.offset = start;
				buf.len = size;
				ret = add_to_pipe(pipe, &buf);
				if (unlikely(ret < 0)) {
					/*
					 * NOTE(review): this page is not put
					 * here - presumably add_to_pipe()
					 * releases it on failure; confirm.
					 */
					failed = true;
				} else {
					iov_iter_advance(from, ret);
					total += ret;
				}
			} else {
				/*
				 * After a failure, drop the references on the
				 * rest of the batch instead of queueing them.
				 */
				put_page(pages[n]);
			}
			copied -= size;
		}
	}
	return total ? total : ret;
}

/*
 * Actor for vmsplice_to_user(): copy sd->len bytes, starting at buf->offset
 * in the buffer's page, into the iov_iter in sd->u.data. Returns the number
 * of bytes copied, or -EFAULT if fewer than sd->len bytes were copied.
 */
static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
	return n == sd->len ? n : -EFAULT;
}

/*
 * For lack of a better implementation, implement vmsplice() to userspace
 * as a simple copy of the pipes pages to the user iov.
 *
 * Returns -EBADF if @file yields no pipe, 0 if @iter is empty, otherwise
 * the result of __splice_from_pipe() run under the pipe lock.
 */
static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
			     unsigned int flags)
{
	struct pipe_inode_info *pipe = get_pipe_info(file, true);
	struct splice_desc sd = {
		.total_len = iov_iter_count(iter),
		.flags = flags,
		.u.data = iter
	};
	long ret = 0;

	if (!pipe)
		return -EBADF;

	/* An empty destination is not an error; the pipe is left untouched. */
	if (sd.total_len) {
		pipe_lock(pipe);
		ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
		pipe_unlock(pipe);
	}

	return ret;
}

/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
 *
 * Returns -EBADF if @file yields no pipe, otherwise wait_for_space()'s
 * error or iter_to_pipe()'s result.
 */
static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
			     unsigned int flags)
{
	struct pipe_inode_info *pipe;
	long ret = 0;
	unsigned buf_flag = 0;

	/* SPLICE_F_GIFT becomes a per-buffer flag on each queued buffer. */
	if (flags & SPLICE_F_GIFT)
		buf_flag = PIPE_BUF_FLAG_GIFT;

	pipe = get_pipe_info(file, true);
	if (!pipe)
		return -EBADF;

	pipe_lock(pipe);
	ret = wait_for_space(pipe, flags);
	if (!ret)
		ret = iter_to_pipe(iter, pipe, buf_flag);
	pipe_unlock(pipe);
	/* Readers are woken after the lock is dropped, if data was queued. */
	if (ret > 0)
		wakeup_pipe_readers(pipe);
	return ret;
}

/*
 * Pick the transfer direction for vmsplice() from the file's open mode:
 * WRITE if it is open for writing, else READ if it is open for reading.
 *
 * Returns 0 with *@type set and @f still held by the caller. Returns -EBADF
 * if @f holds no file, or if the file is open for neither; in the latter
 * case @f has already been released with fdput().
 */
static int vmsplice_type(struct fd f, int *type)
{
	if (!f.file)
		return -EBADF;
	if (f.file->f_mode & FMODE_WRITE) {
		*type = WRITE;
	} else if (f.file->f_mode & FMODE_READ) {
		*type = READ;
	} else {
		fdput(f);
		return -EBADF;
	}
	return 0;
}

/*
 * Note that vmsplice only really supports true splicing _from_ user memory
 * to a pipe, not the other way around.
Splicing from user memory is a simple 1341 * operation that can be supported without any funky alignment restrictions 1342 * or nasty vm tricks. We simply map in the user memory and fill them into 1343 * a pipe. The reverse isn't quite as easy, though. There are two possible 1344 * solutions for that: 1345 * 1346 * - memcpy() the data internally, at which point we might as well just 1347 * do a regular read() on the buffer anyway. 1348 * - Lots of nasty vm tricks, that are neither fast nor flexible (it 1349 * has restriction limitations on both ends of the pipe). 1350 * 1351 * Currently we punt and implement it as a normal copy, see pipe_to_user(). 1352 * 1353 */ 1354 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov, 1355 unsigned long, nr_segs, unsigned int, flags) 1356 { 1357 struct iovec iovstack[UIO_FASTIOV]; 1358 struct iovec *iov = iovstack; 1359 struct iov_iter iter; 1360 ssize_t error; 1361 struct fd f; 1362 int type; 1363 1364 if (unlikely(flags & ~SPLICE_F_ALL)) 1365 return -EINVAL; 1366 1367 f = fdget(fd); 1368 error = vmsplice_type(f, &type); 1369 if (error) 1370 return error; 1371 1372 error = import_iovec(type, uiov, nr_segs, 1373 ARRAY_SIZE(iovstack), &iov, &iter); 1374 if (error < 0) 1375 goto out_fdput; 1376 1377 if (!iov_iter_count(&iter)) 1378 error = 0; 1379 else if (iov_iter_rw(&iter) == WRITE) 1380 error = vmsplice_to_pipe(f.file, &iter, flags); 1381 else 1382 error = vmsplice_to_user(f.file, &iter, flags); 1383 1384 kfree(iov); 1385 out_fdput: 1386 fdput(f); 1387 return error; 1388 } 1389 1390 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, 1391 int, fd_out, loff_t __user *, off_out, 1392 size_t, len, unsigned int, flags) 1393 { 1394 struct fd in, out; 1395 long error; 1396 1397 if (unlikely(!len)) 1398 return 0; 1399 1400 if (unlikely(flags & ~SPLICE_F_ALL)) 1401 return -EINVAL; 1402 1403 error = -EBADF; 1404 in = fdget(fd_in); 1405 if (in.file) { 1406 out = fdget(fd_out); 1407 if (out.file) { 1408 error = 
do_splice(in.file, off_in, out.file, off_out, 1409 len, flags); 1410 fdput(out); 1411 } 1412 fdput(in); 1413 } 1414 return error; 1415 } 1416 1417 /* 1418 * Make sure there's data to read. Wait for input if we can, otherwise 1419 * return an appropriate error. 1420 */ 1421 static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) 1422 { 1423 int ret; 1424 1425 /* 1426 * Check the pipe occupancy without the inode lock first. This function 1427 * is speculative anyways, so missing one is ok. 1428 */ 1429 if (!pipe_empty(pipe->head, pipe->tail)) 1430 return 0; 1431 1432 ret = 0; 1433 pipe_lock(pipe); 1434 1435 while (pipe_empty(pipe->head, pipe->tail)) { 1436 if (signal_pending(current)) { 1437 ret = -ERESTARTSYS; 1438 break; 1439 } 1440 if (!pipe->writers) 1441 break; 1442 if (flags & SPLICE_F_NONBLOCK) { 1443 ret = -EAGAIN; 1444 break; 1445 } 1446 pipe_wait_readable(pipe); 1447 } 1448 1449 pipe_unlock(pipe); 1450 return ret; 1451 } 1452 1453 /* 1454 * Make sure there's writeable room. Wait for room if we can, otherwise 1455 * return an appropriate error. 1456 */ 1457 static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) 1458 { 1459 int ret; 1460 1461 /* 1462 * Check pipe occupancy without the inode lock first. This function 1463 * is speculative anyways, so missing one is ok. 1464 */ 1465 if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) 1466 return 0; 1467 1468 ret = 0; 1469 pipe_lock(pipe); 1470 1471 while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { 1472 if (!pipe->readers) { 1473 send_sig(SIGPIPE, current, 0); 1474 ret = -EPIPE; 1475 break; 1476 } 1477 if (flags & SPLICE_F_NONBLOCK) { 1478 ret = -EAGAIN; 1479 break; 1480 } 1481 if (signal_pending(current)) { 1482 ret = -ERESTARTSYS; 1483 break; 1484 } 1485 pipe_wait_writable(pipe); 1486 } 1487 1488 pipe_unlock(pipe); 1489 return ret; 1490 } 1491 1492 /* 1493 * Splice contents of ipipe to opipe. 
*/
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
			       struct pipe_inode_info *opipe,
			       size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	unsigned int i_head, o_head;
	unsigned int i_tail, o_tail;
	unsigned int i_mask, o_mask;
	int ret = 0;
	/* Set once a whole buffer has left ipipe, i.e. a slot was freed. */
	bool input_wakeup = false;


retry:
	/* Both prep calls take and drop the respective pipe lock themselves. */
	ret = ipipe_prep(ipipe, flags);
	if (ret)
		return ret;

	ret = opipe_prep(opipe, flags);
	if (ret)
		return ret;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by pipe info address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	pipe_double_lock(ipipe, opipe);

	/*
	 * i_tail and o_head are sampled once here and advanced locally;
	 * i_head and o_tail are re-read on every loop iteration.
	 */
	i_tail = ipipe->tail;
	i_mask = ipipe->ring_size - 1;
	o_head = opipe->head;
	o_mask = opipe->ring_size - 1;

	do {
		size_t o_len;

		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			/* Keep a byte count that was already accumulated. */
			if (!ret)
				ret = -EPIPE;
			break;
		}

		i_head = ipipe->head;
		o_tail = opipe->tail;

		/* Input drained and no writers left: stop with what we have. */
		if (pipe_empty(i_head, i_tail) && !ipipe->writers)
			break;

		/*
		 * Cannot make any progress, because either the input
		 * pipe is empty or the output pipe is full.
		 */
		if (pipe_empty(i_head, i_tail) ||
		    pipe_full(o_head, o_tail, opipe->max_usage)) {
			/* Already processed some buffers, break */
			if (ret)
				break;

			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}

			/*
			 * We raced with another reader/writer and haven't
			 * managed to process any buffers.  A zero return
			 * value means EOF, so retry instead.
			 */
			pipe_unlock(ipipe);
			pipe_unlock(opipe);
			goto retry;
		}

		ibuf = &ipipe->bufs[i_tail & i_mask];
		obuf = &opipe->bufs[o_head & o_mask];

		if (len >= ibuf->len) {
			/*
			 * Simply move the whole buffer from ipipe to opipe
			 */
			*obuf = *ibuf;
			/* Clearing ops marks the input slot as no longer in use. */
			ibuf->ops = NULL;
			i_tail++;
			ipipe->tail = i_tail;
			input_wakeup = true;
			o_len = obuf->len;
			o_head++;
			opipe->head = o_head;
		} else {
			/*
			 * Get a reference to this pipe buffer,
			 * so we can copy the contents over.
			 */
			if (!pipe_buf_get(ipipe, ibuf)) {
				if (ret == 0)
					ret = -EFAULT;
				break;
			}
			*obuf = *ibuf;

			/*
			 * Don't inherit the gift and merge flags, we need to
			 * prevent multiple steals of this page.
			 */
			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
			obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;

			/*
			 * Split the buffer: the first 'len' bytes go to the
			 * output, the remainder stays queued on the input.
			 */
			obuf->len = len;
			ibuf->offset += len;
			ibuf->len -= len;
			o_len = len;
			o_head++;
			opipe->head = o_head;
		}
		ret += o_len;
		len -= o_len;
	} while (len);

	pipe_unlock(ipipe);
	pipe_unlock(opipe);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0)
		wakeup_pipe_readers(opipe);

	/* Writers of the input pipe are woken only if a slot was freed. */
	if (input_wakeup)
		wakeup_pipe_writers(ipipe);

	return ret;
}

/*
 * Link contents of ipipe to opipe.
 *
 * Each input buffer gets an extra reference and is copied into an output
 * slot; ipipe->tail is never advanced here, so the input keeps its data.
 * Returns the number of bytes linked, or -EPIPE / -EFAULT if nothing was
 * linked before the corresponding failure.
 */
static int link_pipe(struct pipe_inode_info *ipipe,
		     struct pipe_inode_info *opipe,
		     size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	unsigned int i_head, o_head;
	unsigned int i_tail, o_tail;
	unsigned int i_mask, o_mask;
	int ret = 0;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by pipe info address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	pipe_double_lock(ipipe, opipe);

	/* i_tail is only a local cursor over the input ring. */
	i_tail = ipipe->tail;
	i_mask = ipipe->ring_size - 1;
	o_head = opipe->head;
	o_mask = opipe->ring_size - 1;

	do {
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			/* Keep a byte count that was already accumulated. */
			if (!ret)
				ret = -EPIPE;
			break;
		}

		i_head = ipipe->head;
		o_tail = opipe->tail;

		/*
		 * If we have iterated all input buffers or run out of
		 * output room, break.
		 */
		if (pipe_empty(i_head, i_tail) ||
		    pipe_full(o_head, o_tail, opipe->max_usage))
			break;

		ibuf = &ipipe->bufs[i_tail & i_mask];
		obuf = &opipe->bufs[o_head & o_mask];

		/*
		 * Get a reference to this pipe buffer,
		 * so we can copy the contents over.
		 */
		if (!pipe_buf_get(ipipe, ibuf)) {
			if (ret == 0)
				ret = -EFAULT;
			break;
		}

		*obuf = *ibuf;

		/*
		 * Don't inherit the gift and merge flag, we need to prevent
		 * multiple steals of this page.
		 */
		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
		obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;

		/* Trim the last linked buffer to the remaining length. */
		if (obuf->len > len)
			obuf->len = len;
		ret += obuf->len;
		len -= obuf->len;

		o_head++;
		opipe->head = o_head;
		i_tail++;
	} while (len);

	pipe_unlock(ipipe);
	pipe_unlock(opipe);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0)
		wakeup_pipe_readers(opipe);

	return ret;
}

/*
 * This is a tee(1) implementation that works on pipes. It doesn't copy
 * any data, it simply references the 'in' pages on the 'out' pipe.
 * The 'flags' used are the SPLICE_F_* variants, currently the only
 * applicable one is SPLICE_F_NONBLOCK.
1722 */ 1723 long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) 1724 { 1725 struct pipe_inode_info *ipipe = get_pipe_info(in, true); 1726 struct pipe_inode_info *opipe = get_pipe_info(out, true); 1727 int ret = -EINVAL; 1728 1729 if (unlikely(!(in->f_mode & FMODE_READ) || 1730 !(out->f_mode & FMODE_WRITE))) 1731 return -EBADF; 1732 1733 /* 1734 * Duplicate the contents of ipipe to opipe without actually 1735 * copying the data. 1736 */ 1737 if (ipipe && opipe && ipipe != opipe) { 1738 if ((in->f_flags | out->f_flags) & O_NONBLOCK) 1739 flags |= SPLICE_F_NONBLOCK; 1740 1741 /* 1742 * Keep going, unless we encounter an error. The ipipe/opipe 1743 * ordering doesn't really matter. 1744 */ 1745 ret = ipipe_prep(ipipe, flags); 1746 if (!ret) { 1747 ret = opipe_prep(opipe, flags); 1748 if (!ret) 1749 ret = link_pipe(ipipe, opipe, len, flags); 1750 } 1751 } 1752 1753 return ret; 1754 } 1755 1756 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) 1757 { 1758 struct fd in, out; 1759 int error; 1760 1761 if (unlikely(flags & ~SPLICE_F_ALL)) 1762 return -EINVAL; 1763 1764 if (unlikely(!len)) 1765 return 0; 1766 1767 error = -EBADF; 1768 in = fdget(fdin); 1769 if (in.file) { 1770 out = fdget(fdout); 1771 if (out.file) { 1772 error = do_tee(in.file, out.file, len, flags); 1773 fdput(out); 1774 } 1775 fdput(in); 1776 } 1777 1778 return error; 1779 } 1780