1 /* 2 * "splice": joining two ropes together by interweaving their strands. 3 * 4 * This is the "extended pipe" functionality, where a pipe is used as 5 * an arbitrary in-memory buffer. Think of a pipe as a small kernel 6 * buffer that you can use to transfer data from one end to the other. 7 * 8 * The traditional unix read/write is extended with a "splice()" operation 9 * that transfers data buffers to or from a pipe buffer. 10 * 11 * Named by Larry McVoy, original implementation from Linus, extended by 12 * Jens to support splicing to files, network, direct splicing, etc and 13 * fixing lots of bugs. 14 * 15 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk> 16 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org> 17 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> 18 * 19 */ 20 #include <linux/bvec.h> 21 #include <linux/fs.h> 22 #include <linux/file.h> 23 #include <linux/pagemap.h> 24 #include <linux/splice.h> 25 #include <linux/memcontrol.h> 26 #include <linux/mm_inline.h> 27 #include <linux/swap.h> 28 #include <linux/writeback.h> 29 #include <linux/export.h> 30 #include <linux/syscalls.h> 31 #include <linux/uio.h> 32 #include <linux/security.h> 33 #include <linux/gfp.h> 34 #include <linux/socket.h> 35 #include <linux/compat.h> 36 #include "internal.h" 37 38 /* 39 * Attempt to steal a page from a pipe buffer. This should perhaps go into 40 * a vm helper function, it's already simplified quite a bit by the 41 * addition of remove_mapping(). If success is returned, the caller may 42 * attempt to reuse this page for another destination. 
43 */ 44 static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, 45 struct pipe_buffer *buf) 46 { 47 struct page *page = buf->page; 48 struct address_space *mapping; 49 50 lock_page(page); 51 52 mapping = page_mapping(page); 53 if (mapping) { 54 WARN_ON(!PageUptodate(page)); 55 56 /* 57 * At least for ext2 with nobh option, we need to wait on 58 * writeback completing on this page, since we'll remove it 59 * from the pagecache. Otherwise truncate wont wait on the 60 * page, allowing the disk blocks to be reused by someone else 61 * before we actually wrote our data to them. fs corruption 62 * ensues. 63 */ 64 wait_on_page_writeback(page); 65 66 if (page_has_private(page) && 67 !try_to_release_page(page, GFP_KERNEL)) 68 goto out_unlock; 69 70 /* 71 * If we succeeded in removing the mapping, set LRU flag 72 * and return good. 73 */ 74 if (remove_mapping(mapping, page)) { 75 buf->flags |= PIPE_BUF_FLAG_LRU; 76 return 0; 77 } 78 } 79 80 /* 81 * Raced with truncate or failed to remove page from current 82 * address space, unlock and return failure. 83 */ 84 out_unlock: 85 unlock_page(page); 86 return 1; 87 } 88 89 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, 90 struct pipe_buffer *buf) 91 { 92 put_page(buf->page); 93 buf->flags &= ~PIPE_BUF_FLAG_LRU; 94 } 95 96 /* 97 * Check whether the contents of buf is OK to access. Since the content 98 * is a page cache page, IO may be in flight. 99 */ 100 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, 101 struct pipe_buffer *buf) 102 { 103 struct page *page = buf->page; 104 int err; 105 106 if (!PageUptodate(page)) { 107 lock_page(page); 108 109 /* 110 * Page got truncated/unhashed. This will cause a 0-byte 111 * splice, if this is the first page. 112 */ 113 if (!page->mapping) { 114 err = -ENODATA; 115 goto error; 116 } 117 118 /* 119 * Uh oh, read-error from disk. 
120 */ 121 if (!PageUptodate(page)) { 122 err = -EIO; 123 goto error; 124 } 125 126 /* 127 * Page is ok afterall, we are done. 128 */ 129 unlock_page(page); 130 } 131 132 return 0; 133 error: 134 unlock_page(page); 135 return err; 136 } 137 138 const struct pipe_buf_operations page_cache_pipe_buf_ops = { 139 .can_merge = 0, 140 .confirm = page_cache_pipe_buf_confirm, 141 .release = page_cache_pipe_buf_release, 142 .steal = page_cache_pipe_buf_steal, 143 .get = generic_pipe_buf_get, 144 }; 145 146 static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, 147 struct pipe_buffer *buf) 148 { 149 if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) 150 return 1; 151 152 buf->flags |= PIPE_BUF_FLAG_LRU; 153 return generic_pipe_buf_steal(pipe, buf); 154 } 155 156 static const struct pipe_buf_operations user_page_pipe_buf_ops = { 157 .can_merge = 0, 158 .confirm = generic_pipe_buf_confirm, 159 .release = page_cache_pipe_buf_release, 160 .steal = user_page_pipe_buf_steal, 161 .get = generic_pipe_buf_get, 162 }; 163 164 static void wakeup_pipe_readers(struct pipe_inode_info *pipe) 165 { 166 smp_mb(); 167 if (waitqueue_active(&pipe->wait)) 168 wake_up_interruptible(&pipe->wait); 169 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 170 } 171 172 /** 173 * splice_to_pipe - fill passed data into a pipe 174 * @pipe: pipe to fill 175 * @spd: data to fill 176 * 177 * Description: 178 * @spd contains a map of pages and len/offset tuples, along with 179 * the struct pipe_buf_operations associated with these pages. This 180 * function will link that data to the pipe. 
181 * 182 */ 183 ssize_t splice_to_pipe(struct pipe_inode_info *pipe, 184 struct splice_pipe_desc *spd) 185 { 186 unsigned int spd_pages = spd->nr_pages; 187 int ret = 0, page_nr = 0; 188 189 if (!spd_pages) 190 return 0; 191 192 if (unlikely(!pipe->readers)) { 193 send_sig(SIGPIPE, current, 0); 194 ret = -EPIPE; 195 goto out; 196 } 197 198 while (pipe->nrbufs < pipe->buffers) { 199 int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); 200 struct pipe_buffer *buf = pipe->bufs + newbuf; 201 202 buf->page = spd->pages[page_nr]; 203 buf->offset = spd->partial[page_nr].offset; 204 buf->len = spd->partial[page_nr].len; 205 buf->private = spd->partial[page_nr].private; 206 buf->ops = spd->ops; 207 buf->flags = 0; 208 209 pipe->nrbufs++; 210 page_nr++; 211 ret += buf->len; 212 213 if (!--spd->nr_pages) 214 break; 215 } 216 217 if (!ret) 218 ret = -EAGAIN; 219 220 out: 221 while (page_nr < spd_pages) 222 spd->spd_release(spd, page_nr++); 223 224 return ret; 225 } 226 EXPORT_SYMBOL_GPL(splice_to_pipe); 227 228 ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) 229 { 230 int ret; 231 232 if (unlikely(!pipe->readers)) { 233 send_sig(SIGPIPE, current, 0); 234 ret = -EPIPE; 235 } else if (pipe->nrbufs == pipe->buffers) { 236 ret = -EAGAIN; 237 } else { 238 int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); 239 pipe->bufs[newbuf] = *buf; 240 pipe->nrbufs++; 241 return buf->len; 242 } 243 pipe_buf_release(pipe, buf); 244 return ret; 245 } 246 EXPORT_SYMBOL(add_to_pipe); 247 248 void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) 249 { 250 put_page(spd->pages[i]); 251 } 252 253 /* 254 * Check if we need to grow the arrays holding pages and partial page 255 * descriptions. 
256 */ 257 int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) 258 { 259 unsigned int buffers = ACCESS_ONCE(pipe->buffers); 260 261 spd->nr_pages_max = buffers; 262 if (buffers <= PIPE_DEF_BUFFERS) 263 return 0; 264 265 spd->pages = kmalloc(buffers * sizeof(struct page *), GFP_KERNEL); 266 spd->partial = kmalloc(buffers * sizeof(struct partial_page), GFP_KERNEL); 267 268 if (spd->pages && spd->partial) 269 return 0; 270 271 kfree(spd->pages); 272 kfree(spd->partial); 273 return -ENOMEM; 274 } 275 276 void splice_shrink_spd(struct splice_pipe_desc *spd) 277 { 278 if (spd->nr_pages_max <= PIPE_DEF_BUFFERS) 279 return; 280 281 kfree(spd->pages); 282 kfree(spd->partial); 283 } 284 285 /** 286 * generic_file_splice_read - splice data from file to a pipe 287 * @in: file to splice from 288 * @ppos: position in @in 289 * @pipe: pipe to splice to 290 * @len: number of bytes to splice 291 * @flags: splice modifier flags 292 * 293 * Description: 294 * Will read pages from given file and fill them into a pipe. Can be 295 * used as long as it has more or less sane ->read_iter(). 296 * 297 */ 298 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, 299 struct pipe_inode_info *pipe, size_t len, 300 unsigned int flags) 301 { 302 struct iov_iter to; 303 struct kiocb kiocb; 304 int idx, ret; 305 306 iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len); 307 idx = to.idx; 308 init_sync_kiocb(&kiocb, in); 309 kiocb.ki_pos = *ppos; 310 ret = in->f_op->read_iter(&kiocb, &to); 311 if (ret > 0) { 312 *ppos = kiocb.ki_pos; 313 file_accessed(in); 314 } else if (ret < 0) { 315 to.idx = idx; 316 to.iov_offset = 0; 317 iov_iter_advance(&to, 0); /* to free what was emitted */ 318 /* 319 * callers of ->splice_read() expect -EAGAIN on 320 * "can't put anything in there", rather than -EFAULT. 
321 */ 322 if (ret == -EFAULT) 323 ret = -EAGAIN; 324 } 325 326 return ret; 327 } 328 EXPORT_SYMBOL(generic_file_splice_read); 329 330 const struct pipe_buf_operations default_pipe_buf_ops = { 331 .can_merge = 0, 332 .confirm = generic_pipe_buf_confirm, 333 .release = generic_pipe_buf_release, 334 .steal = generic_pipe_buf_steal, 335 .get = generic_pipe_buf_get, 336 }; 337 338 static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe, 339 struct pipe_buffer *buf) 340 { 341 return 1; 342 } 343 344 /* Pipe buffer operations for a socket and similar. */ 345 const struct pipe_buf_operations nosteal_pipe_buf_ops = { 346 .can_merge = 0, 347 .confirm = generic_pipe_buf_confirm, 348 .release = generic_pipe_buf_release, 349 .steal = generic_pipe_buf_nosteal, 350 .get = generic_pipe_buf_get, 351 }; 352 EXPORT_SYMBOL(nosteal_pipe_buf_ops); 353 354 static ssize_t kernel_readv(struct file *file, const struct kvec *vec, 355 unsigned long vlen, loff_t offset) 356 { 357 mm_segment_t old_fs; 358 loff_t pos = offset; 359 ssize_t res; 360 361 old_fs = get_fs(); 362 set_fs(get_ds()); 363 /* The cast to a user pointer is valid due to the set_fs() */ 364 res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0); 365 set_fs(old_fs); 366 367 return res; 368 } 369 370 ssize_t kernel_write(struct file *file, const char *buf, size_t count, 371 loff_t pos) 372 { 373 mm_segment_t old_fs; 374 ssize_t res; 375 376 old_fs = get_fs(); 377 set_fs(get_ds()); 378 /* The cast to a user pointer is valid due to the set_fs() */ 379 res = vfs_write(file, (__force const char __user *)buf, count, &pos); 380 set_fs(old_fs); 381 382 return res; 383 } 384 EXPORT_SYMBOL(kernel_write); 385 386 static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, 387 struct pipe_inode_info *pipe, size_t len, 388 unsigned int flags) 389 { 390 struct kvec *vec, __vec[PIPE_DEF_BUFFERS]; 391 struct iov_iter to; 392 struct page **pages; 393 unsigned int nr_pages; 394 size_t offset, dummy, copied = 
0; 395 ssize_t res; 396 int i; 397 398 if (pipe->nrbufs == pipe->buffers) 399 return -EAGAIN; 400 401 /* 402 * Try to keep page boundaries matching to source pagecache ones - 403 * it probably won't be much help, but... 404 */ 405 offset = *ppos & ~PAGE_MASK; 406 407 iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len + offset); 408 409 res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &dummy); 410 if (res <= 0) 411 return -ENOMEM; 412 413 BUG_ON(dummy); 414 nr_pages = DIV_ROUND_UP(res, PAGE_SIZE); 415 416 vec = __vec; 417 if (nr_pages > PIPE_DEF_BUFFERS) { 418 vec = kmalloc(nr_pages * sizeof(struct kvec), GFP_KERNEL); 419 if (unlikely(!vec)) { 420 res = -ENOMEM; 421 goto out; 422 } 423 } 424 425 pipe->bufs[to.idx].offset = offset; 426 pipe->bufs[to.idx].len -= offset; 427 428 for (i = 0; i < nr_pages; i++) { 429 size_t this_len = min_t(size_t, len, PAGE_SIZE - offset); 430 vec[i].iov_base = page_address(pages[i]) + offset; 431 vec[i].iov_len = this_len; 432 len -= this_len; 433 offset = 0; 434 } 435 436 res = kernel_readv(in, vec, nr_pages, *ppos); 437 if (res > 0) { 438 copied = res; 439 *ppos += res; 440 } 441 442 if (vec != __vec) 443 kfree(vec); 444 out: 445 for (i = 0; i < nr_pages; i++) 446 put_page(pages[i]); 447 kvfree(pages); 448 iov_iter_advance(&to, copied); /* truncates and discards */ 449 return res; 450 } 451 452 /* 453 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' 454 * using sendpage(). Return the number of bytes sent. 455 */ 456 static int pipe_to_sendpage(struct pipe_inode_info *pipe, 457 struct pipe_buffer *buf, struct splice_desc *sd) 458 { 459 struct file *file = sd->u.file; 460 loff_t pos = sd->pos; 461 int more; 462 463 if (!likely(file->f_op->sendpage)) 464 return -EINVAL; 465 466 more = (sd->flags & SPLICE_F_MORE) ? 
MSG_MORE : 0; 467 468 if (sd->len < sd->total_len && pipe->nrbufs > 1) 469 more |= MSG_SENDPAGE_NOTLAST; 470 471 return file->f_op->sendpage(file, buf->page, buf->offset, 472 sd->len, &pos, more); 473 } 474 475 static void wakeup_pipe_writers(struct pipe_inode_info *pipe) 476 { 477 smp_mb(); 478 if (waitqueue_active(&pipe->wait)) 479 wake_up_interruptible(&pipe->wait); 480 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 481 } 482 483 /** 484 * splice_from_pipe_feed - feed available data from a pipe to a file 485 * @pipe: pipe to splice from 486 * @sd: information to @actor 487 * @actor: handler that splices the data 488 * 489 * Description: 490 * This function loops over the pipe and calls @actor to do the 491 * actual moving of a single struct pipe_buffer to the desired 492 * destination. It returns when there's no more buffers left in 493 * the pipe or if the requested number of bytes (@sd->total_len) 494 * have been copied. It returns a positive number (one) if the 495 * pipe needs to be filled with more data, zero if the required 496 * number of bytes have been copied and -errno on error. 497 * 498 * This, together with splice_from_pipe_{begin,end,next}, may be 499 * used to implement the functionality of __splice_from_pipe() when 500 * locking is required around copying the pipe buffers to the 501 * destination. 
502 */ 503 static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, 504 splice_actor *actor) 505 { 506 int ret; 507 508 while (pipe->nrbufs) { 509 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; 510 511 sd->len = buf->len; 512 if (sd->len > sd->total_len) 513 sd->len = sd->total_len; 514 515 ret = pipe_buf_confirm(pipe, buf); 516 if (unlikely(ret)) { 517 if (ret == -ENODATA) 518 ret = 0; 519 return ret; 520 } 521 522 ret = actor(pipe, buf, sd); 523 if (ret <= 0) 524 return ret; 525 526 buf->offset += ret; 527 buf->len -= ret; 528 529 sd->num_spliced += ret; 530 sd->len -= ret; 531 sd->pos += ret; 532 sd->total_len -= ret; 533 534 if (!buf->len) { 535 pipe_buf_release(pipe, buf); 536 pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); 537 pipe->nrbufs--; 538 if (pipe->files) 539 sd->need_wakeup = true; 540 } 541 542 if (!sd->total_len) 543 return 0; 544 } 545 546 return 1; 547 } 548 549 /** 550 * splice_from_pipe_next - wait for some data to splice from 551 * @pipe: pipe to splice from 552 * @sd: information about the splice operation 553 * 554 * Description: 555 * This function will wait for some data and return a positive 556 * value (one) if pipe buffers are available. It will return zero 557 * or -errno if no more data needs to be spliced. 
558 */ 559 static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd) 560 { 561 /* 562 * Check for signal early to make process killable when there are 563 * always buffers available 564 */ 565 if (signal_pending(current)) 566 return -ERESTARTSYS; 567 568 while (!pipe->nrbufs) { 569 if (!pipe->writers) 570 return 0; 571 572 if (!pipe->waiting_writers && sd->num_spliced) 573 return 0; 574 575 if (sd->flags & SPLICE_F_NONBLOCK) 576 return -EAGAIN; 577 578 if (signal_pending(current)) 579 return -ERESTARTSYS; 580 581 if (sd->need_wakeup) { 582 wakeup_pipe_writers(pipe); 583 sd->need_wakeup = false; 584 } 585 586 pipe_wait(pipe); 587 } 588 589 return 1; 590 } 591 592 /** 593 * splice_from_pipe_begin - start splicing from pipe 594 * @sd: information about the splice operation 595 * 596 * Description: 597 * This function should be called before a loop containing 598 * splice_from_pipe_next() and splice_from_pipe_feed() to 599 * initialize the necessary fields of @sd. 600 */ 601 static void splice_from_pipe_begin(struct splice_desc *sd) 602 { 603 sd->num_spliced = 0; 604 sd->need_wakeup = false; 605 } 606 607 /** 608 * splice_from_pipe_end - finish splicing from pipe 609 * @pipe: pipe to splice from 610 * @sd: information about the splice operation 611 * 612 * Description: 613 * This function will wake up pipe writers if necessary. It should 614 * be called after a loop containing splice_from_pipe_next() and 615 * splice_from_pipe_feed(). 
616 */ 617 static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd) 618 { 619 if (sd->need_wakeup) 620 wakeup_pipe_writers(pipe); 621 } 622 623 /** 624 * __splice_from_pipe - splice data from a pipe to given actor 625 * @pipe: pipe to splice from 626 * @sd: information to @actor 627 * @actor: handler that splices the data 628 * 629 * Description: 630 * This function does little more than loop over the pipe and call 631 * @actor to do the actual moving of a single struct pipe_buffer to 632 * the desired destination. See pipe_to_file, pipe_to_sendpage, or 633 * pipe_to_user. 634 * 635 */ 636 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd, 637 splice_actor *actor) 638 { 639 int ret; 640 641 splice_from_pipe_begin(sd); 642 do { 643 cond_resched(); 644 ret = splice_from_pipe_next(pipe, sd); 645 if (ret > 0) 646 ret = splice_from_pipe_feed(pipe, sd, actor); 647 } while (ret > 0); 648 splice_from_pipe_end(pipe, sd); 649 650 return sd->num_spliced ? sd->num_spliced : ret; 651 } 652 EXPORT_SYMBOL(__splice_from_pipe); 653 654 /** 655 * splice_from_pipe - splice data from a pipe to a file 656 * @pipe: pipe to splice from 657 * @out: file to splice to 658 * @ppos: position in @out 659 * @len: how many bytes to splice 660 * @flags: splice modifier flags 661 * @actor: handler that splices the data 662 * 663 * Description: 664 * See __splice_from_pipe. This function locks the pipe inode, 665 * otherwise it's identical to __splice_from_pipe(). 
666 * 667 */ 668 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, 669 loff_t *ppos, size_t len, unsigned int flags, 670 splice_actor *actor) 671 { 672 ssize_t ret; 673 struct splice_desc sd = { 674 .total_len = len, 675 .flags = flags, 676 .pos = *ppos, 677 .u.file = out, 678 }; 679 680 pipe_lock(pipe); 681 ret = __splice_from_pipe(pipe, &sd, actor); 682 pipe_unlock(pipe); 683 684 return ret; 685 } 686 687 /** 688 * iter_file_splice_write - splice data from a pipe to a file 689 * @pipe: pipe info 690 * @out: file to write to 691 * @ppos: position in @out 692 * @len: number of bytes to splice 693 * @flags: splice modifier flags 694 * 695 * Description: 696 * Will either move or copy pages (determined by @flags options) from 697 * the given pipe inode to the given file. 698 * This one is ->write_iter-based. 699 * 700 */ 701 ssize_t 702 iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, 703 loff_t *ppos, size_t len, unsigned int flags) 704 { 705 struct splice_desc sd = { 706 .total_len = len, 707 .flags = flags, 708 .pos = *ppos, 709 .u.file = out, 710 }; 711 int nbufs = pipe->buffers; 712 struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec), 713 GFP_KERNEL); 714 ssize_t ret; 715 716 if (unlikely(!array)) 717 return -ENOMEM; 718 719 pipe_lock(pipe); 720 721 splice_from_pipe_begin(&sd); 722 while (sd.total_len) { 723 struct iov_iter from; 724 size_t left; 725 int n, idx; 726 727 ret = splice_from_pipe_next(pipe, &sd); 728 if (ret <= 0) 729 break; 730 731 if (unlikely(nbufs < pipe->buffers)) { 732 kfree(array); 733 nbufs = pipe->buffers; 734 array = kcalloc(nbufs, sizeof(struct bio_vec), 735 GFP_KERNEL); 736 if (!array) { 737 ret = -ENOMEM; 738 break; 739 } 740 } 741 742 /* build the vector */ 743 left = sd.total_len; 744 for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) { 745 struct pipe_buffer *buf = pipe->bufs + idx; 746 size_t this_len = buf->len; 747 748 if (this_len > left) 749 this_len = 
left; 750 751 if (idx == pipe->buffers - 1) 752 idx = -1; 753 754 ret = pipe_buf_confirm(pipe, buf); 755 if (unlikely(ret)) { 756 if (ret == -ENODATA) 757 ret = 0; 758 goto done; 759 } 760 761 array[n].bv_page = buf->page; 762 array[n].bv_len = this_len; 763 array[n].bv_offset = buf->offset; 764 left -= this_len; 765 } 766 767 iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n, 768 sd.total_len - left); 769 ret = vfs_iter_write(out, &from, &sd.pos); 770 if (ret <= 0) 771 break; 772 773 sd.num_spliced += ret; 774 sd.total_len -= ret; 775 *ppos = sd.pos; 776 777 /* dismiss the fully eaten buffers, adjust the partial one */ 778 while (ret) { 779 struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; 780 if (ret >= buf->len) { 781 ret -= buf->len; 782 buf->len = 0; 783 pipe_buf_release(pipe, buf); 784 pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); 785 pipe->nrbufs--; 786 if (pipe->files) 787 sd.need_wakeup = true; 788 } else { 789 buf->offset += ret; 790 buf->len -= ret; 791 ret = 0; 792 } 793 } 794 } 795 done: 796 kfree(array); 797 splice_from_pipe_end(pipe, &sd); 798 799 pipe_unlock(pipe); 800 801 if (sd.num_spliced) 802 ret = sd.num_spliced; 803 804 return ret; 805 } 806 807 EXPORT_SYMBOL(iter_file_splice_write); 808 809 static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 810 struct splice_desc *sd) 811 { 812 int ret; 813 void *data; 814 loff_t tmp = sd->pos; 815 816 data = kmap(buf->page); 817 ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp); 818 kunmap(buf->page); 819 820 return ret; 821 } 822 823 static ssize_t default_file_splice_write(struct pipe_inode_info *pipe, 824 struct file *out, loff_t *ppos, 825 size_t len, unsigned int flags) 826 { 827 ssize_t ret; 828 829 ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf); 830 if (ret > 0) 831 *ppos += ret; 832 833 return ret; 834 } 835 836 /** 837 * generic_splice_sendpage - splice data from a pipe to a socket 838 * @pipe: pipe to splice from 839 
* @out: socket to write to 840 * @ppos: position in @out 841 * @len: number of bytes to splice 842 * @flags: splice modifier flags 843 * 844 * Description: 845 * Will send @len bytes from the pipe to a network socket. No data copying 846 * is involved. 847 * 848 */ 849 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, 850 loff_t *ppos, size_t len, unsigned int flags) 851 { 852 return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); 853 } 854 855 EXPORT_SYMBOL(generic_splice_sendpage); 856 857 /* 858 * Attempt to initiate a splice from pipe to file. 859 */ 860 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, 861 loff_t *ppos, size_t len, unsigned int flags) 862 { 863 ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, 864 loff_t *, size_t, unsigned int); 865 866 if (out->f_op->splice_write) 867 splice_write = out->f_op->splice_write; 868 else 869 splice_write = default_file_splice_write; 870 871 return splice_write(pipe, out, ppos, len, flags); 872 } 873 874 /* 875 * Attempt to initiate a splice from a file to a pipe. 
 */
static long do_splice_to(struct file *in, loff_t *ppos,
			 struct pipe_inode_info *pipe, size_t len,
			 unsigned int flags)
{
	ssize_t (*splice_read)(struct file *, loff_t *,
			       struct pipe_inode_info *, size_t, unsigned int);
	int ret;

	/* The source must have been opened for reading. */
	if (unlikely(!(in->f_mode & FMODE_READ)))
		return -EBADF;

	ret = rw_verify_area(READ, in, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	/* Clamp the request to MAX_RW_COUNT. */
	if (unlikely(len > MAX_RW_COUNT))
		len = MAX_RW_COUNT;

	/* Prefer the file's own ->splice_read(), else use the fallback. */
	if (in->f_op->splice_read)
		splice_read = in->f_op->splice_read;
	else
		splice_read = default_file_splice_read;

	return splice_read(in, ppos, pipe, len, flags);
}

/**
 * splice_direct_to_actor - splices data directly between two non-pipes
 * @in:		file to splice from
 * @sd:		actor information on where to splice to
 * @actor:	handles the data splicing
 *
 * Description:
 *    This is a special case helper to splice directly between two
 *    points, without requiring an explicit pipe. Internally an allocated
 *    pipe is cached in the process, and reused during the lifetime of
 *    that process.
 *
 *    Returns the number of bytes handed to @actor if any were, otherwise
 *    the last status from do_splice_to() or @actor. @sd->pos is updated
 *    to reflect how far the input was consumed.
 *
 */
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
			       splice_direct_actor *actor)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	umode_t i_mode;
	size_t len;
	int i, flags, more;

	/*
	 * We require the input being a regular file, as we don't want to
	 * randomly drop data for eg socket -> socket splicing. Use the
	 * piped splicing for that!
	 */
	i_mode = file_inode(in)->i_mode;
	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
		return -EINVAL;

	/*
	 * neither in nor out is a pipe, setup an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	ret = 0;
	bytes = 0;
	len = sd->total_len;
	/* Keep the caller's flags for the input side (see NOTE below). */
	flags = sd->flags;

	/*
	 * Don't block on output, we have to drain the direct pipe.
	 */
	sd->flags &= ~SPLICE_F_NONBLOCK;
	/* Remember whether the caller asked for SPLICE_F_MORE. */
	more = sd->flags & SPLICE_F_MORE;

	while (len) {
		size_t read_len;
		loff_t pos = sd->pos, prev_pos = pos;

		ret = do_splice_to(in, &pos, pipe, len, flags);
		if (unlikely(ret <= 0))
			goto out_release;

		/* The actor is asked to consume exactly what was read. */
		read_len = ret;
		sd->total_len = read_len;

		/*
		 * If more data is pending, set SPLICE_F_MORE
		 * If this is the last data and SPLICE_F_MORE was not set
		 * initially, clears it.
		 */
		if (read_len < len)
			sd->flags |= SPLICE_F_MORE;
		else if (!more)
			sd->flags &= ~SPLICE_F_MORE;
		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = actor(pipe, sd);
		if (unlikely(ret <= 0)) {
			/* Nothing was written: rewind to before this read. */
			sd->pos = prev_pos;
			goto out_release;
		}

		bytes += ret;
		len -= ret;
		sd->pos = pos;

		/* Short write: position is only what the actor consumed. */
		if (ret < read_len) {
			sd->pos = prev_pos + ret;
			goto out_release;
		}
	}

done:
	/* Reset the internal pipe to empty before returning. */
	pipe->nrbufs = pipe->curbuf = 0;
	file_accessed(in);
	return bytes;

out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < pipe->buffers; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;

		if (buf->ops)
			pipe_buf_release(pipe, buf);
	}

	/* Nothing transferred at all: report the last status instead. */
	if (!bytes)
		bytes = ret;

	goto done;
}
EXPORT_SYMBOL(splice_direct_to_actor);

/*
 * splice_direct_actor used by do_splice_direct(): forward the internal
 * pipe's contents to sd->u.file at sd->opos via do_splice_from().
 */
static int direct_splice_actor(struct pipe_inode_info *pipe,
			       struct splice_desc *sd)
{
	struct file *file = sd->u.file;

	return do_splice_from(pipe, file, sd->opos, sd->total_len,
			      sd->flags);
}

/**
 * do_splice_direct - splices data directly between two files
 * @in:		file to splice from
 * @ppos:	input file offset
 * @out:	file to splice to
 * @opos:	output file offset
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    For use by do_sendfile(). splice can easily emulate sendfile, but
 *    doing it in the application would incur an extra system call
 *    (splice in + splice out, as compared to just sendfile()). So this helper
 *    can splice directly through a process-private pipe.
1056 * 1057 */ 1058 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, 1059 loff_t *opos, size_t len, unsigned int flags) 1060 { 1061 struct splice_desc sd = { 1062 .len = len, 1063 .total_len = len, 1064 .flags = flags, 1065 .pos = *ppos, 1066 .u.file = out, 1067 .opos = opos, 1068 }; 1069 long ret; 1070 1071 if (unlikely(!(out->f_mode & FMODE_WRITE))) 1072 return -EBADF; 1073 1074 if (unlikely(out->f_flags & O_APPEND)) 1075 return -EINVAL; 1076 1077 ret = rw_verify_area(WRITE, out, opos, len); 1078 if (unlikely(ret < 0)) 1079 return ret; 1080 1081 ret = splice_direct_to_actor(in, &sd, direct_splice_actor); 1082 if (ret > 0) 1083 *ppos = sd.pos; 1084 1085 return ret; 1086 } 1087 EXPORT_SYMBOL(do_splice_direct); 1088 1089 static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) 1090 { 1091 for (;;) { 1092 if (unlikely(!pipe->readers)) { 1093 send_sig(SIGPIPE, current, 0); 1094 return -EPIPE; 1095 } 1096 if (pipe->nrbufs != pipe->buffers) 1097 return 0; 1098 if (flags & SPLICE_F_NONBLOCK) 1099 return -EAGAIN; 1100 if (signal_pending(current)) 1101 return -ERESTARTSYS; 1102 pipe->waiting_writers++; 1103 pipe_wait(pipe); 1104 pipe->waiting_writers--; 1105 } 1106 } 1107 1108 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, 1109 struct pipe_inode_info *opipe, 1110 size_t len, unsigned int flags); 1111 1112 /* 1113 * Determine where to splice to/from. 
 *
 * Dispatches on which of @in and @out is a pipe: pipe -> pipe,
 * pipe -> file, or file -> pipe. Returns -EINVAL when neither is a
 * pipe. An offset pointer supplied for a pipe end yields -ESPIPE.
 */
static long do_splice(struct file *in, loff_t __user *off_in,
		      struct file *out, loff_t __user *off_out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *ipipe;
	struct pipe_inode_info *opipe;
	loff_t offset;
	long ret;

	ipipe = get_pipe_info(in);
	opipe = get_pipe_info(out);

	/* Case 1: both ends are pipes. */
	if (ipipe && opipe) {
		if (off_in || off_out)
			return -ESPIPE;

		if (!(in->f_mode & FMODE_READ))
			return -EBADF;

		if (!(out->f_mode & FMODE_WRITE))
			return -EBADF;

		/* Splicing to self would be fun, but... */
		if (ipipe == opipe)
			return -EINVAL;

		return splice_pipe_to_pipe(ipipe, opipe, len, flags);
	}

	/* Case 2: pipe -> file. */
	if (ipipe) {
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			/* An explicit offset requires FMODE_PWRITE. */
			if (!(out->f_mode & FMODE_PWRITE))
				return -EINVAL;
			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
				return -EFAULT;
		} else {
			offset = out->f_pos;
		}

		if (unlikely(!(out->f_mode & FMODE_WRITE)))
			return -EBADF;

		if (unlikely(out->f_flags & O_APPEND))
			return -EINVAL;

		ret = rw_verify_area(WRITE, out, &offset, len);
		if (unlikely(ret < 0))
			return ret;

		file_start_write(out);
		ret = do_splice_from(ipipe, out, &offset, len, flags);
		file_end_write(out);

		/* Store the new offset back where the old one came from. */
		if (!off_out)
			out->f_pos = offset;
		else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	/* Case 3: file -> pipe. */
	if (opipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			/* An explicit offset requires FMODE_PREAD. */
			if (!(in->f_mode & FMODE_PREAD))
				return -EINVAL;
			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
				return -EFAULT;
		} else {
			offset = in->f_pos;
		}

		/* Wait for room and fill the pipe under the pipe lock. */
		pipe_lock(opipe);
		ret = wait_for_space(opipe, flags);
		if (!ret)
			ret = do_splice_to(in, &offset, opipe, len, flags);
		pipe_unlock(opipe);
		if (ret > 0)
			wakeup_pipe_readers(opipe);
		/* Store the new offset back where the old one came from. */
		if (!off_in)
			in->f_pos = offset;
		else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	return -EINVAL;
}

/*
 * Add the pages behind the iterator @from to @pipe, one pipe buffer per
 * page, using user_page_pipe_buf_ops and @flags as the buffer flags.
 * Pages are obtained up to 16 at a time with iov_iter_get_pages().
 * Returns the number of bytes queued if any were, otherwise the last
 * status from iov_iter_get_pages() or add_to_pipe().
 */
static int iter_to_pipe(struct iov_iter *from,
			struct pipe_inode_info *pipe,
			unsigned flags)
{
	struct pipe_buffer buf = {
		.ops = &user_page_pipe_buf_ops,
		.flags = flags
	};
	size_t total = 0;
	int ret = 0;
	bool failed = false;

	while (iov_iter_count(from) && !failed) {
		struct page *pages[16];
		ssize_t copied;
		size_t start;
		int n;

		copied = iov_iter_get_pages(from, pages, ~0UL, 16, &start);
		if (copied <= 0) {
			ret = copied;
			break;
		}

		/* Only the first page of a batch has a non-zero start. */
		for (n = 0; copied; n++, start = 0) {
			int size = min_t(int, copied, PAGE_SIZE - start);
			if (!failed) {
				buf.page = pages[n];
				buf.offset = start;
				buf.len = size;
				ret = add_to_pipe(pipe, &buf);
				if (unlikely(ret < 0)) {
					failed = true;
				} else {
					iov_iter_advance(from, ret);
					total += ret;
				}
			} else {
				/*
				 * A previous add_to_pipe() failed: drop the
				 * remaining pages of this batch.
				 */
				put_page(pages[n]);
			}
			copied -= size;
		}
	}
	return total ? total : ret;
}

/*
 * splice_actor: copy sd->len bytes of the buffer's page, starting at
 * buf->offset, into the iov_iter at sd->u.data. A short copy is
 * reported as -EFAULT.
 */
static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
	return n == sd->len ? n : -EFAULT;
}

/*
 * For lack of a better implementation, implement vmsplice() to userspace
 * as a simple copy of the pipes pages to the user iov.
1264 */ 1265 static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, 1266 unsigned long nr_segs, unsigned int flags) 1267 { 1268 struct pipe_inode_info *pipe; 1269 struct splice_desc sd; 1270 long ret; 1271 struct iovec iovstack[UIO_FASTIOV]; 1272 struct iovec *iov = iovstack; 1273 struct iov_iter iter; 1274 1275 pipe = get_pipe_info(file); 1276 if (!pipe) 1277 return -EBADF; 1278 1279 ret = import_iovec(READ, uiov, nr_segs, 1280 ARRAY_SIZE(iovstack), &iov, &iter); 1281 if (ret < 0) 1282 return ret; 1283 1284 sd.total_len = iov_iter_count(&iter); 1285 sd.len = 0; 1286 sd.flags = flags; 1287 sd.u.data = &iter; 1288 sd.pos = 0; 1289 1290 if (sd.total_len) { 1291 pipe_lock(pipe); 1292 ret = __splice_from_pipe(pipe, &sd, pipe_to_user); 1293 pipe_unlock(pipe); 1294 } 1295 1296 kfree(iov); 1297 return ret; 1298 } 1299 1300 /* 1301 * vmsplice splices a user address range into a pipe. It can be thought of 1302 * as splice-from-memory, where the regular splice is splice-from-file (or 1303 * to file). In both cases the output is a pipe, naturally. 
1304 */ 1305 static long vmsplice_to_pipe(struct file *file, const struct iovec __user *uiov, 1306 unsigned long nr_segs, unsigned int flags) 1307 { 1308 struct pipe_inode_info *pipe; 1309 struct iovec iovstack[UIO_FASTIOV]; 1310 struct iovec *iov = iovstack; 1311 struct iov_iter from; 1312 long ret; 1313 unsigned buf_flag = 0; 1314 1315 if (flags & SPLICE_F_GIFT) 1316 buf_flag = PIPE_BUF_FLAG_GIFT; 1317 1318 pipe = get_pipe_info(file); 1319 if (!pipe) 1320 return -EBADF; 1321 1322 ret = import_iovec(WRITE, uiov, nr_segs, 1323 ARRAY_SIZE(iovstack), &iov, &from); 1324 if (ret < 0) 1325 return ret; 1326 1327 pipe_lock(pipe); 1328 ret = wait_for_space(pipe, flags); 1329 if (!ret) 1330 ret = iter_to_pipe(&from, pipe, buf_flag); 1331 pipe_unlock(pipe); 1332 if (ret > 0) 1333 wakeup_pipe_readers(pipe); 1334 kfree(iov); 1335 return ret; 1336 } 1337 1338 /* 1339 * Note that vmsplice only really supports true splicing _from_ user memory 1340 * to a pipe, not the other way around. Splicing from user memory is a simple 1341 * operation that can be supported without any funky alignment restrictions 1342 * or nasty vm tricks. We simply map in the user memory and fill them into 1343 * a pipe. The reverse isn't quite as easy, though. There are two possible 1344 * solutions for that: 1345 * 1346 * - memcpy() the data internally, at which point we might as well just 1347 * do a regular read() on the buffer anyway. 1348 * - Lots of nasty vm tricks, that are neither fast nor flexible (it 1349 * has restriction limitations on both ends of the pipe). 1350 * 1351 * Currently we punt and implement it as a normal copy, see pipe_to_user(). 
1352 * 1353 */ 1354 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, 1355 unsigned long, nr_segs, unsigned int, flags) 1356 { 1357 struct fd f; 1358 long error; 1359 1360 if (unlikely(nr_segs > UIO_MAXIOV)) 1361 return -EINVAL; 1362 else if (unlikely(!nr_segs)) 1363 return 0; 1364 1365 error = -EBADF; 1366 f = fdget(fd); 1367 if (f.file) { 1368 if (f.file->f_mode & FMODE_WRITE) 1369 error = vmsplice_to_pipe(f.file, iov, nr_segs, flags); 1370 else if (f.file->f_mode & FMODE_READ) 1371 error = vmsplice_to_user(f.file, iov, nr_segs, flags); 1372 1373 fdput(f); 1374 } 1375 1376 return error; 1377 } 1378 1379 #ifdef CONFIG_COMPAT 1380 COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32, 1381 unsigned int, nr_segs, unsigned int, flags) 1382 { 1383 unsigned i; 1384 struct iovec __user *iov; 1385 if (nr_segs > UIO_MAXIOV) 1386 return -EINVAL; 1387 iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec)); 1388 for (i = 0; i < nr_segs; i++) { 1389 struct compat_iovec v; 1390 if (get_user(v.iov_base, &iov32[i].iov_base) || 1391 get_user(v.iov_len, &iov32[i].iov_len) || 1392 put_user(compat_ptr(v.iov_base), &iov[i].iov_base) || 1393 put_user(v.iov_len, &iov[i].iov_len)) 1394 return -EFAULT; 1395 } 1396 return sys_vmsplice(fd, iov, nr_segs, flags); 1397 } 1398 #endif 1399 1400 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, 1401 int, fd_out, loff_t __user *, off_out, 1402 size_t, len, unsigned int, flags) 1403 { 1404 struct fd in, out; 1405 long error; 1406 1407 if (unlikely(!len)) 1408 return 0; 1409 1410 error = -EBADF; 1411 in = fdget(fd_in); 1412 if (in.file) { 1413 if (in.file->f_mode & FMODE_READ) { 1414 out = fdget(fd_out); 1415 if (out.file) { 1416 if (out.file->f_mode & FMODE_WRITE) 1417 error = do_splice(in.file, off_in, 1418 out.file, off_out, 1419 len, flags); 1420 fdput(out); 1421 } 1422 } 1423 fdput(in); 1424 } 1425 return error; 1426 } 1427 1428 /* 1429 * Make sure there's data to read. 
Wait for input if we can, otherwise 1430 * return an appropriate error. 1431 */ 1432 static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) 1433 { 1434 int ret; 1435 1436 /* 1437 * Check ->nrbufs without the inode lock first. This function 1438 * is speculative anyways, so missing one is ok. 1439 */ 1440 if (pipe->nrbufs) 1441 return 0; 1442 1443 ret = 0; 1444 pipe_lock(pipe); 1445 1446 while (!pipe->nrbufs) { 1447 if (signal_pending(current)) { 1448 ret = -ERESTARTSYS; 1449 break; 1450 } 1451 if (!pipe->writers) 1452 break; 1453 if (!pipe->waiting_writers) { 1454 if (flags & SPLICE_F_NONBLOCK) { 1455 ret = -EAGAIN; 1456 break; 1457 } 1458 } 1459 pipe_wait(pipe); 1460 } 1461 1462 pipe_unlock(pipe); 1463 return ret; 1464 } 1465 1466 /* 1467 * Make sure there's writeable room. Wait for room if we can, otherwise 1468 * return an appropriate error. 1469 */ 1470 static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) 1471 { 1472 int ret; 1473 1474 /* 1475 * Check ->nrbufs without the inode lock first. This function 1476 * is speculative anyways, so missing one is ok. 1477 */ 1478 if (pipe->nrbufs < pipe->buffers) 1479 return 0; 1480 1481 ret = 0; 1482 pipe_lock(pipe); 1483 1484 while (pipe->nrbufs >= pipe->buffers) { 1485 if (!pipe->readers) { 1486 send_sig(SIGPIPE, current, 0); 1487 ret = -EPIPE; 1488 break; 1489 } 1490 if (flags & SPLICE_F_NONBLOCK) { 1491 ret = -EAGAIN; 1492 break; 1493 } 1494 if (signal_pending(current)) { 1495 ret = -ERESTARTSYS; 1496 break; 1497 } 1498 pipe->waiting_writers++; 1499 pipe_wait(pipe); 1500 pipe->waiting_writers--; 1501 } 1502 1503 pipe_unlock(pipe); 1504 return ret; 1505 } 1506 1507 /* 1508 * Splice contents of ipipe to opipe. 
1509 */ 1510 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, 1511 struct pipe_inode_info *opipe, 1512 size_t len, unsigned int flags) 1513 { 1514 struct pipe_buffer *ibuf, *obuf; 1515 int ret = 0, nbuf; 1516 bool input_wakeup = false; 1517 1518 1519 retry: 1520 ret = ipipe_prep(ipipe, flags); 1521 if (ret) 1522 return ret; 1523 1524 ret = opipe_prep(opipe, flags); 1525 if (ret) 1526 return ret; 1527 1528 /* 1529 * Potential ABBA deadlock, work around it by ordering lock 1530 * grabbing by pipe info address. Otherwise two different processes 1531 * could deadlock (one doing tee from A -> B, the other from B -> A). 1532 */ 1533 pipe_double_lock(ipipe, opipe); 1534 1535 do { 1536 if (!opipe->readers) { 1537 send_sig(SIGPIPE, current, 0); 1538 if (!ret) 1539 ret = -EPIPE; 1540 break; 1541 } 1542 1543 if (!ipipe->nrbufs && !ipipe->writers) 1544 break; 1545 1546 /* 1547 * Cannot make any progress, because either the input 1548 * pipe is empty or the output pipe is full. 1549 */ 1550 if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) { 1551 /* Already processed some buffers, break */ 1552 if (ret) 1553 break; 1554 1555 if (flags & SPLICE_F_NONBLOCK) { 1556 ret = -EAGAIN; 1557 break; 1558 } 1559 1560 /* 1561 * We raced with another reader/writer and haven't 1562 * managed to process any buffers. A zero return 1563 * value means EOF, so retry instead. 
1564 */ 1565 pipe_unlock(ipipe); 1566 pipe_unlock(opipe); 1567 goto retry; 1568 } 1569 1570 ibuf = ipipe->bufs + ipipe->curbuf; 1571 nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); 1572 obuf = opipe->bufs + nbuf; 1573 1574 if (len >= ibuf->len) { 1575 /* 1576 * Simply move the whole buffer from ipipe to opipe 1577 */ 1578 *obuf = *ibuf; 1579 ibuf->ops = NULL; 1580 opipe->nrbufs++; 1581 ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1); 1582 ipipe->nrbufs--; 1583 input_wakeup = true; 1584 } else { 1585 /* 1586 * Get a reference to this pipe buffer, 1587 * so we can copy the contents over. 1588 */ 1589 pipe_buf_get(ipipe, ibuf); 1590 *obuf = *ibuf; 1591 1592 /* 1593 * Don't inherit the gift flag, we need to 1594 * prevent multiple steals of this page. 1595 */ 1596 obuf->flags &= ~PIPE_BUF_FLAG_GIFT; 1597 1598 obuf->len = len; 1599 opipe->nrbufs++; 1600 ibuf->offset += obuf->len; 1601 ibuf->len -= obuf->len; 1602 } 1603 ret += obuf->len; 1604 len -= obuf->len; 1605 } while (len); 1606 1607 pipe_unlock(ipipe); 1608 pipe_unlock(opipe); 1609 1610 /* 1611 * If we put data in the output pipe, wakeup any potential readers. 1612 */ 1613 if (ret > 0) 1614 wakeup_pipe_readers(opipe); 1615 1616 if (input_wakeup) 1617 wakeup_pipe_writers(ipipe); 1618 1619 return ret; 1620 } 1621 1622 /* 1623 * Link contents of ipipe to opipe. 1624 */ 1625 static int link_pipe(struct pipe_inode_info *ipipe, 1626 struct pipe_inode_info *opipe, 1627 size_t len, unsigned int flags) 1628 { 1629 struct pipe_buffer *ibuf, *obuf; 1630 int ret = 0, i = 0, nbuf; 1631 1632 /* 1633 * Potential ABBA deadlock, work around it by ordering lock 1634 * grabbing by pipe info address. Otherwise two different processes 1635 * could deadlock (one doing tee from A -> B, the other from B -> A). 
1636 */ 1637 pipe_double_lock(ipipe, opipe); 1638 1639 do { 1640 if (!opipe->readers) { 1641 send_sig(SIGPIPE, current, 0); 1642 if (!ret) 1643 ret = -EPIPE; 1644 break; 1645 } 1646 1647 /* 1648 * If we have iterated all input buffers or ran out of 1649 * output room, break. 1650 */ 1651 if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) 1652 break; 1653 1654 ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1)); 1655 nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1); 1656 1657 /* 1658 * Get a reference to this pipe buffer, 1659 * so we can copy the contents over. 1660 */ 1661 pipe_buf_get(ipipe, ibuf); 1662 1663 obuf = opipe->bufs + nbuf; 1664 *obuf = *ibuf; 1665 1666 /* 1667 * Don't inherit the gift flag, we need to 1668 * prevent multiple steals of this page. 1669 */ 1670 obuf->flags &= ~PIPE_BUF_FLAG_GIFT; 1671 1672 if (obuf->len > len) 1673 obuf->len = len; 1674 1675 opipe->nrbufs++; 1676 ret += obuf->len; 1677 len -= obuf->len; 1678 i++; 1679 } while (len); 1680 1681 /* 1682 * return EAGAIN if we have the potential of some data in the 1683 * future, otherwise just return 0 1684 */ 1685 if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK)) 1686 ret = -EAGAIN; 1687 1688 pipe_unlock(ipipe); 1689 pipe_unlock(opipe); 1690 1691 /* 1692 * If we put data in the output pipe, wakeup any potential readers. 1693 */ 1694 if (ret > 0) 1695 wakeup_pipe_readers(opipe); 1696 1697 return ret; 1698 } 1699 1700 /* 1701 * This is a tee(1) implementation that works on pipes. It doesn't copy 1702 * any data, it simply references the 'in' pages on the 'out' pipe. 1703 * The 'flags' used are the SPLICE_F_* variants, currently the only 1704 * applicable one is SPLICE_F_NONBLOCK. 
1705 */ 1706 static long do_tee(struct file *in, struct file *out, size_t len, 1707 unsigned int flags) 1708 { 1709 struct pipe_inode_info *ipipe = get_pipe_info(in); 1710 struct pipe_inode_info *opipe = get_pipe_info(out); 1711 int ret = -EINVAL; 1712 1713 /* 1714 * Duplicate the contents of ipipe to opipe without actually 1715 * copying the data. 1716 */ 1717 if (ipipe && opipe && ipipe != opipe) { 1718 /* 1719 * Keep going, unless we encounter an error. The ipipe/opipe 1720 * ordering doesn't really matter. 1721 */ 1722 ret = ipipe_prep(ipipe, flags); 1723 if (!ret) { 1724 ret = opipe_prep(opipe, flags); 1725 if (!ret) 1726 ret = link_pipe(ipipe, opipe, len, flags); 1727 } 1728 } 1729 1730 return ret; 1731 } 1732 1733 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) 1734 { 1735 struct fd in; 1736 int error; 1737 1738 if (unlikely(!len)) 1739 return 0; 1740 1741 error = -EBADF; 1742 in = fdget(fdin); 1743 if (in.file) { 1744 if (in.file->f_mode & FMODE_READ) { 1745 struct fd out = fdget(fdout); 1746 if (out.file) { 1747 if (out.file->f_mode & FMODE_WRITE) 1748 error = do_tee(in.file, out.file, 1749 len, flags); 1750 fdput(out); 1751 } 1752 } 1753 fdput(in); 1754 } 1755 1756 return error; 1757 } 1758