/*
 * "splice": joining two ropes together by interweaving their strands.
 *
 * This is the "extended pipe" functionality, where a pipe is used as
 * an arbitrary in-memory buffer. Think of a pipe as a small kernel
 * buffer that you can use to transfer data from one end to the other.
 *
 * The traditional unix read/write is extended with a "splice()" operation
 * that transfers data buffers to or from a pipe buffer.
 *
 * Named by Larry McVoy, original implementation from Linus, extended by
 * Jens to support splicing to files, network, direct splicing, etc. and
 * fixing lots of bugs.
 *
 * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
 * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
 * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
 *
 */
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/splice.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/export.h>
#include <linux/syscalls.h>
#include <linux/uio.h>
#include <linux/security.h>
#include <linux/gfp.h>
#include <linux/socket.h>
#include <linux/compat.h>
#include "internal.h"

/*
 * Attempt to steal a page from a pipe buffer. This should perhaps go into
 * a vm helper function, it's already simplified quite a bit by the
 * addition of remove_mapping(). If success is returned, the caller may
 * attempt to reuse this page for another destination.
 */
static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	struct address_space *mapping;

	lock_page(page);

	mapping = page_mapping(page);
	if (mapping) {
		WARN_ON(!PageUptodate(page));

		/*
		 * At least for ext2 with nobh option, we need to wait on
		 * writeback completing on this page, since we'll remove it
		 * from the pagecache. Otherwise truncate won't wait on the
		 * page, allowing the disk blocks to be reused by someone else
		 * before we actually wrote our data to them. Filesystem
		 * corruption ensues.
		 */
		wait_on_page_writeback(page);

		if (page_has_private(page) &&
		    !try_to_release_page(page, GFP_KERNEL))
			goto out_unlock;

		/*
		 * If we succeeded in removing the mapping, set LRU flag
		 * and return good.
		 */
		if (remove_mapping(mapping, page)) {
			buf->flags |= PIPE_BUF_FLAG_LRU;
			return 0;
		}
	}

	/*
	 * Raced with truncate or failed to remove page from current
	 * address space, unlock and return failure.
	 */
out_unlock:
	unlock_page(page);
	return 1;
}

static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
					struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
	buf->flags &= ~PIPE_BUF_FLAG_LRU;
}
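
/*
 * Aside (illustrative, not part of this file's API): page stealing is
 * what lets a gifted vmsplice() page be moved into the page cache
 * without a copy. A userspace sketch of the pairing, assuming a file
 * descriptor fd and a pipe p[2] already set up:
 *
 *	struct iovec iov = { .iov_base = buf, .iov_len = 4096 };
 *
 *	vmsplice(p[1], &iov, 1, SPLICE_F_GIFT);	// gift the page to the pipe
 *	splice(p[0], NULL, fd, NULL, 4096, SPLICE_F_MOVE); // may steal it
 *
 * Only gifted, page-aligned buffers are candidates for stealing;
 * otherwise the data is copied as usual.
 */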

/*
 * Check whether the contents of buf are OK to access. Since the content
 * is a page cache page, IO may be in flight.
 */
static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
				       struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	int err;

	if (!PageUptodate(page)) {
		lock_page(page);

		/*
		 * Page got truncated/unhashed. This will cause a 0-byte
		 * splice, if this is the first page.
		 */
		if (!page->mapping) {
			err = -ENODATA;
			goto error;
		}

		/*
		 * Uh oh, read-error from disk.
		 */
		if (!PageUptodate(page)) {
			err = -EIO;
			goto error;
		}

		/*
		 * Page is OK after all, we are done.
		 */
		unlock_page(page);
	}

	return 0;
error:
	unlock_page(page);
	return err;
}

const struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.can_merge = 0,
	.confirm = page_cache_pipe_buf_confirm,
	.release = page_cache_pipe_buf_release,
	.steal = page_cache_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
		return 1;

	buf->flags |= PIPE_BUF_FLAG_LRU;
	return generic_pipe_buf_steal(pipe, buf);
}

static const struct pipe_buf_operations user_page_pipe_buf_ops = {
	.can_merge = 0,
	.confirm = generic_pipe_buf_confirm,
	.release = page_cache_pipe_buf_release,
	.steal = user_page_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
{
	smp_mb();
	if (waitqueue_active(&pipe->wait))
		wake_up_interruptible(&pipe->wait);
	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}

/**
 * splice_to_pipe - fill passed data into a pipe
 * @pipe:	pipe to fill
 * @spd:	data to fill
 *
 * Description:
 *    @spd contains a map of pages and len/offset tuples, along with
 *    the struct pipe_buf_operations associated with these pages. This
 *    function will link that data to the pipe.
 *
 */
ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
		       struct splice_pipe_desc *spd)
{
	unsigned int spd_pages = spd->nr_pages;
	int ret, do_wakeup, page_nr;

	ret = 0;
	do_wakeup = 0;
	page_nr = 0;

	pipe_lock(pipe);

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		if (pipe->nrbufs < pipe->buffers) {
			int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;

			buf->page = spd->pages[page_nr];
			buf->offset = spd->partial[page_nr].offset;
			buf->len = spd->partial[page_nr].len;
			buf->private = spd->partial[page_nr].private;
			buf->ops = spd->ops;
			if (spd->flags & SPLICE_F_GIFT)
				buf->flags |= PIPE_BUF_FLAG_GIFT;

			pipe->nrbufs++;
			page_nr++;
			ret += buf->len;

			if (pipe->files)
				do_wakeup = 1;

			if (!--spd->nr_pages)
				break;
			if (pipe->nrbufs < pipe->buffers)
				continue;

			break;
		}

		if (spd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}

		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	pipe_unlock(pipe);

	if (do_wakeup)
		wakeup_pipe_readers(pipe);

	while (page_nr < spd_pages)
		spd->spd_release(spd, page_nr++);

	return ret;
}

void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
{
	page_cache_release(spd->pages[i]);
}
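
/*
 * Illustrative sketch (not compiled here): a minimal producer that hands
 * one already-referenced page to a pipe via splice_to_pipe(). Assumes the
 * caller holds a reference on page that spd_release_page() may drop:
 *
 *	struct page *pages[1] = { page };
 *	struct partial_page partial[1] = {
 *		{ .offset = 0, .len = PAGE_CACHE_SIZE },
 *	};
 *	struct splice_pipe_desc spd = {
 *		.pages		= pages,
 *		.partial	= partial,
 *		.nr_pages	= 1,
 *		.nr_pages_max	= 1,
 *		.ops		= &page_cache_pipe_buf_ops,
 *		.spd_release	= spd_release_page,
 *	};
 *
 *	ret = splice_to_pipe(pipe, &spd);
 *
 * splice_to_pipe() consumes what fits and releases the rest through
 * ->spd_release(), so the producer does not have to clean up after a
 * short splice.
 */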

/*
 * Check if we need to grow the arrays holding pages and partial page
 * descriptions.
 */
int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
{
	unsigned int buffers = ACCESS_ONCE(pipe->buffers);

	spd->nr_pages_max = buffers;
	if (buffers <= PIPE_DEF_BUFFERS)
		return 0;

	spd->pages = kmalloc(buffers * sizeof(struct page *), GFP_KERNEL);
	spd->partial = kmalloc(buffers * sizeof(struct partial_page), GFP_KERNEL);

	if (spd->pages && spd->partial)
		return 0;

	kfree(spd->pages);
	kfree(spd->partial);
	return -ENOMEM;
}

void splice_shrink_spd(struct splice_pipe_desc *spd)
{
	if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
		return;

	kfree(spd->pages);
	kfree(spd->partial);
}

static int
__generic_file_splice_read(struct file *in, loff_t *ppos,
			   struct pipe_inode_info *pipe, size_t len,
			   unsigned int flags)
{
	struct address_space *mapping = in->f_mapping;
	unsigned int loff, nr_pages, req_pages;
	struct page *pages[PIPE_DEF_BUFFERS];
	struct partial_page partial[PIPE_DEF_BUFFERS];
	struct page *page;
	pgoff_t index, end_index;
	loff_t isize;
	int error, page_nr;
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.nr_pages_max = PIPE_DEF_BUFFERS,
		.flags = flags,
		.ops = &page_cache_pipe_buf_ops,
		.spd_release = spd_release_page,
	};

	if (splice_grow_spd(pipe, &spd))
		return -ENOMEM;

	index = *ppos >> PAGE_CACHE_SHIFT;
	loff = *ppos & ~PAGE_CACHE_MASK;
	req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	nr_pages = min(req_pages, spd.nr_pages_max);

	/*
	 * Look up the (hopefully) full range of pages we need.
	 */
	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
	index += spd.nr_pages;

	/*
	 * If find_get_pages_contig() returned fewer pages than we needed,
	 * readahead/allocate the rest and fill in the holes.
	 */
	if (spd.nr_pages < nr_pages)
		page_cache_sync_readahead(mapping, &in->f_ra, in,
				index, req_pages - spd.nr_pages);

	error = 0;
	while (spd.nr_pages < nr_pages) {
		/*
		 * Page could be there, find_get_pages_contig() breaks on
		 * the first hole.
		 */
		page = find_get_page(mapping, index);
		if (!page) {
			/*
			 * Page didn't exist, allocate one.
			 */
			page = page_cache_alloc_cold(mapping);
			if (!page)
				break;

			error = add_to_page_cache_lru(page, mapping, index,
						GFP_KERNEL);
			if (unlikely(error)) {
				page_cache_release(page);
				if (error == -EEXIST)
					continue;
				break;
			}
			/*
			 * add_to_page_cache() locks the page, unlock it
			 * to avoid convoluting the logic below even more.
			 */
			unlock_page(page);
		}

		spd.pages[spd.nr_pages++] = page;
		index++;
	}
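
	/*
	 * Worked example (illustrative): with 4K pages, *ppos = 5000 and
	 * len = 10000 give index = 1, loff = 904, req_pages = 3. Page 1
	 * then contributes bytes 904..4095, page 2 a full page, and page 3
	 * the remaining 2712 bytes; the loop below computes exactly these
	 * per-page (offset, len) pairs for spd.partial[].
	 */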

	/*
	 * Now loop over the map and see if we need to start IO on any
	 * pages, fill in the partial map, etc.
	 */
	index = *ppos >> PAGE_CACHE_SHIFT;
	nr_pages = spd.nr_pages;
	spd.nr_pages = 0;
	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
		unsigned int this_len;

		if (!len)
			break;

		/*
		 * this_len is the max we'll use from this page
		 */
		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
		page = spd.pages[page_nr];

		if (PageReadahead(page))
			page_cache_async_readahead(mapping, &in->f_ra, in,
					page, index, req_pages - page_nr);

		/*
		 * If the page isn't uptodate, we may need to start IO on it
		 */
		if (!PageUptodate(page)) {
			lock_page(page);

			/*
			 * Page was truncated, or invalidated by the
			 * filesystem. Redo the find/create, but this time the
			 * page is kept locked, so there's no chance of another
			 * race with truncate/invalidate.
			 */
			if (!page->mapping) {
				unlock_page(page);
				page = find_or_create_page(mapping, index,
						mapping_gfp_mask(mapping));

				if (!page) {
					error = -ENOMEM;
					break;
				}
				page_cache_release(spd.pages[page_nr]);
				spd.pages[page_nr] = page;
			}
			/*
			 * Page was already under IO and is now done, great
			 */
			if (PageUptodate(page)) {
				unlock_page(page);
				goto fill_it;
			}

			/*
			 * Need to read in the page
			 */
			error = mapping->a_ops->readpage(in, page);
			if (unlikely(error)) {
				/*
				 * We really should re-lookup the page here,
				 * but it complicates things a lot. Instead
				 * let's just do what we already stored, and
				 * we'll get it the next time we are called.
				 */
				if (error == AOP_TRUNCATED_PAGE)
					error = 0;

				break;
			}
		}
fill_it:
		/*
		 * i_size must be checked after PageUptodate.
		 */
		isize = i_size_read(mapping->host);
		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
		if (unlikely(!isize || index > end_index))
			break;

		/*
		 * If this is the last page, see if we need to shrink
		 * the length and stop
		 */
		if (end_index == index) {
			unsigned int plen;

			/*
			 * max good bytes in this page
			 */
			plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (plen <= loff)
				break;

			/*
			 * force quit after adding this page
			 */
			this_len = min(this_len, plen - loff);
			len = this_len;
		}

		spd.partial[page_nr].offset = loff;
		spd.partial[page_nr].len = this_len;
		len -= this_len;
		loff = 0;
		spd.nr_pages++;
		index++;
	}

	/*
	 * Release any pages at the end, if we quit early. 'page_nr' is how far
	 * we got, 'nr_pages' is how many pages are in the map.
	 */
	while (page_nr < nr_pages)
		page_cache_release(spd.pages[page_nr++]);
	in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;

	if (spd.nr_pages)
		error = splice_to_pipe(pipe, &spd);

	splice_shrink_spd(&spd);
	return error;
}
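
/*
 * Usage note (illustrative, with a hypothetical filesystem "foofs"): a
 * filesystem whose address_space operations provide ->readpage() can
 * typically enable splice with nothing more than the generic helpers,
 * using the generic_file_splice_read() wrapper below:
 *
 *	const struct file_operations foofs_file_operations = {
 *		...
 *		.splice_read	= generic_file_splice_read,
 *		.splice_write	= generic_file_splice_write,
 *	};
 */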

/**
 * generic_file_splice_read - splice data from file to a pipe
 * @in:		file to splice from
 * @ppos:	position in @in
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will read pages from given file and fill them into a pipe. Can be
 *    used as long as the address_space operations for the source implement
 *    a readpage() hook.
 *
 */
ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
				 struct pipe_inode_info *pipe, size_t len,
				 unsigned int flags)
{
	loff_t isize, left;
	int ret;

	isize = i_size_read(in->f_mapping->host);
	if (unlikely(*ppos >= isize))
		return 0;

	left = isize - *ppos;
	if (unlikely(left < len))
		len = left;

	ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
	if (ret > 0) {
		*ppos += ret;
		file_accessed(in);
	}

	return ret;
}
EXPORT_SYMBOL(generic_file_splice_read);

static const struct pipe_buf_operations default_pipe_buf_ops = {
	.can_merge = 0,
	.confirm = generic_pipe_buf_confirm,
	.release = generic_pipe_buf_release,
	.steal = generic_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	return 1;
}

/* Pipe buffer operations for a socket and similar. */
const struct pipe_buf_operations nosteal_pipe_buf_ops = {
	.can_merge = 0,
	.confirm = generic_pipe_buf_confirm,
	.release = generic_pipe_buf_release,
	.steal = generic_pipe_buf_nosteal,
	.get = generic_pipe_buf_get,
};
EXPORT_SYMBOL(nosteal_pipe_buf_ops);

static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
			    unsigned long vlen, loff_t offset)
{
	mm_segment_t old_fs;
	loff_t pos = offset;
	ssize_t res;

	old_fs = get_fs();
	set_fs(get_ds());
	/* The cast to a user pointer is valid due to the set_fs() */
	res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos);
	set_fs(old_fs);

	return res;
}

ssize_t kernel_write(struct file *file, const char *buf, size_t count,
		     loff_t pos)
{
	mm_segment_t old_fs;
	ssize_t res;

	old_fs = get_fs();
	set_fs(get_ds());
	/* The cast to a user pointer is valid due to the set_fs() */
	res = vfs_write(file, (__force const char __user *)buf, count, &pos);
	set_fs(old_fs);

	return res;
}
EXPORT_SYMBOL(kernel_write);
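
/*
 * Illustrative sketch (assumes a module already holds an open
 * struct file *filp it is allowed to write to): kernel_write() lets
 * kernel code reuse the regular write path with a kernel buffer, e.g.
 *
 *	static const char msg[] = "hello\n";
 *	ssize_t n = kernel_write(filp, msg, sizeof(msg) - 1, 0);
 *	if (n < 0)
 *		pr_err("write failed: %zd\n", n);
 *
 * The set_fs() dance above is what makes the kernel pointer acceptable
 * to vfs_write()'s user-pointer checks.
 */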

ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
				 struct pipe_inode_info *pipe, size_t len,
				 unsigned int flags)
{
	unsigned int nr_pages;
	unsigned int nr_freed;
	size_t offset;
	struct page *pages[PIPE_DEF_BUFFERS];
	struct partial_page partial[PIPE_DEF_BUFFERS];
	struct iovec *vec, __vec[PIPE_DEF_BUFFERS];
	ssize_t res;
	size_t this_len;
	int error;
	int i;
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.nr_pages_max = PIPE_DEF_BUFFERS,
		.flags = flags,
		.ops = &default_pipe_buf_ops,
		.spd_release = spd_release_page,
	};

	if (splice_grow_spd(pipe, &spd))
		return -ENOMEM;

	res = -ENOMEM;
	vec = __vec;
	if (spd.nr_pages_max > PIPE_DEF_BUFFERS) {
		vec = kmalloc(spd.nr_pages_max * sizeof(struct iovec), GFP_KERNEL);
		if (!vec)
			goto shrink_ret;
	}

	offset = *ppos & ~PAGE_CACHE_MASK;
	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	for (i = 0; i < nr_pages && i < spd.nr_pages_max && len; i++) {
		struct page *page;

		page = alloc_page(GFP_USER);
		error = -ENOMEM;
		if (!page)
			goto err;

		this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
		vec[i].iov_base = (void __user *) page_address(page);
		vec[i].iov_len = this_len;
		spd.pages[i] = page;
		spd.nr_pages++;
		len -= this_len;
		offset = 0;
	}

	res = kernel_readv(in, vec, spd.nr_pages, *ppos);
	if (res < 0) {
		error = res;
		goto err;
	}

	error = 0;
	if (!res)
		goto err;

	nr_freed = 0;
	for (i = 0; i < spd.nr_pages; i++) {
		this_len = min_t(size_t, vec[i].iov_len, res);
		spd.partial[i].offset = 0;
		spd.partial[i].len = this_len;
		if (!this_len) {
			__free_page(spd.pages[i]);
			spd.pages[i] = NULL;
			nr_freed++;
		}
		res -= this_len;
	}
	spd.nr_pages -= nr_freed;

	res = splice_to_pipe(pipe, &spd);
	if (res > 0)
		*ppos += res;

shrink_ret:
	if (vec != __vec)
		kfree(vec);
	splice_shrink_spd(&spd);
	return res;

err:
	for (i = 0; i < spd.nr_pages; i++)
		__free_page(spd.pages[i]);

	res = error;
	goto shrink_ret;
}
EXPORT_SYMBOL(default_file_splice_read);

/*
 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 * using sendpage(). Return the number of bytes sent.
 */
static int pipe_to_sendpage(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, struct splice_desc *sd)
{
	struct file *file = sd->u.file;
	loff_t pos = sd->pos;
	int more;

	if (!likely(file->f_op->sendpage))
		return -EINVAL;

	more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;

	if (sd->len < sd->total_len && pipe->nrbufs > 1)
		more |= MSG_SENDPAGE_NOTLAST;

	return file->f_op->sendpage(file, buf->page, buf->offset,
				    sd->len, &pos, more);
}
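
/*
 * Illustrative userspace sketch (error handling elided): the classic
 * zero-copy "sendfile" pattern built from two splice() calls through a
 * pipe, which for sockets ends up in pipe_to_sendpage() above:
 *
 *	int p[2];
 *	pipe(p);
 *	while (left) {
 *		ssize_t n = splice(file_fd, NULL, p[1], NULL, left,
 *				   SPLICE_F_MORE | SPLICE_F_MOVE);
 *		splice(p[0], NULL, sock_fd, NULL, n,
 *		       SPLICE_F_MORE | SPLICE_F_MOVE);
 *		left -= n;
 *	}
 */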

/*
 * This is a little more tricky than the file -> pipe splicing. There are
 * basically three cases:
 *
 *	- Destination page already exists in the address space and there
 *	  are users of it. For that case we have no other option than
 *	  copying the data. Tough luck.
 *	- Destination page already exists in the address space, but there
 *	  are no users of it. Make sure it's uptodate, then drop it. Fall
 *	  through to last case.
 *	- Destination page does not exist, we can add the pipe page to
 *	  the page cache and avoid the copy.
 *
 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 * sd->flags), we attempt to migrate pages from the pipe to the output
 * file address space page cache. This is possible if no one else has
 * the pipe page referenced outside of the pipe and page cache. If
 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 * a new page in the output file page cache and fill/dirty that.
 */
int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
		 struct splice_desc *sd)
{
	struct file *file = sd->u.file;
	struct address_space *mapping = file->f_mapping;
	unsigned int offset, this_len;
	struct page *page;
	void *fsdata;
	int ret;

	offset = sd->pos & ~PAGE_CACHE_MASK;

	this_len = sd->len;
	if (this_len + offset > PAGE_CACHE_SIZE)
		this_len = PAGE_CACHE_SIZE - offset;

	ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
				AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
	if (unlikely(ret))
		goto out;

	if (buf->page != page) {
		char *src = kmap_atomic(buf->page);
		char *dst = kmap_atomic(page);

		memcpy(dst + offset, src + buf->offset, this_len);
		flush_dcache_page(page);
		kunmap_atomic(dst);
		kunmap_atomic(src);
	}
	ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
				page, fsdata);
out:
	return ret;
}
EXPORT_SYMBOL(pipe_to_file);

static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
{
	smp_mb();
	if (waitqueue_active(&pipe->wait))
		wake_up_interruptible(&pipe->wait);
	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}

/**
 * splice_from_pipe_feed - feed available data from a pipe to a file
 * @pipe:	pipe to splice from
 * @sd:		information to @actor
 * @actor:	handler that splices the data
 *
 * Description:
 *    This function loops over the pipe and calls @actor to do the
 *    actual moving of a single struct pipe_buffer to the desired
 *    destination. It returns when there are no more buffers left in
 *    the pipe or if the requested number of bytes (@sd->total_len)
 *    have been copied. It returns a positive number (one) if the
 *    pipe needs to be filled with more data, zero if the required
 *    number of bytes have been copied and -errno on error.
 *
 *    This, together with splice_from_pipe_{begin,end,next}, may be
 *    used to implement the functionality of __splice_from_pipe() when
 *    locking is required around copying the pipe buffers to the
 *    destination.
 */
int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
			  splice_actor *actor)
{
	int ret;

	while (pipe->nrbufs) {
		struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
		const struct pipe_buf_operations *ops = buf->ops;

		sd->len = buf->len;
		if (sd->len > sd->total_len)
			sd->len = sd->total_len;

		ret = buf->ops->confirm(pipe, buf);
		if (unlikely(ret)) {
			if (ret == -ENODATA)
				ret = 0;
			return ret;
		}

		ret = actor(pipe, buf, sd);
		if (ret <= 0)
			return ret;

		buf->offset += ret;
		buf->len -= ret;

		sd->num_spliced += ret;
		sd->len -= ret;
		sd->pos += ret;
		sd->total_len -= ret;

		if (!buf->len) {
			buf->ops = NULL;
			ops->release(pipe, buf);
			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
			pipe->nrbufs--;
			if (pipe->files)
				sd->need_wakeup = true;
		}

		if (!sd->total_len)
			return 0;
	}

	return 1;
}
EXPORT_SYMBOL(splice_from_pipe_feed);
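
/*
 * Illustrative sketch (hypothetical actor, not used anywhere in this
 * file): the @actor contract is "consume up to sd->len bytes of buf and
 * return how many were consumed, or <= 0 to stop". A trivial actor that
 * just discards the data would look like:
 *
 *	static int pipe_to_null(struct pipe_inode_info *pipe,
 *				struct pipe_buffer *buf,
 *				struct splice_desc *sd)
 *	{
 *		return sd->len;	// claim everything offered was consumed
 *	}
 *
 * splice_from_pipe_feed() then advances buf->offset/len and the
 * splice_desc bookkeeping by that return value.
 */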

/**
 * splice_from_pipe_next - wait for some data to splice from
 * @pipe:	pipe to splice from
 * @sd:		information about the splice operation
 *
 * Description:
 *    This function will wait for some data and return a positive
 *    value (one) if pipe buffers are available. It will return zero
 *    or -errno if no more data needs to be spliced.
 */
int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
	while (!pipe->nrbufs) {
		if (!pipe->writers)
			return 0;

		if (!pipe->waiting_writers && sd->num_spliced)
			return 0;

		if (sd->flags & SPLICE_F_NONBLOCK)
			return -EAGAIN;

		if (signal_pending(current))
			return -ERESTARTSYS;

		if (sd->need_wakeup) {
			wakeup_pipe_writers(pipe);
			sd->need_wakeup = false;
		}

		pipe_wait(pipe);
	}

	return 1;
}
EXPORT_SYMBOL(splice_from_pipe_next);

/**
 * splice_from_pipe_begin - start splicing from pipe
 * @sd:		information about the splice operation
 *
 * Description:
 *    This function should be called before a loop containing
 *    splice_from_pipe_next() and splice_from_pipe_feed() to
 *    initialize the necessary fields of @sd.
 */
void splice_from_pipe_begin(struct splice_desc *sd)
{
	sd->num_spliced = 0;
	sd->need_wakeup = false;
}
EXPORT_SYMBOL(splice_from_pipe_begin);

/**
 * splice_from_pipe_end - finish splicing from pipe
 * @pipe:	pipe to splice from
 * @sd:		information about the splice operation
 *
 * Description:
 *    This function will wake up pipe writers if necessary. It should
 *    be called after a loop containing splice_from_pipe_next() and
 *    splice_from_pipe_feed().
 */
void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
	if (sd->need_wakeup)
		wakeup_pipe_writers(pipe);
}
EXPORT_SYMBOL(splice_from_pipe_end);

/**
 * __splice_from_pipe - splice data from a pipe to given actor
 * @pipe:	pipe to splice from
 * @sd:		information to @actor
 * @actor:	handler that splices the data
 *
 * Description:
 *    This function does little more than loop over the pipe and call
 *    @actor to do the actual moving of a single struct pipe_buffer to
 *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
 *    pipe_to_user.
 *
 */
ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
			   splice_actor *actor)
{
	int ret;

	splice_from_pipe_begin(sd);
	do {
		ret = splice_from_pipe_next(pipe, sd);
		if (ret > 0)
			ret = splice_from_pipe_feed(pipe, sd, actor);
	} while (ret > 0);
	splice_from_pipe_end(pipe, sd);

	return sd->num_spliced ? sd->num_spliced : ret;
}
EXPORT_SYMBOL(__splice_from_pipe);

/**
 * splice_from_pipe - splice data from a pipe to a file
 * @pipe:	pipe to splice from
 * @out:	file to splice to
 * @ppos:	position in @out
 * @len:	how many bytes to splice
 * @flags:	splice modifier flags
 * @actor:	handler that splices the data
 *
 * Description:
 *    See __splice_from_pipe. This function locks the pipe inode,
 *    otherwise it's identical to __splice_from_pipe().
 *
 */
ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags,
			 splice_actor *actor)
{
	ssize_t ret;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};

	pipe_lock(pipe);
	ret = __splice_from_pipe(pipe, &sd, actor);
	pipe_unlock(pipe);

	return ret;
}

/**
 * generic_file_splice_write - splice data from a pipe to a file
 * @pipe:	pipe info
 * @out:	file to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file.
 *
 */
ssize_t
generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
			  loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};
	ssize_t ret;

	pipe_lock(pipe);

	splice_from_pipe_begin(&sd);
	do {
		ret = splice_from_pipe_next(pipe, &sd);
		if (ret <= 0)
			break;

		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
		ret = file_remove_suid(out);
		if (!ret) {
			ret = file_update_time(out);
			if (!ret)
				ret = splice_from_pipe_feed(pipe, &sd,
							    pipe_to_file);
		}
		mutex_unlock(&inode->i_mutex);
	} while (ret > 0);
	splice_from_pipe_end(pipe, &sd);

	pipe_unlock(pipe);

	if (sd.num_spliced)
		ret = sd.num_spliced;

	if (ret > 0) {
		int err;

		err = generic_write_sync(out, *ppos, ret);
		if (err)
			ret = err;
		else
			*ppos += ret;
		balance_dirty_pages_ratelimited(mapping);
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write);

static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			  struct splice_desc *sd)
{
	int ret;
	void *data;
	loff_t tmp = sd->pos;

	data = kmap(buf->page);
	ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);
	kunmap(buf->page);

	return ret;
}

static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
					 struct file *out, loff_t *ppos,
					 size_t len, unsigned int flags)
{
	ssize_t ret;

	ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
	if (ret > 0)
		*ppos += ret;

	return ret;
}

/**
 * generic_splice_sendpage - splice data from a pipe to a socket
 * @pipe:	pipe to splice from
 * @out:	socket to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will send @len bytes from the pipe to a network socket. No data copying
 *    is involved.
 *
 */
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
				loff_t *ppos, size_t len, unsigned int flags)
{
	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_splice_sendpage);

/*
 * Attempt to initiate a splice from pipe to file.
 */
static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
			   loff_t *ppos, size_t len, unsigned int flags)
{
	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
				loff_t *, size_t, unsigned int);

	if (out->f_op->splice_write)
		splice_write = out->f_op->splice_write;
	else
		splice_write = default_file_splice_write;

	return splice_write(pipe, out, ppos, len, flags);
}

/*
 * Attempt to initiate a splice from a file to a pipe.
 */
static long do_splice_to(struct file *in, loff_t *ppos,
			 struct pipe_inode_info *pipe, size_t len,
			 unsigned int flags)
{
	ssize_t (*splice_read)(struct file *, loff_t *,
			       struct pipe_inode_info *, size_t, unsigned int);
	int ret;

	if (unlikely(!(in->f_mode & FMODE_READ)))
		return -EBADF;

	ret = rw_verify_area(READ, in, ppos, len);
	if (unlikely(ret < 0))
		return ret;

	if (in->f_op->splice_read)
		splice_read = in->f_op->splice_read;
	else
		splice_read = default_file_splice_read;

	return splice_read(in, ppos, pipe, len, flags);
}
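
/*
 * Illustrative note: the two dispatchers above are what a sendfile(2)
 * call ultimately exercises. do_sendfile() calls do_splice_direct()
 * below, which pulls data in with do_splice_to() and pushes it out with
 * do_splice_from() through a per-task internal pipe, so
 *
 *	sendfile(out_fd, in_fd, NULL, count);
 *
 * behaves like a splice in + splice out pair without the extra system
 * call or a user-visible pipe.
 */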

/**
 * splice_direct_to_actor - splices data directly between two non-pipes
 * @in:		file to splice from
 * @sd:		actor information on where to splice to
 * @actor:	handles the data splicing
 *
 * Description:
 *    This is a special case helper to splice directly between two
 *    points, without requiring an explicit pipe. Internally an allocated
 *    pipe is cached in the process, and reused during the lifetime of
 *    that process.
 *
 */
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
			       splice_direct_actor *actor)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	umode_t i_mode;
	size_t len;
	int i, flags;

	/*
	 * We require the input being a regular file, as we don't want to
	 * randomly drop data for e.g. socket -> socket splicing. Use the
	 * piped splicing for that!
	 */
	i_mode = file_inode(in)->i_mode;
	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
		return -EINVAL;

	/*
	 * Neither in nor out is a pipe, set up an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	ret = 0;
	bytes = 0;
	len = sd->total_len;
	flags = sd->flags;

	/*
	 * Don't block on output, we have to drain the direct pipe.
	 */
	sd->flags &= ~SPLICE_F_NONBLOCK;

	while (len) {
		size_t read_len;
		loff_t pos = sd->pos, prev_pos = pos;

		ret = do_splice_to(in, &pos, pipe, len, flags);
		if (unlikely(ret <= 0))
			goto out_release;

		read_len = ret;
		sd->total_len = read_len;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = actor(pipe, sd);
		if (unlikely(ret <= 0)) {
			sd->pos = prev_pos;
			goto out_release;
		}

		bytes += ret;
		len -= ret;
		sd->pos = pos;

		if (ret < read_len) {
			sd->pos = prev_pos + ret;
			goto out_release;
		}
	}

done:
	pipe->nrbufs = pipe->curbuf = 0;
	file_accessed(in);
	return bytes;

out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < pipe->buffers; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;

		if (buf->ops) {
			buf->ops->release(pipe, buf);
			buf->ops = NULL;
		}
	}

	if (!bytes)
		bytes = ret;

	goto done;
}
EXPORT_SYMBOL(splice_direct_to_actor);

static int direct_splice_actor(struct pipe_inode_info *pipe,
			       struct splice_desc *sd)
{
	struct file *file = sd->u.file;

	return do_splice_from(pipe, file, sd->opos, sd->total_len,
			      sd->flags);
}

/**
 * do_splice_direct - splices data directly between two files
 * @in:		file to splice from
 * @ppos:	input file offset
 * @out:	file to splice to
 * @opos:	output file offset
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    For use by do_sendfile(). splice can easily emulate sendfile, but
 *    doing it in the application would incur an extra system call
 *    (splice in + splice out, as compared to just sendfile()). So this helper
 *    can splice directly through a process-private pipe.
 *
 */
long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
		      loff_t *opos, size_t len, unsigned int flags)
{
	struct splice_desc sd = {
		.len		= len,
		.total_len	= len,
		.flags		= flags,
		.pos		= *ppos,
		.u.file		= out,
		.opos		= opos,
	};
	long ret;

	if (unlikely(!(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	if (unlikely(out->f_flags & O_APPEND))
		return -EINVAL;

	ret = rw_verify_area(WRITE, out, opos, len);
	if (unlikely(ret < 0))
		return ret;

	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
	if (ret > 0)
		*ppos = sd.pos;

	return ret;
}

static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
			       struct pipe_inode_info *opipe,
			       size_t len, unsigned int flags);
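
/*
 * Illustrative summary of the fd/offset rules enforced below: splice(2)
 * needs at least one pipe, and an offset pointer is only legal for the
 * non-pipe side, e.g. from userspace:
 *
 *	splice(pipe_rd, NULL, file_fd, &off, len, 0);	// pipe -> file: ok
 *	splice(file_fd, &off, pipe_wr, NULL, len, 0);	// file -> pipe: ok
 *	splice(pipe_rd, &off, file_fd, NULL, len, 0);	// -ESPIPE
 *	splice(pipe_rd, NULL, pipe_wr, NULL, len, 0);	// pipe -> pipe: ok
 */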

/*
 * Determine where to splice to/from.
 */
static long do_splice(struct file *in, loff_t __user *off_in,
		      struct file *out, loff_t __user *off_out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *ipipe;
	struct pipe_inode_info *opipe;
	loff_t offset;
	long ret;

	ipipe = get_pipe_info(in);
	opipe = get_pipe_info(out);

	if (ipipe && opipe) {
		if (off_in || off_out)
			return -ESPIPE;

		if (!(in->f_mode & FMODE_READ))
			return -EBADF;

		if (!(out->f_mode & FMODE_WRITE))
			return -EBADF;

		/* Splicing to self would be fun, but... */
		if (ipipe == opipe)
			return -EINVAL;

		return splice_pipe_to_pipe(ipipe, opipe, len, flags);
	}

	if (ipipe) {
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			if (!(out->f_mode & FMODE_PWRITE))
				return -EINVAL;
			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
				return -EFAULT;
		} else {
			offset = out->f_pos;
		}

		if (unlikely(!(out->f_mode & FMODE_WRITE)))
			return -EBADF;

		if (unlikely(out->f_flags & O_APPEND))
			return -EINVAL;

		ret = rw_verify_area(WRITE, out, &offset, len);
		if (unlikely(ret < 0))
			return ret;

		file_start_write(out);
		ret = do_splice_from(ipipe, out, &offset, len, flags);
		file_end_write(out);

		if (!off_out)
			out->f_pos = offset;
		else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	if (opipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			if (!(in->f_mode & FMODE_PREAD))
				return -EINVAL;
			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
				return -EFAULT;
		} else {
			offset = in->f_pos;
		}

		ret = do_splice_to(in, &offset, opipe, len, flags);

		if (!off_in)
			in->f_pos = offset;
		else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	return -EINVAL;
}
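
/*
 * Worked example (illustrative) for the mapping done below: an iovec
 * with base 0x1000234 and len 5000 on a 4K-page system gives
 * off = 0x234 (564) and npages = (564 + 5000 + 4095) >> 12 = 2, with
 * partial entries { .offset = 564, .len = 3532 } for the first page and
 * { .offset = 0, .len = 1468 } for the second.
 */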

/*
 * Map an iov into an array of pages and offset/length tuples. With the
 * partial_page structure, we can map several non-contiguous ranges into
 * our one pages[] map instead of splitting that operation into pieces.
 * Could easily be exported as a generic helper for other users, in which
 * case one would probably want to add a 'max_nr_pages' parameter as well.
 */
static int get_iovec_page_array(const struct iovec __user *iov,
				unsigned int nr_vecs, struct page **pages,
				struct partial_page *partial, bool aligned,
				unsigned int pipe_buffers)
{
	int buffers = 0, error = 0;

	while (nr_vecs) {
		unsigned long off, npages;
		struct iovec entry;
		void __user *base;
		size_t len;
		int i;

		error = -EFAULT;
		if (copy_from_user(&entry, iov, sizeof(entry)))
			break;

		base = entry.iov_base;
		len = entry.iov_len;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		error = 0;
		if (unlikely(!len))
			break;
		error = -EFAULT;
		if (!access_ok(VERIFY_READ, base, len))
			break;

		/*
		 * Get this base offset and number of pages, then map
		 * in the user pages.
		 */
		off = (unsigned long) base & ~PAGE_MASK;

		/*
		 * If asked for alignment, the offset must be zero and the
		 * length a multiple of the PAGE_SIZE.
		 */
		error = -EINVAL;
		if (aligned && (off || len & ~PAGE_MASK))
			break;

		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (npages > pipe_buffers - buffers)
			npages = pipe_buffers - buffers;

		error = get_user_pages_fast((unsigned long)base, npages,
					0, &pages[buffers]);

		if (unlikely(error <= 0))
			break;

		/*
		 * Fill this contiguous range into the partial page map.
		 */
		for (i = 0; i < error; i++) {
			const int plen = min_t(size_t, len, PAGE_SIZE - off);

			partial[buffers].offset = off;
			partial[buffers].len = plen;

			off = 0;
			len -= plen;
			buffers++;
		}

		/*
		 * We didn't complete this iov, stop here since it probably
		 * means we have to move some of this into a pipe to
		 * be able to continue.
		 */
		if (len)
			break;

		/*
		 * Don't continue if we mapped fewer pages than we asked for,
		 * or if we mapped the max number of pages that we have
		 * room for.
		 */
		if (error < npages || buffers == pipe_buffers)
			break;

		nr_vecs--;
		iov++;
	}

	if (buffers)
		return buffers;

	return error;
}

static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
	return n == sd->len ? n : -EFAULT;
}

/*
 * For lack of a better implementation, implement vmsplice() to userspace
 * as a simple copy of the pipe's pages to the user iov.
 */
static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct splice_desc sd;
	long ret;
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	ssize_t count = 0;

	pipe = get_pipe_info(file);
	if (!pipe)
		return -EBADF;

	ret = rw_copy_check_uvector(READ, uiov, nr_segs,
				    ARRAY_SIZE(iovstack), iovstack, &iov);
	if (ret <= 0)
		return ret;

	count = ret;
	iov_iter_init(&iter, iov, nr_segs, count, 0);

	sd.len = 0;
	sd.total_len = count;
	sd.flags = flags;
	sd.u.data = &iter;
	sd.pos = 0;

	pipe_lock(pipe);
	ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
	pipe_unlock(pipe);

	if (iov != iovstack)
		kfree(iov);

	return ret;
}

/*
 * vmsplice splices a user address range into a pipe. It can be thought of
 * as splice-from-memory, where the regular splice is splice-from-file (or
 * to file). In both cases the output is a pipe, naturally.
 */
static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct page *pages[PIPE_DEF_BUFFERS];
	struct partial_page partial[PIPE_DEF_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages = pages,
		.partial = partial,
		.nr_pages_max = PIPE_DEF_BUFFERS,
		.flags = flags,
		.ops = &user_page_pipe_buf_ops,
		.spd_release = spd_release_page,
	};
	long ret;

	pipe = get_pipe_info(file);
	if (!pipe)
		return -EBADF;

	if (splice_grow_spd(pipe, &spd))
		return -ENOMEM;

	spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages,
					    spd.partial, false,
					    spd.nr_pages_max);
	if (spd.nr_pages <= 0)
		ret = spd.nr_pages;
	else
		ret = splice_to_pipe(pipe, &spd);

	splice_shrink_spd(&spd);
	return ret;
}
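
/*
 * Illustrative userspace sketch (error handling elided): feeding two
 * scattered buffers into a pipe in one vmsplice(2) call:
 *
 *	struct iovec iov[2] = {
 *		{ .iov_base = hdr,  .iov_len = hdr_len  },
 *		{ .iov_base = body, .iov_len = body_len },
 *	};
 *	ssize_t n = vmsplice(pipe_wr_fd, iov, 2, 0);
 *
 * The pages backing hdr/body are mapped straight into the pipe by
 * get_iovec_page_array() above; no data is copied on this side.
 */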

/*
 * Note that vmsplice only really supports true splicing _from_ user memory
 * to a pipe, not the other way around. Splicing from user memory is a simple
 * operation that can be supported without any funky alignment restrictions
 * or nasty vm tricks. We simply map in the user memory and fill it into
 * a pipe. The reverse isn't quite as easy, though. There are two possible
 * solutions for that:
 *
 *	- memcpy() the data internally, at which point we might as well just
 *	  do a regular read() on the buffer anyway.
 *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
 *	  has restrictions on both ends of the pipe).
 *
 * Currently we punt and implement it as a normal copy, see pipe_to_user().
 *
 */
SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
		unsigned long, nr_segs, unsigned int, flags)
{
	struct fd f;
	long error;

	if (unlikely(nr_segs > UIO_MAXIOV))
		return -EINVAL;
	else if (unlikely(!nr_segs))
		return 0;

	error = -EBADF;
	f = fdget(fd);
	if (f.file) {
		if (f.file->f_mode & FMODE_WRITE)
			error = vmsplice_to_pipe(f.file, iov, nr_segs, flags);
		else if (f.file->f_mode & FMODE_READ)
			error = vmsplice_to_user(f.file, iov, nr_segs, flags);

		fdput(f);
	}

	return error;
}

#ifdef CONFIG_COMPAT
COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32,
		    unsigned int, nr_segs, unsigned int, flags)
{
	unsigned i;
	struct iovec __user *iov;
	if (nr_segs > UIO_MAXIOV)
		return -EINVAL;
	iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec));
	for (i = 0; i < nr_segs; i++) {
		struct compat_iovec v;
		if (get_user(v.iov_base, &iov32[i].iov_base) ||
		    get_user(v.iov_len, &iov32[i].iov_len) ||
		    put_user(compat_ptr(v.iov_base), &iov[i].iov_base) ||
		    put_user(v.iov_len, &iov[i].iov_len))
			return -EFAULT;
	}
	return sys_vmsplice(fd, iov, nr_segs, flags);
}
#endif

SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
		int, fd_out, loff_t __user *, off_out,
		size_t, len, unsigned int, flags)
{
	struct fd in, out;
	long error;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fdget(fd_in);
	if (in.file) {
		if (in.file->f_mode & FMODE_READ) {
			out = fdget(fd_out);
			if (out.file) {
				if (out.file->f_mode & FMODE_WRITE)
					error = do_splice(in.file, off_in,
							  out.file, off_out,
							  len, flags);
				fdput(out);
			}
		}
		fdput(in);
	}
	return error;
}

/*
 * Make sure there's data to read. Wait for input if we can, otherwise
 * return an appropriate error.
 */
static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is ok.
	 */
	if (pipe->nrbufs)
		return 0;

	ret = 0;
	pipe_lock(pipe);

	while (!pipe->nrbufs) {
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		pipe_wait(pipe);
	}

	pipe_unlock(pipe);
	return ret;
}

/*
 * Make sure there's writeable room. Wait for room if we can, otherwise
 * return an appropriate error.
 */
static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
{
	int ret;

	/*
	 * Check ->nrbufs without the inode lock first. This function
	 * is speculative anyway, so missing one is ok.
	 */
	if (pipe->nrbufs < pipe->buffers)
		return 0;

	ret = 0;
	pipe_lock(pipe);

	while (pipe->nrbufs >= pipe->buffers) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			ret = -EPIPE;
			break;
		}
		if (flags & SPLICE_F_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	pipe_unlock(pipe);
	return ret;
}

/*
 * Splice contents of ipipe to opipe.
 */
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
			       struct pipe_inode_info *opipe,
			       size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	int ret = 0, nbuf;
	bool input_wakeup = false;

retry:
	ret = ipipe_prep(ipipe, flags);
	if (ret)
		return ret;

	ret = opipe_prep(opipe, flags);
	if (ret)
		return ret;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by pipe info address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	pipe_double_lock(ipipe, opipe);

	do {
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		if (!ipipe->nrbufs && !ipipe->writers)
			break;

		/*
		 * Cannot make any progress, because either the input
		 * pipe is empty or the output pipe is full.
		 */
		if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
			/* Already processed some buffers, break */
			if (ret)
				break;

			if (flags & SPLICE_F_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}

			/*
			 * We raced with another reader/writer and haven't
			 * managed to process any buffers. A zero return
			 * value means EOF, so retry instead.
			 */
			pipe_unlock(ipipe);
			pipe_unlock(opipe);
			goto retry;
		}

		ibuf = ipipe->bufs + ipipe->curbuf;
		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
		obuf = opipe->bufs + nbuf;

		if (len >= ibuf->len) {
			/*
			 * Simply move the whole buffer from ipipe to opipe
			 */
			*obuf = *ibuf;
			ibuf->ops = NULL;
			opipe->nrbufs++;
			ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
			ipipe->nrbufs--;
			input_wakeup = true;
		} else {
			/*
			 * Get a reference to this pipe buffer,
			 * so we can copy the contents over.
			 */
			ibuf->ops->get(ipipe, ibuf);
			*obuf = *ibuf;

			/*
			 * Don't inherit the gift flag, we need to
			 * prevent multiple steals of this page.
			 */
			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;

			obuf->len = len;
			opipe->nrbufs++;
			ibuf->offset += obuf->len;
			ibuf->len -= obuf->len;
		}
		ret += obuf->len;
		len -= obuf->len;
	} while (len);

	pipe_unlock(ipipe);
	pipe_unlock(opipe);

	/*
	 * If we put data in the output pipe, wake up any potential readers.
	 */
	if (ret > 0)
		wakeup_pipe_readers(opipe);

	if (input_wakeup)
		wakeup_pipe_writers(ipipe);

	return ret;
}
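
/*
 * Illustrative userspace sketch (error handling elided): tee(2), which
 * is built on link_pipe() below, duplicates pipe contents without
 * consuming them, so the same data can still be spliced onward:
 *
 *	ssize_t n = tee(pipe_in_rd, pipe_out_wr, len, SPLICE_F_NONBLOCK);
 *	if (n > 0)
 *		splice(pipe_in_rd, NULL, file_fd, NULL, n, SPLICE_F_MOVE);
 */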

/*
 * Link contents of ipipe to opipe.
 */
static int link_pipe(struct pipe_inode_info *ipipe,
		     struct pipe_inode_info *opipe,
		     size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	int ret = 0, i = 0, nbuf;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by pipe info address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	pipe_double_lock(ipipe, opipe);

	do {
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		/*
		 * If we have iterated all input buffers or run out of
		 * output room, break.
		 */
		if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
			break;

		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);

		/*
		 * Get a reference to this pipe buffer,
		 * so we can copy the contents over.
		 */
		ibuf->ops->get(ipipe, ibuf);

		obuf = opipe->bufs + nbuf;
		*obuf = *ibuf;

		/*
		 * Don't inherit the gift flag, we need to
		 * prevent multiple steals of this page.
		 */
		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;

		if (obuf->len > len)
			obuf->len = len;

		opipe->nrbufs++;
		ret += obuf->len;
		len -= obuf->len;
		i++;
	} while (len);

	/*
	 * Return EAGAIN if we have the potential of some data in the
	 * future, otherwise just return 0.
	 */
	if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
		ret = -EAGAIN;

	pipe_unlock(ipipe);
	pipe_unlock(opipe);

	/*
	 * If we put data in the output pipe, wake up any potential readers.
	 */
	if (ret > 0)
		wakeup_pipe_readers(opipe);

	return ret;
}

/*
 * This is a tee(1) implementation that works on pipes. It doesn't copy
 * any data, it simply references the 'in' pages on the 'out' pipe.
 * The 'flags' used are the SPLICE_F_* variants, currently the only
 * applicable one is SPLICE_F_NONBLOCK.
 */
static long do_tee(struct file *in, struct file *out, size_t len,
		   unsigned int flags)
{
	struct pipe_inode_info *ipipe = get_pipe_info(in);
	struct pipe_inode_info *opipe = get_pipe_info(out);
	int ret = -EINVAL;

	/*
	 * Duplicate the contents of ipipe to opipe without actually
	 * copying the data.
	 */
	if (ipipe && opipe && ipipe != opipe) {
		/*
		 * Keep going, unless we encounter an error. The ipipe/opipe
		 * ordering doesn't really matter.
		 */
		ret = ipipe_prep(ipipe, flags);
		if (!ret) {
			ret = opipe_prep(opipe, flags);
			if (!ret)
				ret = link_pipe(ipipe, opipe, len, flags);
		}
	}

	return ret;
}

SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
{
	struct fd in;
	int error;

	if (unlikely(!len))
		return 0;

	error = -EBADF;
	in = fdget(fdin);
	if (in.file) {
		if (in.file->f_mode & FMODE_READ) {
			struct fd out = fdget(fdout);
			if (out.file) {
				if (out.file->f_mode & FMODE_WRITE)
					error = do_tee(in.file, out.file,
						       len, flags);
				fdput(out);
			}
		}
		fdput(in);
	}

	return error;
}