xref: /openbmc/linux/fs/splice.c (revision 64c70b1c)
1 /*
2  * "splice": joining two ropes together by interweaving their strands.
3  *
4  * This is the "extended pipe" functionality, where a pipe is used as
5  * an arbitrary in-memory buffer. Think of a pipe as a small kernel
6  * buffer that you can use to transfer data from one end to the other.
7  *
8  * The traditional unix read/write is extended with a "splice()" operation
9  * that transfers data buffers to or from a pipe buffer.
10  *
11  * Named by Larry McVoy, original implementation from Linus, extended by
12  * Jens to support splicing to files, network, direct splicing, etc and
13  * fixing lots of bugs.
14  *
15  * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
16  * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
17  * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
18  *
19  */
20 #include <linux/fs.h>
21 #include <linux/file.h>
22 #include <linux/pagemap.h>
23 #include <linux/splice.h>
24 #include <linux/mm_inline.h>
25 #include <linux/swap.h>
26 #include <linux/writeback.h>
27 #include <linux/buffer_head.h>
28 #include <linux/module.h>
29 #include <linux/syscalls.h>
30 #include <linux/uio.h>
31 
32 /*
33  * Attempt to steal a page from a pipe buffer. This should perhaps go into
34  * a VM helper function; it's already been simplified quite a bit by the
35  * addition of remove_mapping(). If success is returned, the caller may
36  * attempt to reuse this page for another destination.
37  */
38 static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
39 				     struct pipe_buffer *buf)
40 {
41 	struct page *page = buf->page;
42 	struct address_space *mapping;
43 
44 	lock_page(page);
45 
46 	mapping = page_mapping(page);
47 	if (mapping) {
48 		WARN_ON(!PageUptodate(page));
49 
50 		/*
51 		 * At least for ext2 with nobh option, we need to wait on
52 		 * writeback completing on this page, since we'll remove it
53  * from the pagecache.  Otherwise truncate won't wait on the
54 		 * page, allowing the disk blocks to be reused by someone else
55 		 * before we actually wrote our data to them. fs corruption
56 		 * ensues.
57 		 */
58 		wait_on_page_writeback(page);
59 
60 		if (PagePrivate(page))
61 			try_to_release_page(page, GFP_KERNEL);
62 
63 		/*
64 		 * If we succeeded in removing the mapping, set LRU flag
65 		 * and return good.
66 		 */
67 		if (remove_mapping(mapping, page)) {
68 			buf->flags |= PIPE_BUF_FLAG_LRU;
69 			return 0;
70 		}
71 	}
72 
73 	/*
74 	 * Raced with truncate or failed to remove page from current
75 	 * address space, unlock and return failure.
76 	 */
77 	unlock_page(page);
78 	return 1;
79 }
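
/*
 * Illustrative sketch (not part of this file): how a consumer of a pipe
 * buffer might honour the ->steal() contract above. On success (0) the
 * consumer owns the now-locked page and may install it elsewhere; on
 * failure it must fall back to copying. install_page_elsewhere() and
 * copy_the_data_instead() are hypothetical helpers:
 *
 *	err = buf->ops->steal(pipe, buf);
 *	if (!err)
 *		install_page_elsewhere(buf->page);
 *	else
 *		copy_the_data_instead(buf);
 */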
80 
81 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
82 					struct pipe_buffer *buf)
83 {
84 	page_cache_release(buf->page);
85 	buf->flags &= ~PIPE_BUF_FLAG_LRU;
86 }
87 
88 /*
89  * Check whether the contents of buf are OK to access. Since the content
90  * is a page cache page, IO may be in flight.
91  */
92 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
93 				       struct pipe_buffer *buf)
94 {
95 	struct page *page = buf->page;
96 	int err;
97 
98 	if (!PageUptodate(page)) {
99 		lock_page(page);
100 
101 		/*
102 		 * Page got truncated/unhashed. This will cause a 0-byte
103 		 * splice if this is the first page.
104 		 */
105 		if (!page->mapping) {
106 			err = -ENODATA;
107 			goto error;
108 		}
109 
110 		/*
111 		 * Uh oh, read-error from disk.
112 		 */
113 		if (!PageUptodate(page)) {
114 			err = -EIO;
115 			goto error;
116 		}
117 
118 		/*
119 		 * Page is OK after all, we are done.
120 		 */
121 		unlock_page(page);
122 	}
123 
124 	return 0;
125 error:
126 	unlock_page(page);
127 	return err;
128 }
129 
130 static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
131 	.can_merge = 0,
132 	.map = generic_pipe_buf_map,
133 	.unmap = generic_pipe_buf_unmap,
134 	.confirm = page_cache_pipe_buf_confirm,
135 	.release = page_cache_pipe_buf_release,
136 	.steal = page_cache_pipe_buf_steal,
137 	.get = generic_pipe_buf_get,
138 };
139 
140 static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
141 				    struct pipe_buffer *buf)
142 {
143 	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
144 		return 1;
145 
146 	buf->flags |= PIPE_BUF_FLAG_LRU;
147 	return generic_pipe_buf_steal(pipe, buf);
148 }
149 
150 static const struct pipe_buf_operations user_page_pipe_buf_ops = {
151 	.can_merge = 0,
152 	.map = generic_pipe_buf_map,
153 	.unmap = generic_pipe_buf_unmap,
154 	.confirm = generic_pipe_buf_confirm,
155 	.release = page_cache_pipe_buf_release,
156 	.steal = user_page_pipe_buf_steal,
157 	.get = generic_pipe_buf_get,
158 };
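
/*
 * Illustrative sketch (not part of this file): user pages may only be
 * stolen when userspace gifted them, which it signals by passing
 * SPLICE_F_GIFT to vmsplice(2) with a page-aligned, page-sized buffer.
 * 'buf' and 'pfd' below are placeholders:
 *
 *	struct iovec iov = { .iov_base = buf, .iov_len = 4096 };
 *
 *	vmsplice(pfd[1], &iov, 1, SPLICE_F_GIFT);
 *
 * Once gifted, userspace must not touch that memory again.
 */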
159 
160 /**
161  * splice_to_pipe - fill passed data into a pipe
162  * @pipe:	pipe to fill
163  * @spd:	data to fill
164  *
165  * Description:
166  *    @spd contains a map of pages and len/offset tuples, along with
167  *    the struct pipe_buf_operations associated with these pages. This
168  *    function will link that data to the pipe.
169  *
170  */
171 ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
172 		       struct splice_pipe_desc *spd)
173 {
174 	unsigned int spd_pages = spd->nr_pages;
175 	int ret, do_wakeup, page_nr;
176 
177 	ret = 0;
178 	do_wakeup = 0;
179 	page_nr = 0;
180 
181 	if (pipe->inode)
182 		mutex_lock(&pipe->inode->i_mutex);
183 
184 	for (;;) {
185 		if (!pipe->readers) {
186 			send_sig(SIGPIPE, current, 0);
187 			if (!ret)
188 				ret = -EPIPE;
189 			break;
190 		}
191 
192 		if (pipe->nrbufs < PIPE_BUFFERS) {
193 			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
194 			struct pipe_buffer *buf = pipe->bufs + newbuf;
195 
196 			buf->page = spd->pages[page_nr];
197 			buf->offset = spd->partial[page_nr].offset;
198 			buf->len = spd->partial[page_nr].len;
199 			buf->private = spd->partial[page_nr].private;
200 			buf->ops = spd->ops;
201 			if (spd->flags & SPLICE_F_GIFT)
202 				buf->flags |= PIPE_BUF_FLAG_GIFT;
203 
204 			pipe->nrbufs++;
205 			page_nr++;
206 			ret += buf->len;
207 
208 			if (pipe->inode)
209 				do_wakeup = 1;
210 
211 			if (!--spd->nr_pages)
212 				break;
213 			if (pipe->nrbufs < PIPE_BUFFERS)
214 				continue;
215 
216 			break;
217 		}
218 
219 		if (spd->flags & SPLICE_F_NONBLOCK) {
220 			if (!ret)
221 				ret = -EAGAIN;
222 			break;
223 		}
224 
225 		if (signal_pending(current)) {
226 			if (!ret)
227 				ret = -ERESTARTSYS;
228 			break;
229 		}
230 
231 		if (do_wakeup) {
232 			smp_mb();
233 			if (waitqueue_active(&pipe->wait))
234 				wake_up_interruptible_sync(&pipe->wait);
235 			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
236 			do_wakeup = 0;
237 		}
238 
239 		pipe->waiting_writers++;
240 		pipe_wait(pipe);
241 		pipe->waiting_writers--;
242 	}
243 
244 	if (pipe->inode) {
245 		mutex_unlock(&pipe->inode->i_mutex);
246 
247 		if (do_wakeup) {
248 			smp_mb();
249 			if (waitqueue_active(&pipe->wait))
250 				wake_up_interruptible(&pipe->wait);
251 			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
252 		}
253 	}
254 
255 	while (page_nr < spd_pages)
256 		page_cache_release(spd->pages[page_nr++]);
257 
258 	return ret;
259 }
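
/*
 * Illustrative sketch (not part of this file): a typical caller fills the
 * parallel pages[]/partial[] arrays and hands them to splice_to_pipe(),
 * much as __generic_file_splice_read() below does. 'nr' and 'flags' are
 * placeholders:
 *
 *	struct page *pages[PIPE_BUFFERS];
 *	struct partial_page partial[PIPE_BUFFERS];
 *	struct splice_pipe_desc spd = {
 *		.pages		= pages,
 *		.partial	= partial,
 *		.nr_pages	= nr,
 *		.flags		= flags,
 *		.ops		= &page_cache_pipe_buf_ops,
 *	};
 *
 *	ret = splice_to_pipe(pipe, &spd);
 */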
260 
261 static int
262 __generic_file_splice_read(struct file *in, loff_t *ppos,
263 			   struct pipe_inode_info *pipe, size_t len,
264 			   unsigned int flags)
265 {
266 	struct address_space *mapping = in->f_mapping;
267 	unsigned int loff, nr_pages;
268 	struct page *pages[PIPE_BUFFERS];
269 	struct partial_page partial[PIPE_BUFFERS];
270 	struct page *page;
271 	pgoff_t index, end_index;
272 	loff_t isize;
273 	int error, page_nr;
274 	struct splice_pipe_desc spd = {
275 		.pages = pages,
276 		.partial = partial,
277 		.flags = flags,
278 		.ops = &page_cache_pipe_buf_ops,
279 	};
280 
281 	index = *ppos >> PAGE_CACHE_SHIFT;
282 	loff = *ppos & ~PAGE_CACHE_MASK;
283 	nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
284 
285 	if (nr_pages > PIPE_BUFFERS)
286 		nr_pages = PIPE_BUFFERS;
287 
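	/*
	 * Worked example of the arithmetic above (illustrative): with 4k
	 * pages, *ppos = 5000 and len = 10000 give index = 1, loff = 904
	 * and nr_pages = (10000 + 904 + 4095) >> 12 = 3 pages to look up.
	 */
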
288 	/*
289 	 * Don't try to second-guess the read-ahead logic, call into
290 	 * page_cache_readahead() like the page cache reads would do.
291 	 */
292 	page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages);
293 
294 	/*
295 	 * Lookup the (hopefully) full range of pages we need.
296 	 */
297 	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages);
298 
299 	/*
300 	 * If find_get_pages_contig() returned fewer pages than we needed,
301 	 * allocate the rest and fill in the holes.
302 	 */
303 	error = 0;
304 	index += spd.nr_pages;
305 	while (spd.nr_pages < nr_pages) {
306 		/*
307 		 * The page could be there; find_get_pages_contig() stops at
308 		 * the first hole.
309 		 */
310 		page = find_get_page(mapping, index);
311 		if (!page) {
312 			/*
313 			 * Make sure the read-ahead engine is notified
314 			 * about this failure.
315 			 */
316 			handle_ra_miss(mapping, &in->f_ra, index);
317 
318 			/*
319 			 * page didn't exist, allocate one.
320 			 */
321 			page = page_cache_alloc_cold(mapping);
322 			if (!page)
323 				break;
324 
325 			error = add_to_page_cache_lru(page, mapping, index,
326 					      GFP_KERNEL);
327 			if (unlikely(error)) {
328 				page_cache_release(page);
329 				if (error == -EEXIST)
330 					continue;
331 				break;
332 			}
333 			/*
334 			 * add_to_page_cache() locks the page, unlock it
335 			 * to avoid convoluting the logic below even more.
336 			 */
337 			unlock_page(page);
338 		}
339 
340 		pages[spd.nr_pages++] = page;
341 		index++;
342 	}
343 
344 	/*
345 	 * Now loop over the map and see if we need to start IO on any
346 	 * pages, fill in the partial map, etc.
347 	 */
348 	index = *ppos >> PAGE_CACHE_SHIFT;
349 	nr_pages = spd.nr_pages;
350 	spd.nr_pages = 0;
351 	for (page_nr = 0; page_nr < nr_pages; page_nr++) {
352 		unsigned int this_len;
353 
354 		if (!len)
355 			break;
356 
357 		/*
358 		 * this_len is the max we'll use from this page
359 		 */
360 		this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
361 		page = pages[page_nr];
362 
363 		/*
364 		 * If the page isn't uptodate, we may need to start io on it
365 		 */
366 		if (!PageUptodate(page)) {
367 			/*
368 			 * If in nonblock mode then don't block on waiting
369 			 * for an in-flight I/O page
370 			 */
371 			if (flags & SPLICE_F_NONBLOCK) {
372 				if (TestSetPageLocked(page))
373 					break;
374 			} else
375 				lock_page(page);
376 
377 			/*
378 			 * Page was truncated, stop here. If this isn't the
379 			 * first page, we'll just complete what we already
380 			 * added
381 			 */
382 			if (!page->mapping) {
383 				unlock_page(page);
384 				break;
385 			}
386 			/*
387 			 * Page was already under I/O and is now done, great
388 			 */
389 			if (PageUptodate(page)) {
390 				unlock_page(page);
391 				goto fill_it;
392 			}
393 
394 			/*
395 			 * need to read in the page
396 			 */
397 			error = mapping->a_ops->readpage(in, page);
398 			if (unlikely(error)) {
399 				/*
400 				 * We really should re-lookup the page here,
401 				 * but it complicates things a lot. Instead
402 				 * lets just do what we already stored, and
403 				 * we'll get it the next time we are called.
404 				 */
405 				if (error == AOP_TRUNCATED_PAGE)
406 					error = 0;
407 
408 				break;
409 			}
410 		}
411 fill_it:
412 		/*
413 		 * i_size must be checked after PageUptodate.
414 		 */
415 		isize = i_size_read(mapping->host);
416 		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
417 		if (unlikely(!isize || index > end_index))
418 			break;
419 
420 		/*
421 		 * if this is the last page, see if we need to shrink
422 		 * the length and stop
423 		 */
424 		if (end_index == index) {
425 			unsigned int plen;
426 
427 			/*
428 			 * max good bytes in this page
429 			 */
430 			plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
431 			if (plen <= loff)
432 				break;
433 
434 			/*
435 			 * force quit after adding this page
436 			 */
437 			this_len = min(this_len, plen - loff);
438 			len = this_len;
439 		}
440 
441 		partial[page_nr].offset = loff;
442 		partial[page_nr].len = this_len;
443 		len -= this_len;
444 		loff = 0;
445 		spd.nr_pages++;
446 		index++;
447 	}
448 
449 	/*
450 	 * Release any pages at the end, if we quit early. 'page_nr' is how far
451 	 * we got, 'nr_pages' is how many pages are in the map.
452 	 */
453 	while (page_nr < nr_pages)
454 		page_cache_release(pages[page_nr++]);
455 
456 	if (spd.nr_pages)
457 		return splice_to_pipe(pipe, &spd);
458 
459 	return error;
460 }
461 
462 /**
463  * generic_file_splice_read - splice data from file to a pipe
464  * @in:		file to splice from
465  * @ppos:	position in @in
466  * @pipe:	pipe to splice to
467  * @len:	number of bytes to splice
468  * @flags:	splice modifier flags
469  *
470  * Description:
471  *    Will read pages from the given file and fill them into a pipe. Can be
472  *    used as long as the address_space operations for the source implement
473  *    a readpage() hook.
474  *
475  */
476 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
477 				 struct pipe_inode_info *pipe, size_t len,
478 				 unsigned int flags)
479 {
480 	ssize_t spliced;
481 	int ret;
482 	loff_t isize, left;
483 
484 	isize = i_size_read(in->f_mapping->host);
485 	if (unlikely(*ppos >= isize))
486 		return 0;
487 
488 	left = isize - *ppos;
489 	if (unlikely(left < len))
490 		len = left;
491 
492 	ret = 0;
493 	spliced = 0;
494 	while (len) {
495 		ret = __generic_file_splice_read(in, ppos, pipe, len, flags);
496 
497 		if (ret < 0)
498 			break;
499 		else if (!ret) {
500 			if (spliced)
501 				break;
502 			if (flags & SPLICE_F_NONBLOCK) {
503 				ret = -EAGAIN;
504 				break;
505 			}
506 		}
507 
508 		*ppos += ret;
509 		len -= ret;
510 		spliced += ret;
511 	}
512 
513 	if (spliced)
514 		return spliced;
515 
516 	return ret;
517 }
518 
519 EXPORT_SYMBOL(generic_file_splice_read);
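
/*
 * Illustrative userspace sketch (not part of this file): moving file data
 * into a pipe with splice(2), which lands in generic_file_splice_read()
 * for most filesystems. 'file_fd' is a placeholder and error handling is
 * omitted:
 *
 *	int pfd[2];
 *	loff_t off = 0;
 *
 *	pipe(pfd);
 *	splice(file_fd, &off, pfd[1], NULL, 65536, 0);
 */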
520 
521 /*
522  * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
523  * using sendpage(). Return the number of bytes sent.
524  */
525 static int pipe_to_sendpage(struct pipe_inode_info *pipe,
526 			    struct pipe_buffer *buf, struct splice_desc *sd)
527 {
528 	struct file *file = sd->u.file;
529 	loff_t pos = sd->pos;
530 	int ret, more;
531 
532 	ret = buf->ops->confirm(pipe, buf);
533 	if (!ret) {
534 		more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len;
535 
536 		ret = file->f_op->sendpage(file, buf->page, buf->offset,
537 					   sd->len, &pos, more);
538 	}
539 
540 	return ret;
541 }
542 
543 /*
544  * This is a little more tricky than the file -> pipe splicing. There are
545  * basically three cases:
546  *
547  *	- Destination page already exists in the address space and there
548  *	  are users of it. For that case we have no option other than
549  *	  copying the data. Tough luck.
550  *	- Destination page already exists in the address space, but there
551  *	  are no users of it. Make sure it's uptodate, then drop it. Fall
552  *	  through to last case.
553  *	- Destination page does not exist, we can add the pipe page to
554  *	  the page cache and avoid the copy.
555  *
556  * If asked to move pages to the output file (SPLICE_F_MOVE is set in
557  * sd->flags), we attempt to migrate pages from the pipe to the output
558  * file address space page cache. This is possible if no one else has
559  * the pipe page referenced outside of the pipe and page cache. If
560  * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
561  * a new page in the output file page cache and fill/dirty that.
562  */
563 static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
564 			struct splice_desc *sd)
565 {
566 	struct file *file = sd->u.file;
567 	struct address_space *mapping = file->f_mapping;
568 	unsigned int offset, this_len;
569 	struct page *page;
570 	pgoff_t index;
571 	int ret;
572 
573 	/*
574 	 * make sure the data in this buffer is uptodate
575 	 */
576 	ret = buf->ops->confirm(pipe, buf);
577 	if (unlikely(ret))
578 		return ret;
579 
580 	index = sd->pos >> PAGE_CACHE_SHIFT;
581 	offset = sd->pos & ~PAGE_CACHE_MASK;
582 
583 	this_len = sd->len;
584 	if (this_len + offset > PAGE_CACHE_SIZE)
585 		this_len = PAGE_CACHE_SIZE - offset;
586 
587 find_page:
588 	page = find_lock_page(mapping, index);
589 	if (!page) {
590 		ret = -ENOMEM;
591 		page = page_cache_alloc_cold(mapping);
592 		if (unlikely(!page))
593 			goto out_ret;
594 
595 		/*
596 		 * This will also lock the page
597 		 */
598 		ret = add_to_page_cache_lru(page, mapping, index,
599 					    GFP_KERNEL);
600 		if (unlikely(ret))
601 			goto out;
602 	}
603 
604 	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
605 	if (unlikely(ret)) {
606 		loff_t isize = i_size_read(mapping->host);
607 
608 		if (ret != AOP_TRUNCATED_PAGE)
609 			unlock_page(page);
610 		page_cache_release(page);
611 		if (ret == AOP_TRUNCATED_PAGE)
612 			goto find_page;
613 
614 		/*
615 		 * prepare_write() may have instantiated a few blocks
616 		 * outside i_size.  Trim these off again.
617 		 */
618 		if (sd->pos + this_len > isize)
619 			vmtruncate(mapping->host, isize);
620 
621 		goto out_ret;
622 	}
623 
624 	if (buf->page != page) {
625 		/*
626 		 * Careful, ->map() uses KM_USER0!
627 		 */
628 		char *src = buf->ops->map(pipe, buf, 1);
629 		char *dst = kmap_atomic(page, KM_USER1);
630 
631 		memcpy(dst + offset, src + buf->offset, this_len);
632 		flush_dcache_page(page);
633 		kunmap_atomic(dst, KM_USER1);
634 		buf->ops->unmap(pipe, buf, src);
635 	}
636 
637 	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
638 	if (ret) {
639 		if (ret == AOP_TRUNCATED_PAGE) {
640 			page_cache_release(page);
641 			goto find_page;
642 		}
643 		if (ret < 0)
644 			goto out;
645 		/*
646 		 * A partial write has happened, so 'ret' is already initialized
647 		 * to the number of bytes written; there is nothing we have to do here.
648 		 */
649 	} else
650 		ret = this_len;
651 	/*
652 	 * Return the number of bytes written and mark page as
653 	 * accessed, we are now done!
654 	 */
655 	mark_page_accessed(page);
656 out:
657 	unlock_page(page);
658 	page_cache_release(page);
659 out_ret:
660 	return ret;
661 }
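
/*
 * Illustrative userspace sketch (not part of this file): the move/copy
 * decision above is driven by SPLICE_F_MOVE on the pipe -> file splice;
 * without it, or when the page cannot be stolen, pipe_to_file() takes
 * the memcpy path. 'pfd', 'out_fd' and 'out_off' are placeholders:
 *
 *	splice(pfd[0], NULL, out_fd, &out_off, 65536, SPLICE_F_MOVE);
 */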
662 
663 /**
664  * __splice_from_pipe - splice data from a pipe to given actor
665  * @pipe:	pipe to splice from
666  * @sd:		information for @actor
667  * @actor:	handler that splices the data
668  *
669  * Description:
670  *    This function does little more than loop over the pipe and call
671  *    @actor to do the actual moving of a single struct pipe_buffer to
672  *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
673  *    pipe_to_user.
674  *
675  */
676 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
677 			   splice_actor *actor)
678 {
679 	int ret, do_wakeup, err;
680 
681 	ret = 0;
682 	do_wakeup = 0;
683 
684 	for (;;) {
685 		if (pipe->nrbufs) {
686 			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
687 			const struct pipe_buf_operations *ops = buf->ops;
688 
689 			sd->len = buf->len;
690 			if (sd->len > sd->total_len)
691 				sd->len = sd->total_len;
692 
693 			err = actor(pipe, buf, sd);
694 			if (err <= 0) {
695 				if (!ret && err != -ENODATA)
696 					ret = err;
697 
698 				break;
699 			}
700 
701 			ret += err;
702 			buf->offset += err;
703 			buf->len -= err;
704 
705 			sd->len -= err;
706 			sd->pos += err;
707 			sd->total_len -= err;
708 			if (sd->len)
709 				continue;
710 
711 			if (!buf->len) {
712 				buf->ops = NULL;
713 				ops->release(pipe, buf);
714 				pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
715 				pipe->nrbufs--;
716 				if (pipe->inode)
717 					do_wakeup = 1;
718 			}
719 
720 			if (!sd->total_len)
721 				break;
722 		}
723 
724 		if (pipe->nrbufs)
725 			continue;
726 		if (!pipe->writers)
727 			break;
728 		if (!pipe->waiting_writers) {
729 			if (ret)
730 				break;
731 		}
732 
733 		if (sd->flags & SPLICE_F_NONBLOCK) {
734 			if (!ret)
735 				ret = -EAGAIN;
736 			break;
737 		}
738 
739 		if (signal_pending(current)) {
740 			if (!ret)
741 				ret = -ERESTARTSYS;
742 			break;
743 		}
744 
745 		if (do_wakeup) {
746 			smp_mb();
747 			if (waitqueue_active(&pipe->wait))
748 				wake_up_interruptible_sync(&pipe->wait);
749 			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
750 			do_wakeup = 0;
751 		}
752 
753 		pipe_wait(pipe);
754 	}
755 
756 	if (do_wakeup) {
757 		smp_mb();
758 		if (waitqueue_active(&pipe->wait))
759 			wake_up_interruptible(&pipe->wait);
760 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
761 	}
762 
763 	return ret;
764 }
765 EXPORT_SYMBOL(__splice_from_pipe);
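
/*
 * Illustrative sketch (not part of this file): a minimal actor suitable
 * for __splice_from_pipe(). It must confirm the buffer before touching
 * its contents, consume up to sd->len bytes and return how many it took;
 * my_consume() is a hypothetical helper:
 *
 *	static int my_actor(struct pipe_inode_info *pipe,
 *			    struct pipe_buffer *buf, struct splice_desc *sd)
 *	{
 *		int ret = buf->ops->confirm(pipe, buf);
 *
 *		if (ret)
 *			return ret;
 *
 *		return my_consume(buf->page, buf->offset, sd->len, sd);
 *	}
 */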
766 
767 /**
768  * splice_from_pipe - splice data from a pipe to a file
769  * @pipe:	pipe to splice from
770  * @out:	file to splice to
771  * @ppos:	position in @out
772  * @len:	how many bytes to splice
773  * @flags:	splice modifier flags
774  * @actor:	handler that splices the data
775  *
776  * Description:
777  *    See __splice_from_pipe. This function locks the input and output inodes;
778  *    otherwise it's identical to __splice_from_pipe().
779  *
780  */
781 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
782 			 loff_t *ppos, size_t len, unsigned int flags,
783 			 splice_actor *actor)
784 {
785 	ssize_t ret;
786 	struct inode *inode = out->f_mapping->host;
787 	struct splice_desc sd = {
788 		.total_len = len,
789 		.flags = flags,
790 		.pos = *ppos,
791 		.u.file = out,
792 	};
793 
794 	/*
795 	 * The actor worker might be calling ->prepare_write and
796 	 * ->commit_write. Most of the time, these expect i_mutex to
797 	 * be held. Since this may result in an ABBA deadlock with
798 	 * pipe->inode, we have to order lock acquisition here.
799 	 */
800 	inode_double_lock(inode, pipe->inode);
801 	ret = __splice_from_pipe(pipe, &sd, actor);
802 	inode_double_unlock(inode, pipe->inode);
803 
804 	return ret;
805 }
806 
807 /**
808  * generic_file_splice_write_nolock - generic_file_splice_write without mutexes
809  * @pipe:	pipe info
810  * @out:	file to write to
811  * @ppos:	position in @out
812  * @len:	number of bytes to splice
813  * @flags:	splice modifier flags
814  *
815  * Description:
816  *    Will either move or copy pages (determined by @flags options) from
817  *    the given pipe inode to the given file. The caller is responsible
818  *    for acquiring i_mutex on both inodes.
819  *
820  */
821 ssize_t
822 generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out,
823 				 loff_t *ppos, size_t len, unsigned int flags)
824 {
825 	struct address_space *mapping = out->f_mapping;
826 	struct inode *inode = mapping->host;
827 	struct splice_desc sd = {
828 		.total_len = len,
829 		.flags = flags,
830 		.pos = *ppos,
831 		.u.file = out,
832 	};
833 	ssize_t ret;
834 	int err;
835 
836 	err = remove_suid(out->f_path.dentry);
837 	if (unlikely(err))
838 		return err;
839 
840 	ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
841 	if (ret > 0) {
842 		unsigned long nr_pages;
843 
844 		*ppos += ret;
845 		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
846 
847 		/*
848 		 * If file or inode is SYNC and we actually wrote some data,
849 		 * sync it.
850 		 */
851 		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
852 			err = generic_osync_inode(inode, mapping,
853 						  OSYNC_METADATA|OSYNC_DATA);
854 
855 			if (err)
856 				ret = err;
857 		}
858 		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
859 	}
860 
861 	return ret;
862 }
863 
864 EXPORT_SYMBOL(generic_file_splice_write_nolock);
865 
866 /**
867  * generic_file_splice_write - splice data from a pipe to a file
868  * @pipe:	pipe info
869  * @out:	file to write to
870  * @ppos:	position in @out
871  * @len:	number of bytes to splice
872  * @flags:	splice modifier flags
873  *
874  * Description:
875  *    Will either move or copy pages (determined by @flags options) from
876  *    the given pipe inode to the given file.
877  *
878  */
879 ssize_t
880 generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
881 			  loff_t *ppos, size_t len, unsigned int flags)
882 {
883 	struct address_space *mapping = out->f_mapping;
884 	struct inode *inode = mapping->host;
885 	ssize_t ret;
886 	int err;
887 
888 	err = should_remove_suid(out->f_path.dentry);
889 	if (unlikely(err)) {
890 		mutex_lock(&inode->i_mutex);
891 		err = __remove_suid(out->f_path.dentry, err);
892 		mutex_unlock(&inode->i_mutex);
893 		if (err)
894 			return err;
895 	}
896 
897 	ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file);
898 	if (ret > 0) {
899 		unsigned long nr_pages;
900 
901 		*ppos += ret;
902 		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
903 
904 		/*
905 		 * If file or inode is SYNC and we actually wrote some data,
906 		 * sync it.
907 		 */
908 		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
909 			mutex_lock(&inode->i_mutex);
910 			err = generic_osync_inode(inode, mapping,
911 						  OSYNC_METADATA|OSYNC_DATA);
912 			mutex_unlock(&inode->i_mutex);
913 
914 			if (err)
915 				ret = err;
916 		}
917 		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
918 	}
919 
920 	return ret;
921 }
922 
923 EXPORT_SYMBOL(generic_file_splice_write);
924 
925 /**
926  * generic_splice_sendpage - splice data from a pipe to a socket
927  * @pipe:	pipe to splice from
928  * @out:	socket to write to
929  * @ppos:	position in @out
930  * @len:	number of bytes to splice
931  * @flags:	splice modifier flags
932  *
933  * Description:
934  *    Will send @len bytes from the pipe to a network socket. No data copying
935  *    is involved.
936  *
937  */
938 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
939 				loff_t *ppos, size_t len, unsigned int flags)
940 {
941 	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
942 }
943 
944 EXPORT_SYMBOL(generic_splice_sendpage);
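
/*
 * Illustrative userspace sketch (not part of this file): splicing pipe
 * contents straight to a connected TCP socket exercises this path and
 * pipe_to_sendpage() above. 'pfd' and 'sock_fd' are placeholders:
 *
 *	splice(pfd[0], NULL, sock_fd, NULL, 65536, SPLICE_F_MORE);
 */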
945 
946 /*
947  * Attempt to initiate a splice from pipe to file.
948  */
949 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
950 			   loff_t *ppos, size_t len, unsigned int flags)
951 {
952 	int ret;
953 
954 	if (unlikely(!out->f_op || !out->f_op->splice_write))
955 		return -EINVAL;
956 
957 	if (unlikely(!(out->f_mode & FMODE_WRITE)))
958 		return -EBADF;
959 
960 	ret = rw_verify_area(WRITE, out, ppos, len);
961 	if (unlikely(ret < 0))
962 		return ret;
963 
964 	return out->f_op->splice_write(pipe, out, ppos, len, flags);
965 }
966 
967 /*
968  * Attempt to initiate a splice from a file to a pipe.
969  */
970 static long do_splice_to(struct file *in, loff_t *ppos,
971 			 struct pipe_inode_info *pipe, size_t len,
972 			 unsigned int flags)
973 {
974 	int ret;
975 
976 	if (unlikely(!in->f_op || !in->f_op->splice_read))
977 		return -EINVAL;
978 
979 	if (unlikely(!(in->f_mode & FMODE_READ)))
980 		return -EBADF;
981 
982 	ret = rw_verify_area(READ, in, ppos, len);
983 	if (unlikely(ret < 0))
984 		return ret;
985 
986 	return in->f_op->splice_read(in, ppos, pipe, len, flags);
987 }
988 
989 /**
990  * splice_direct_to_actor - splices data directly between two non-pipes
991  * @in:		file to splice from
992  * @sd:		actor information on where to splice to
993  * @actor:	handles the data splicing
994  *
995  * Description:
996  *    This is a special case helper to splice directly between two
997  *    points, without requiring an explicit pipe. Internally an allocated
998  *    pipe is cached in the process, and reused during the lifetime of
999  *    that process.
1000  *
1001  */
1002 ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
1003 			       splice_direct_actor *actor)
1004 {
1005 	struct pipe_inode_info *pipe;
1006 	long ret, bytes;
1007 	umode_t i_mode;
1008 	size_t len;
1009 	int i, flags;
1010 
1011 	/*
1012 	 * We require the input to be a regular file, as we don't want to
1013 	 * randomly drop data for e.g. socket -> socket splicing. Use the
1014 	 * piped splicing for that!
1015 	 */
1016 	i_mode = in->f_path.dentry->d_inode->i_mode;
1017 	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
1018 		return -EINVAL;
1019 
1020 	/*
1021 	 * Neither in nor out is a pipe, set up an internal pipe attached to
1022 	 * 'out' and transfer the wanted data from 'in' to 'out' through that
1023 	 */
1024 	pipe = current->splice_pipe;
1025 	if (unlikely(!pipe)) {
1026 		pipe = alloc_pipe_info(NULL);
1027 		if (!pipe)
1028 			return -ENOMEM;
1029 
1030 		/*
1031 		 * We don't have an immediate reader, but we'll read the stuff
1032 		 * out of the pipe right after the splice_to_pipe(). So set
1033 		 * PIPE_READERS appropriately.
1034 		 * pipe->readers appropriately.
1035 		pipe->readers = 1;
1036 
1037 		current->splice_pipe = pipe;
1038 	}
1039 
1040 	/*
1041 	 * Do the splice.
1042 	 */
1043 	ret = 0;
1044 	bytes = 0;
1045 	len = sd->total_len;
1046 	flags = sd->flags;
1047 
1048 	/*
1049 	 * Don't block on output, we have to drain the direct pipe.
1050 	 */
1051 	sd->flags &= ~SPLICE_F_NONBLOCK;
1052 
1053 	while (len) {
1054 		size_t read_len, max_read_len;
1055 
1056 		/*
1057 		 * Do at most PIPE_BUFFERS pages worth of transfer:
1058 		 */
1059 		max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE));
1060 
1061 		ret = do_splice_to(in, &sd->pos, pipe, max_read_len, flags);
1062 		if (unlikely(ret < 0))
1063 			goto out_release;
1064 
1065 		read_len = ret;
1066 		sd->total_len = read_len;
1067 
1068 		/*
1069 		 * NOTE: nonblocking mode only applies to the input. We
1070 		 * must not do the output in nonblocking mode as then we
1071 		 * could get stuck data in the internal pipe:
1072 		 */
1073 		ret = actor(pipe, sd);
1074 		if (unlikely(ret < 0))
1075 			goto out_release;
1076 
1077 		bytes += ret;
1078 		len -= ret;
1079 
1080 		/*
1081 		 * In nonblocking mode, if we got back a short read then
1082 		 * that was due to either an I/O error or to the
1083 		 * pagecache entry not being there. In the I/O error case
1084 		 * the _next_ splice attempt will produce a clean IO error
1085 		 * return value (not a short read), so in both cases it's
1086 		 * correct to break out of the loop here:
1087 		 */
1088 		if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len))
1089 			break;
1090 	}
1091 
1092 	pipe->nrbufs = pipe->curbuf = 0;
1093 
1094 	return bytes;
1095 
1096 out_release:
1097 	/*
1098 	 * If we did an incomplete transfer we must release
1099 	 * the pipe buffers in question:
1100 	 */
1101 	for (i = 0; i < PIPE_BUFFERS; i++) {
1102 		struct pipe_buffer *buf = pipe->bufs + i;
1103 
1104 		if (buf->ops) {
1105 			buf->ops->release(pipe, buf);
1106 			buf->ops = NULL;
1107 		}
1108 	}
1109 	pipe->nrbufs = pipe->curbuf = 0;
1110 
1111 	/*
1112 	 * If we transferred some data, return the number of bytes:
1113 	 */
1114 	if (bytes > 0)
1115 		return bytes;
1116 
1117 	return ret;
1118 
1119 }
1120 EXPORT_SYMBOL(splice_direct_to_actor);
1121 
1122 static int direct_splice_actor(struct pipe_inode_info *pipe,
1123 			       struct splice_desc *sd)
1124 {
1125 	struct file *file = sd->u.file;
1126 
1127 	return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags);
1128 }
1129 
1130 /**
1131  * do_splice_direct - splices data directly between two files
1132  * @in:		file to splice from
1133  * @ppos:	input file offset
1134  * @out:	file to splice to
1135  * @len:	number of bytes to splice
1136  * @flags:	splice modifier flags
1137  *
1138  * Description:
1139  *    For use by do_sendfile(). splice can easily emulate sendfile, but
1140  *    doing it in the application would incur an extra system call
1141  *    (splice in + splice out, as compared to just sendfile()). So this helper
1142  *    can splice directly through a process-private pipe.
1143  *
1144  */
1145 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1146 		      size_t len, unsigned int flags)
1147 {
1148 	struct splice_desc sd = {
1149 		.len		= len,
1150 		.total_len	= len,
1151 		.flags		= flags,
1152 		.pos		= *ppos,
1153 		.u.file		= out,
1154 	};
1155 	size_t ret;
1156 
1157 	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1158 	*ppos = sd.pos;
1159 	return ret;
1160 }
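
/*
 * Illustrative userspace sketch (not part of this file): roughly what a
 * single sendfile() call, and thus do_splice_direct(), saves the
 * application from doing by hand. The descriptors and counters are
 * placeholders and error handling is omitted:
 *
 *	int pfd[2];
 *
 *	pipe(pfd);
 *	while (left) {
 *		ssize_t n = splice(in_fd, &off, pfd[1], NULL, left, 0);
 *		if (n <= 0)
 *			break;
 *		splice(pfd[0], NULL, out_fd, NULL, n, 0);
 *		left -= n;
 *	}
 */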
1161 
1162 /*
1163  * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
1164  * location, so checking ->i_pipe is not enough to verify that this is a
1165  * pipe.
1166  */
1167 static inline struct pipe_inode_info *pipe_info(struct inode *inode)
1168 {
1169 	if (S_ISFIFO(inode->i_mode))
1170 		return inode->i_pipe;
1171 
1172 	return NULL;
1173 }
1174 
1175 /*
1176  * Determine where to splice to/from.
1177  */
1178 static long do_splice(struct file *in, loff_t __user *off_in,
1179 		      struct file *out, loff_t __user *off_out,
1180 		      size_t len, unsigned int flags)
1181 {
1182 	struct pipe_inode_info *pipe;
1183 	loff_t offset, *off;
1184 	long ret;
1185 
1186 	pipe = pipe_info(in->f_path.dentry->d_inode);
1187 	if (pipe) {
1188 		if (off_in)
1189 			return -ESPIPE;
1190 		if (off_out) {
1191 			if (out->f_op->llseek == no_llseek)
1192 				return -EINVAL;
1193 			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1194 				return -EFAULT;
1195 			off = &offset;
1196 		} else
1197 			off = &out->f_pos;
1198 
1199 		ret = do_splice_from(pipe, out, off, len, flags);
1200 
1201 		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
1202 			ret = -EFAULT;
1203 
1204 		return ret;
1205 	}
1206 
1207 	pipe = pipe_info(out->f_path.dentry->d_inode);
1208 	if (pipe) {
1209 		if (off_out)
1210 			return -ESPIPE;
1211 		if (off_in) {
1212 			if (in->f_op->llseek == no_llseek)
1213 				return -EINVAL;
1214 			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1215 				return -EFAULT;
1216 			off = &offset;
1217 		} else
1218 			off = &in->f_pos;
1219 
1220 		ret = do_splice_to(in, off, pipe, len, flags);
1221 
1222 		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
1223 			ret = -EFAULT;
1224 
1225 		return ret;
1226 	}
1227 
1228 	return -EINVAL;
1229 }
1230 
1231 /*
1232  * Map an iov into an array of pages and offset/length tuples. With the
1233  * partial_page structure, we can map several non-contiguous ranges into
1234  * one pages[] map instead of splitting that operation into pieces.
1235  * Could easily be exported as a generic helper for other users, in which
1236  * case one would probably want to add a 'max_nr_pages' parameter as well.
1237  */
1238 static int get_iovec_page_array(const struct iovec __user *iov,
1239 				unsigned int nr_vecs, struct page **pages,
1240 				struct partial_page *partial, int aligned)
1241 {
1242 	int buffers = 0, error = 0;
1243 
1244 	/*
1245 	 * It's ok to take the mmap_sem for reading, even
1246 	 * across a "get_user()".
1247 	 */
1248 	down_read(&current->mm->mmap_sem);
1249 
1250 	while (nr_vecs) {
1251 		unsigned long off, npages;
1252 		void __user *base;
1253 		size_t len;
1254 		int i;
1255 
1256 		/*
1257 		 * Get user address base and length for this iovec.
1258 		 */
1259 		error = get_user(base, &iov->iov_base);
1260 		if (unlikely(error))
1261 			break;
1262 		error = get_user(len, &iov->iov_len);
1263 		if (unlikely(error))
1264 			break;
1265 
1266 		/*
1267 		 * Sanity check this iovec. 0 read succeeds.
1268 		 */
1269 		if (unlikely(!len))
1270 			break;
1271 		error = -EFAULT;
1272 		if (unlikely(!base))
1273 			break;
1274 
1275 		/*
1276 		 * Get this base offset and number of pages, then map
1277 		 * in the user pages.
1278 		 */
1279 		off = (unsigned long) base & ~PAGE_MASK;
1280 
1281 		/*
1282 		 * If asked for alignment, the offset must be zero and the
1283 		 * length a multiple of the PAGE_SIZE.
1284 		 */
1285 		error = -EINVAL;
1286 		if (aligned && (off || len & ~PAGE_MASK))
1287 			break;
1288 
1289 		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
1290 		if (npages > PIPE_BUFFERS - buffers)
1291 			npages = PIPE_BUFFERS - buffers;
1292 
1293 		error = get_user_pages(current, current->mm,
1294 				       (unsigned long) base, npages, 0, 0,
1295 				       &pages[buffers], NULL);
1296 
1297 		if (unlikely(error <= 0))
1298 			break;
1299 
1300 		/*
1301 		 * Fill this contiguous range into the partial page map.
1302 		 */
1303 		for (i = 0; i < error; i++) {
1304 			const int plen = min_t(size_t, len, PAGE_SIZE - off);
1305 
1306 			partial[buffers].offset = off;
1307 			partial[buffers].len = plen;
1308 
1309 			off = 0;
1310 			len -= plen;
1311 			buffers++;
1312 		}
1313 
1314 		/*
1315 		 * We didn't complete this iov, stop here since it probably
1316 		 * means we have to move some of this into a pipe to
1317 		 * be able to continue.
1318 		 */
1319 		if (len)
1320 			break;
1321 
1322 		/*
1323 		 * Don't continue if we mapped fewer pages than we asked for,
1324 		 * or if we mapped the max number of pages that we have
1325 		 * room for.
1326 		 */
1327 		if (error < npages || buffers == PIPE_BUFFERS)
1328 			break;
1329 
1330 		nr_vecs--;
1331 		iov++;
1332 	}
1333 
1334 	up_read(&current->mm->mmap_sem);
1335 
1336 	if (buffers)
1337 		return buffers;
1338 
1339 	return error;
1340 }
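
/*
 * Worked example for get_iovec_page_array() (illustrative): an iovec with
 * iov_base = 0x1003 and iov_len = 8192 on a 4k-page box gives off = 3 and
 * npages = (3 + 8192 + 4095) >> 12 = 3, and the loop above fills partial[]
 * with lengths of 4093, 4096 and 3 bytes.
 */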
1341 
1342 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1343 			struct splice_desc *sd)
1344 {
1345 	char *src;
1346 	int ret;
1347 
1348 	ret = buf->ops->confirm(pipe, buf);
1349 	if (unlikely(ret))
1350 		return ret;
1351 
1352 	/*
1353 	 * See if we can use the atomic maps, by prefaulting in the
1354 	 * pages and doing an atomic copy
1355 	 */
1356 	if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
1357 		src = buf->ops->map(pipe, buf, 1);
1358 		ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
1359 							sd->len);
1360 		buf->ops->unmap(pipe, buf, src);
1361 		if (!ret) {
1362 			ret = sd->len;
1363 			goto out;
1364 		}
1365 	}
1366 
1367 	/*
1368 	 * No dice, use slow non-atomic map and copy
1369 	 */
1370 	src = buf->ops->map(pipe, buf, 0);
1371 
1372 	ret = sd->len;
1373 	if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
1374 		ret = -EFAULT;
1375 
1376 out:
1377 	if (ret > 0)
1378 		sd->u.userptr += ret;
1379 	buf->ops->unmap(pipe, buf, src);
1380 	return ret;
1381 }
1382 
1383 /*
1384  * For lack of a better implementation, implement vmsplice() to userspace
1385  * as a simple copy of the pipe's pages to the user iov.
1386  */
1387 static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
1388 			     unsigned long nr_segs, unsigned int flags)
1389 {
1390 	struct pipe_inode_info *pipe;
1391 	struct splice_desc sd;
1392 	ssize_t size;
1393 	int error;
1394 	long ret;
1395 
1396 	pipe = pipe_info(file->f_path.dentry->d_inode);
1397 	if (!pipe)
1398 		return -EBADF;
1399 
1400 	if (pipe->inode)
1401 		mutex_lock(&pipe->inode->i_mutex);
1402 
1403 	error = ret = 0;
1404 	while (nr_segs) {
1405 		void __user *base;
1406 		size_t len;
1407 
1408 		/*
1409 		 * Get user address base and length for this iovec.
1410 		 */
1411 		error = get_user(base, &iov->iov_base);
1412 		if (unlikely(error))
1413 			break;
1414 		error = get_user(len, &iov->iov_len);
1415 		if (unlikely(error))
1416 			break;
1417 
1418 		/*
1419 		 * Sanity check this iovec. 0 read succeeds.
1420 		 */
1421 		if (unlikely(!len))
1422 			break;
1423 		if (unlikely(!base)) {
1424 			error = -EFAULT;
1425 			break;
1426 		}
1427 
1428 		sd.len = 0;
1429 		sd.total_len = len;
1430 		sd.flags = flags;
1431 		sd.u.userptr = base;
1432 		sd.pos = 0;
1433 
1434 		size = __splice_from_pipe(pipe, &sd, pipe_to_user);
1435 		if (size < 0) {
1436 			if (!ret)
1437 				ret = size;
1438 
1439 			break;
1440 		}
1441 
1442 		ret += size;
1443 
1444 		if (size < len)
1445 			break;
1446 
1447 		nr_segs--;
1448 		iov++;
1449 	}
1450 
1451 	if (pipe->inode)
1452 		mutex_unlock(&pipe->inode->i_mutex);
1453 
1454 	if (!ret)
1455 		ret = error;
1456 
1457 	return ret;
1458 }
1459 
1460 /*
1461  * vmsplice splices a user address range into a pipe. It can be thought of
1462  * as splice-from-memory, where the regular splice is splice-from-file (or
1463  * to file). In both cases the output is a pipe, naturally.
1464  */
1465 static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
1466 			     unsigned long nr_segs, unsigned int flags)
1467 {
1468 	struct pipe_inode_info *pipe;
1469 	struct page *pages[PIPE_BUFFERS];
1470 	struct partial_page partial[PIPE_BUFFERS];
1471 	struct splice_pipe_desc spd = {
1472 		.pages = pages,
1473 		.partial = partial,
1474 		.flags = flags,
1475 		.ops = &user_page_pipe_buf_ops,
1476 	};
1477 
1478 	pipe = pipe_info(file->f_path.dentry->d_inode);
1479 	if (!pipe)
1480 		return -EBADF;
1481 
1482 	spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial,
1483 					    flags & SPLICE_F_GIFT);
1484 	if (spd.nr_pages <= 0)
1485 		return spd.nr_pages;
1486 
1487 	return splice_to_pipe(pipe, &spd);
1488 }
1489 
1490 /*
1491  * Note that vmsplice only really supports true splicing _from_ user memory
1492  * to a pipe, not the other way around. Splicing from user memory is a simple
1493  * operation that can be supported without any funky alignment restrictions
1494  * or nasty vm tricks. We simply map in the user pages and fill them into
1495  * a pipe. The reverse isn't quite as easy, though. There are two possible
1496  * solutions for that:
1497  *
1498  *	- memcpy() the data internally, at which point we might as well just
1499  *	  do a regular read() on the buffer anyway.
1500  *	- Lots of nasty vm tricks that are neither fast nor flexible (they
1501  *	  impose restrictions on both ends of the pipe).
1502  *
1503  * Currently we punt and implement it as a normal copy, see pipe_to_user().
1504  *
1505  */
1506 asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
1507 			     unsigned long nr_segs, unsigned int flags)
1508 {
1509 	struct file *file;
1510 	long error;
1511 	int fput;
1512 
1513 	if (unlikely(nr_segs > UIO_MAXIOV))
1514 		return -EINVAL;
1515 	else if (unlikely(!nr_segs))
1516 		return 0;
1517 
1518 	error = -EBADF;
1519 	file = fget_light(fd, &fput);
1520 	if (file) {
1521 		if (file->f_mode & FMODE_WRITE)
1522 			error = vmsplice_to_pipe(file, iov, nr_segs, flags);
1523 		else if (file->f_mode & FMODE_READ)
1524 			error = vmsplice_to_user(file, iov, nr_segs, flags);
1525 
1526 		fput_light(file, fput);
1527 	}
1528 
1529 	return error;
1530 }
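
/*
 * Illustrative userspace sketch (not part of this file): gathering two
 * user buffers into a pipe with a single vmsplice(2) call. 'hdr', 'body'
 * and 'pfd' are placeholders and error handling is omitted:
 *
 *	struct iovec iov[2] = {
 *		{ .iov_base = hdr,  .iov_len = hdr_len  },
 *		{ .iov_base = body, .iov_len = body_len },
 *	};
 *
 *	vmsplice(pfd[1], iov, 2, 0);
 */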
1531 
1532 asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
1533 			   int fd_out, loff_t __user *off_out,
1534 			   size_t len, unsigned int flags)
1535 {
1536 	long error;
1537 	struct file *in, *out;
1538 	int fput_in, fput_out;
1539 
1540 	if (unlikely(!len))
1541 		return 0;
1542 
1543 	error = -EBADF;
1544 	in = fget_light(fd_in, &fput_in);
1545 	if (in) {
1546 		if (in->f_mode & FMODE_READ) {
1547 			out = fget_light(fd_out, &fput_out);
1548 			if (out) {
1549 				if (out->f_mode & FMODE_WRITE)
1550 					error = do_splice(in, off_in,
1551 							  out, off_out,
1552 							  len, flags);
1553 				fput_light(out, fput_out);
1554 			}
1555 		}
1556 
1557 		fput_light(in, fput_in);
1558 	}
1559 
1560 	return error;
1561 }
1562 
1563 /*
1564  * Make sure there's data to read. Wait for input if we can, otherwise
1565  * return an appropriate error.
1566  */
1567 static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1568 {
1569 	int ret;
1570 
1571 	/*
1572 	 * Check ->nrbufs without the inode lock first. This function
1573 	 * is speculative anyway, so missing one is OK.
1574 	 */
1575 	if (pipe->nrbufs)
1576 		return 0;
1577 
1578 	ret = 0;
1579 	mutex_lock(&pipe->inode->i_mutex);
1580 
1581 	while (!pipe->nrbufs) {
1582 		if (signal_pending(current)) {
1583 			ret = -ERESTARTSYS;
1584 			break;
1585 		}
1586 		if (!pipe->writers)
1587 			break;
1588 		if (!pipe->waiting_writers) {
1589 			if (flags & SPLICE_F_NONBLOCK) {
1590 				ret = -EAGAIN;
1591 				break;
1592 			}
1593 		}
1594 		pipe_wait(pipe);
1595 	}
1596 
1597 	mutex_unlock(&pipe->inode->i_mutex);
1598 	return ret;
1599 }
1600 
1601 /*
1602  * Make sure there's writable room. Wait for room if we can, otherwise
1603  * return an appropriate error.
1604  */
1605 static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1606 {
1607 	int ret;
1608 
1609 	/*
1610 	 * Check ->nrbufs without the inode lock first. This function
1611 	 * is speculative anyway, so missing one is OK.
1612 	 */
1613 	if (pipe->nrbufs < PIPE_BUFFERS)
1614 		return 0;
1615 
1616 	ret = 0;
1617 	mutex_lock(&pipe->inode->i_mutex);
1618 
1619 	while (pipe->nrbufs >= PIPE_BUFFERS) {
1620 		if (!pipe->readers) {
1621 			send_sig(SIGPIPE, current, 0);
1622 			ret = -EPIPE;
1623 			break;
1624 		}
1625 		if (flags & SPLICE_F_NONBLOCK) {
1626 			ret = -EAGAIN;
1627 			break;
1628 		}
1629 		if (signal_pending(current)) {
1630 			ret = -ERESTARTSYS;
1631 			break;
1632 		}
1633 		pipe->waiting_writers++;
1634 		pipe_wait(pipe);
1635 		pipe->waiting_writers--;
1636 	}
1637 
1638 	mutex_unlock(&pipe->inode->i_mutex);
1639 	return ret;
1640 }
1641 
1642 /*
1643  * Link contents of ipipe to opipe.
1644  */
1645 static int link_pipe(struct pipe_inode_info *ipipe,
1646 		     struct pipe_inode_info *opipe,
1647 		     size_t len, unsigned int flags)
1648 {
1649 	struct pipe_buffer *ibuf, *obuf;
1650 	int ret = 0, i = 0, nbuf;
1651 
1652 	/*
1653 	 * Potential ABBA deadlock, work around it by ordering lock
1654 	 * grabbing by inode address. Otherwise two different processes
1655 	 * could deadlock (one doing tee from A -> B, the other from B -> A).
1656 	 */
1657 	inode_double_lock(ipipe->inode, opipe->inode);
1658 
1659 	do {
1660 		if (!opipe->readers) {
1661 			send_sig(SIGPIPE, current, 0);
1662 			if (!ret)
1663 				ret = -EPIPE;
1664 			break;
1665 		}
1666 
1667 		/*
1668 		 * If we have iterated all input buffers or run out of
1669 		 * output room, break.
1670 		 */
1671 		if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
1672 			break;
1673 
1674 		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
1675 		nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
1676 
1677 		/*
1678 		 * Get a reference to this pipe buffer,
1679 		 * so we can copy the contents over.
1680 		 */
1681 		ibuf->ops->get(ipipe, ibuf);
1682 
1683 		obuf = opipe->bufs + nbuf;
1684 		*obuf = *ibuf;
1685 
1686 		/*
1687 		 * Don't inherit the gift flag, we need to
1688 		 * prevent multiple steals of this page.
1689 		 */
1690 		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1691 
1692 		if (obuf->len > len)
1693 			obuf->len = len;
1694 
1695 		opipe->nrbufs++;
1696 		ret += obuf->len;
1697 		len -= obuf->len;
1698 		i++;
1699 	} while (len);
1700 
1701 	inode_double_unlock(ipipe->inode, opipe->inode);
1702 
1703 	/*
1704 	 * If we put data in the output pipe, wakeup any potential readers.
1705 	 */
1706 	if (ret > 0) {
1707 		smp_mb();
1708 		if (waitqueue_active(&opipe->wait))
1709 			wake_up_interruptible(&opipe->wait);
1710 		kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
1711 	}
1712 
1713 	return ret;
1714 }
1715 
1716 /*
1717  * This is a tee(1) implementation that works on pipes. It doesn't copy
1718  * any data; it simply references the 'in' pages on the 'out' pipe.
1719  * The 'flags' used are the SPLICE_F_* variants, currently the only
1720  * applicable one is SPLICE_F_NONBLOCK.
1721  */
1722 static long do_tee(struct file *in, struct file *out, size_t len,
1723 		   unsigned int flags)
1724 {
1725 	struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode);
1726 	struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode);
1727 	int ret = -EINVAL;
1728 
1729 	/*
1730 	 * Duplicate the contents of ipipe to opipe without actually
1731 	 * copying the data.
1732 	 */
1733 	if (ipipe && opipe && ipipe != opipe) {
1734 		/*
1735 		 * Keep going, unless we encounter an error. The ipipe/opipe
1736 		 * ordering doesn't really matter.
1737 		 */
1738 		ret = link_ipipe_prep(ipipe, flags);
1739 		if (!ret) {
1740 			ret = link_opipe_prep(opipe, flags);
1741 			if (!ret) {
1742 				ret = link_pipe(ipipe, opipe, len, flags);
1743 				if (!ret && (flags & SPLICE_F_NONBLOCK))
1744 					ret = -EAGAIN;
1745 			}
1746 		}
1747 	}
1748 
1749 	return ret;
1750 }
1751 
1752 asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
1753 {
1754 	struct file *in;
1755 	int error, fput_in;
1756 
1757 	if (unlikely(!len))
1758 		return 0;
1759 
1760 	error = -EBADF;
1761 	in = fget_light(fdin, &fput_in);
1762 	if (in) {
1763 		if (in->f_mode & FMODE_READ) {
1764 			int fput_out;
1765 			struct file *out = fget_light(fdout, &fput_out);
1766 
1767 			if (out) {
1768 				if (out->f_mode & FMODE_WRITE)
1769 					error = do_tee(in, out, len, flags);
1770 				fput_light(out, fput_out);
1771 			}
1772 		}
1773 		fput_light(in, fput_in);
1774 	}
1775 
1776 	return error;
1777 }
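
/*
 * Illustrative userspace sketch (not part of this file): duplicating pipe
 * data with tee(2), then draining the original copy with splice(2). This
 * assumes stdin and stdout are both pipes; 'log_fd' is a placeholder and
 * error handling is omitted:
 *
 *	for (;;) {
 *		ssize_t n = tee(STDIN_FILENO, STDOUT_FILENO, 65536, 0);
 *		if (n <= 0)
 *			break;
 *		splice(STDIN_FILENO, NULL, log_fd, NULL, n, 0);
 *	}
 */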
1778