xref: /openbmc/linux/fs/splice.c (revision 7bcae826)
1 /*
2  * "splice": joining two ropes together by interweaving their strands.
3  *
4  * This is the "extended pipe" functionality, where a pipe is used as
5  * an arbitrary in-memory buffer. Think of a pipe as a small kernel
6  * buffer that you can use to transfer data from one end to the other.
7  *
8  * The traditional unix read/write is extended with a "splice()" operation
9  * that transfers data buffers to or from a pipe buffer.
10  *
11  * Named by Larry McVoy, original implementation from Linus, extended by
12  * Jens to support splicing to files, network, direct splicing, etc and
13  * fixing lots of bugs.
14  *
15  * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
16  * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
17  * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
18  *
19  */
20 #include <linux/bvec.h>
21 #include <linux/fs.h>
22 #include <linux/file.h>
23 #include <linux/pagemap.h>
24 #include <linux/splice.h>
25 #include <linux/memcontrol.h>
26 #include <linux/mm_inline.h>
27 #include <linux/swap.h>
28 #include <linux/writeback.h>
29 #include <linux/export.h>
30 #include <linux/syscalls.h>
31 #include <linux/uio.h>
32 #include <linux/security.h>
33 #include <linux/gfp.h>
34 #include <linux/socket.h>
35 #include <linux/compat.h>
36 #include "internal.h"
37 
38 /*
39  * Attempt to steal a page from a pipe buffer. This should perhaps go into
40  * a vm helper function, it's already simplified quite a bit by the
41  * addition of remove_mapping(). If success is returned, the caller may
42  * attempt to reuse this page for another destination.
43  */
44 static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
45 				     struct pipe_buffer *buf)
46 {
47 	struct page *page = buf->page;
48 	struct address_space *mapping;
49 
50 	lock_page(page);
51 
52 	mapping = page_mapping(page);
53 	if (mapping) {
54 		WARN_ON(!PageUptodate(page));
55 
56 		/*
57 		 * At least for ext2 with nobh option, we need to wait on
58 		 * writeback completing on this page, since we'll remove it
59 		 * from the pagecache.  Otherwise truncate wont wait on the
60 		 * page, allowing the disk blocks to be reused by someone else
61 		 * before we actually wrote our data to them. fs corruption
62 		 * ensues.
63 		 */
64 		wait_on_page_writeback(page);
65 
66 		if (page_has_private(page) &&
67 		    !try_to_release_page(page, GFP_KERNEL))
68 			goto out_unlock;
69 
70 		/*
71 		 * If we succeeded in removing the mapping, set LRU flag
72 		 * and return good.
73 		 */
74 		if (remove_mapping(mapping, page)) {
75 			buf->flags |= PIPE_BUF_FLAG_LRU;
76 			return 0;
77 		}
78 	}
79 
80 	/*
81 	 * Raced with truncate or failed to remove page from current
82 	 * address space, unlock and return failure.
83 	 */
84 out_unlock:
85 	unlock_page(page);
86 	return 1;
87 }
88 
89 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
90 					struct pipe_buffer *buf)
91 {
92 	put_page(buf->page);
93 	buf->flags &= ~PIPE_BUF_FLAG_LRU;
94 }
95 
96 /*
97  * Check whether the contents of buf is OK to access. Since the content
98  * is a page cache page, IO may be in flight.
99  */
100 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
101 				       struct pipe_buffer *buf)
102 {
103 	struct page *page = buf->page;
104 	int err;
105 
106 	if (!PageUptodate(page)) {
107 		lock_page(page);
108 
109 		/*
110 		 * Page got truncated/unhashed. This will cause a 0-byte
111 		 * splice, if this is the first page.
112 		 */
113 		if (!page->mapping) {
114 			err = -ENODATA;
115 			goto error;
116 		}
117 
118 		/*
119 		 * Uh oh, read-error from disk.
120 		 */
121 		if (!PageUptodate(page)) {
122 			err = -EIO;
123 			goto error;
124 		}
125 
126 		/*
127 		 * Page is ok afterall, we are done.
128 		 */
129 		unlock_page(page);
130 	}
131 
132 	return 0;
133 error:
134 	unlock_page(page);
135 	return err;
136 }
137 
138 const struct pipe_buf_operations page_cache_pipe_buf_ops = {
139 	.can_merge = 0,
140 	.confirm = page_cache_pipe_buf_confirm,
141 	.release = page_cache_pipe_buf_release,
142 	.steal = page_cache_pipe_buf_steal,
143 	.get = generic_pipe_buf_get,
144 };
145 
146 static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
147 				    struct pipe_buffer *buf)
148 {
149 	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
150 		return 1;
151 
152 	buf->flags |= PIPE_BUF_FLAG_LRU;
153 	return generic_pipe_buf_steal(pipe, buf);
154 }
155 
156 static const struct pipe_buf_operations user_page_pipe_buf_ops = {
157 	.can_merge = 0,
158 	.confirm = generic_pipe_buf_confirm,
159 	.release = page_cache_pipe_buf_release,
160 	.steal = user_page_pipe_buf_steal,
161 	.get = generic_pipe_buf_get,
162 };
163 
/*
 * Wake up tasks sleeping on the pipe's wait queue and send SIGIO with
 * POLL_IN to the pipe's fasync readers.
 */
static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
{
	/* Full barrier ahead of the lockless waitqueue_active() check. */
	smp_mb();
	if (waitqueue_active(&pipe->wait))
		wake_up_interruptible(&pipe->wait);
	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
171 
172 /**
173  * splice_to_pipe - fill passed data into a pipe
174  * @pipe:	pipe to fill
175  * @spd:	data to fill
176  *
177  * Description:
178  *    @spd contains a map of pages and len/offset tuples, along with
179  *    the struct pipe_buf_operations associated with these pages. This
180  *    function will link that data to the pipe.
181  *
182  */
183 ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
184 		       struct splice_pipe_desc *spd)
185 {
186 	unsigned int spd_pages = spd->nr_pages;
187 	int ret = 0, page_nr = 0;
188 
189 	if (!spd_pages)
190 		return 0;
191 
192 	if (unlikely(!pipe->readers)) {
193 		send_sig(SIGPIPE, current, 0);
194 		ret = -EPIPE;
195 		goto out;
196 	}
197 
198 	while (pipe->nrbufs < pipe->buffers) {
199 		int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
200 		struct pipe_buffer *buf = pipe->bufs + newbuf;
201 
202 		buf->page = spd->pages[page_nr];
203 		buf->offset = spd->partial[page_nr].offset;
204 		buf->len = spd->partial[page_nr].len;
205 		buf->private = spd->partial[page_nr].private;
206 		buf->ops = spd->ops;
207 		buf->flags = 0;
208 
209 		pipe->nrbufs++;
210 		page_nr++;
211 		ret += buf->len;
212 
213 		if (!--spd->nr_pages)
214 			break;
215 	}
216 
217 	if (!ret)
218 		ret = -EAGAIN;
219 
220 out:
221 	while (page_nr < spd_pages)
222 		spd->spd_release(spd, page_nr++);
223 
224 	return ret;
225 }
226 EXPORT_SYMBOL_GPL(splice_to_pipe);
227 
228 ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
229 {
230 	int ret;
231 
232 	if (unlikely(!pipe->readers)) {
233 		send_sig(SIGPIPE, current, 0);
234 		ret = -EPIPE;
235 	} else if (pipe->nrbufs == pipe->buffers) {
236 		ret = -EAGAIN;
237 	} else {
238 		int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
239 		pipe->bufs[newbuf] = *buf;
240 		pipe->nrbufs++;
241 		return buf->len;
242 	}
243 	pipe_buf_release(pipe, buf);
244 	return ret;
245 }
246 EXPORT_SYMBOL(add_to_pipe);
247 
248 void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
249 {
250 	put_page(spd->pages[i]);
251 }
252 
253 /*
254  * Check if we need to grow the arrays holding pages and partial page
255  * descriptions.
256  */
257 int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
258 {
259 	unsigned int buffers = ACCESS_ONCE(pipe->buffers);
260 
261 	spd->nr_pages_max = buffers;
262 	if (buffers <= PIPE_DEF_BUFFERS)
263 		return 0;
264 
265 	spd->pages = kmalloc(buffers * sizeof(struct page *), GFP_KERNEL);
266 	spd->partial = kmalloc(buffers * sizeof(struct partial_page), GFP_KERNEL);
267 
268 	if (spd->pages && spd->partial)
269 		return 0;
270 
271 	kfree(spd->pages);
272 	kfree(spd->partial);
273 	return -ENOMEM;
274 }
275 
276 void splice_shrink_spd(struct splice_pipe_desc *spd)
277 {
278 	if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
279 		return;
280 
281 	kfree(spd->pages);
282 	kfree(spd->partial);
283 }
284 
285 /**
286  * generic_file_splice_read - splice data from file to a pipe
287  * @in:		file to splice from
288  * @ppos:	position in @in
289  * @pipe:	pipe to splice to
290  * @len:	number of bytes to splice
291  * @flags:	splice modifier flags
292  *
293  * Description:
294  *    Will read pages from given file and fill them into a pipe. Can be
295  *    used as long as it has more or less sane ->read_iter().
296  *
297  */
298 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
299 				 struct pipe_inode_info *pipe, size_t len,
300 				 unsigned int flags)
301 {
302 	struct iov_iter to;
303 	struct kiocb kiocb;
304 	int idx, ret;
305 
306 	iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len);
307 	idx = to.idx;
308 	init_sync_kiocb(&kiocb, in);
309 	kiocb.ki_pos = *ppos;
310 	ret = in->f_op->read_iter(&kiocb, &to);
311 	if (ret > 0) {
312 		*ppos = kiocb.ki_pos;
313 		file_accessed(in);
314 	} else if (ret < 0) {
315 		to.idx = idx;
316 		to.iov_offset = 0;
317 		iov_iter_advance(&to, 0); /* to free what was emitted */
318 		/*
319 		 * callers of ->splice_read() expect -EAGAIN on
320 		 * "can't put anything in there", rather than -EFAULT.
321 		 */
322 		if (ret == -EFAULT)
323 			ret = -EAGAIN;
324 	}
325 
326 	return ret;
327 }
328 EXPORT_SYMBOL(generic_file_splice_read);
329 
330 const struct pipe_buf_operations default_pipe_buf_ops = {
331 	.can_merge = 0,
332 	.confirm = generic_pipe_buf_confirm,
333 	.release = generic_pipe_buf_release,
334 	.steal = generic_pipe_buf_steal,
335 	.get = generic_pipe_buf_get,
336 };
337 
/*
 * ->steal() implementation that always refuses: it unconditionally returns
 * 1, the same value the other steal callbacks in this file return when a
 * buffer cannot be stolen.
 */
static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	return 1;
}
343 
344 /* Pipe buffer operations for a socket and similar. */
345 const struct pipe_buf_operations nosteal_pipe_buf_ops = {
346 	.can_merge = 0,
347 	.confirm = generic_pipe_buf_confirm,
348 	.release = generic_pipe_buf_release,
349 	.steal = generic_pipe_buf_nosteal,
350 	.get = generic_pipe_buf_get,
351 };
352 EXPORT_SYMBOL(nosteal_pipe_buf_ops);
353 
354 static ssize_t kernel_readv(struct file *file, const struct kvec *vec,
355 			    unsigned long vlen, loff_t offset)
356 {
357 	mm_segment_t old_fs;
358 	loff_t pos = offset;
359 	ssize_t res;
360 
361 	old_fs = get_fs();
362 	set_fs(get_ds());
363 	/* The cast to a user pointer is valid due to the set_fs() */
364 	res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0);
365 	set_fs(old_fs);
366 
367 	return res;
368 }
369 
370 ssize_t kernel_write(struct file *file, const char *buf, size_t count,
371 			    loff_t pos)
372 {
373 	mm_segment_t old_fs;
374 	ssize_t res;
375 
376 	old_fs = get_fs();
377 	set_fs(get_ds());
378 	/* The cast to a user pointer is valid due to the set_fs() */
379 	res = vfs_write(file, (__force const char __user *)buf, count, &pos);
380 	set_fs(old_fs);
381 
382 	return res;
383 }
384 EXPORT_SYMBOL(kernel_write);
385 
386 static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
387 				 struct pipe_inode_info *pipe, size_t len,
388 				 unsigned int flags)
389 {
390 	struct kvec *vec, __vec[PIPE_DEF_BUFFERS];
391 	struct iov_iter to;
392 	struct page **pages;
393 	unsigned int nr_pages;
394 	size_t offset, dummy, copied = 0;
395 	ssize_t res;
396 	int i;
397 
398 	if (pipe->nrbufs == pipe->buffers)
399 		return -EAGAIN;
400 
401 	/*
402 	 * Try to keep page boundaries matching to source pagecache ones -
403 	 * it probably won't be much help, but...
404 	 */
405 	offset = *ppos & ~PAGE_MASK;
406 
407 	iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len + offset);
408 
409 	res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &dummy);
410 	if (res <= 0)
411 		return -ENOMEM;
412 
413 	BUG_ON(dummy);
414 	nr_pages = DIV_ROUND_UP(res, PAGE_SIZE);
415 
416 	vec = __vec;
417 	if (nr_pages > PIPE_DEF_BUFFERS) {
418 		vec = kmalloc(nr_pages * sizeof(struct kvec), GFP_KERNEL);
419 		if (unlikely(!vec)) {
420 			res = -ENOMEM;
421 			goto out;
422 		}
423 	}
424 
425 	pipe->bufs[to.idx].offset = offset;
426 	pipe->bufs[to.idx].len -= offset;
427 
428 	for (i = 0; i < nr_pages; i++) {
429 		size_t this_len = min_t(size_t, len, PAGE_SIZE - offset);
430 		vec[i].iov_base = page_address(pages[i]) + offset;
431 		vec[i].iov_len = this_len;
432 		len -= this_len;
433 		offset = 0;
434 	}
435 
436 	res = kernel_readv(in, vec, nr_pages, *ppos);
437 	if (res > 0) {
438 		copied = res;
439 		*ppos += res;
440 	}
441 
442 	if (vec != __vec)
443 		kfree(vec);
444 out:
445 	for (i = 0; i < nr_pages; i++)
446 		put_page(pages[i]);
447 	kvfree(pages);
448 	iov_iter_advance(&to, copied);	/* truncates and discards */
449 	return res;
450 }
451 
452 /*
453  * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
454  * using sendpage(). Return the number of bytes sent.
455  */
456 static int pipe_to_sendpage(struct pipe_inode_info *pipe,
457 			    struct pipe_buffer *buf, struct splice_desc *sd)
458 {
459 	struct file *file = sd->u.file;
460 	loff_t pos = sd->pos;
461 	int more;
462 
463 	if (!likely(file->f_op->sendpage))
464 		return -EINVAL;
465 
466 	more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
467 
468 	if (sd->len < sd->total_len && pipe->nrbufs > 1)
469 		more |= MSG_SENDPAGE_NOTLAST;
470 
471 	return file->f_op->sendpage(file, buf->page, buf->offset,
472 				    sd->len, &pos, more);
473 }
474 
/*
 * Wake up tasks sleeping on the pipe's wait queue and send SIGIO with
 * POLL_OUT to the pipe's fasync writers.
 */
static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
{
	/* Full barrier ahead of the lockless waitqueue_active() check. */
	smp_mb();
	if (waitqueue_active(&pipe->wait))
		wake_up_interruptible(&pipe->wait);
	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
482 
483 /**
484  * splice_from_pipe_feed - feed available data from a pipe to a file
485  * @pipe:	pipe to splice from
486  * @sd:		information to @actor
487  * @actor:	handler that splices the data
488  *
489  * Description:
490  *    This function loops over the pipe and calls @actor to do the
491  *    actual moving of a single struct pipe_buffer to the desired
492  *    destination.  It returns when there's no more buffers left in
493  *    the pipe or if the requested number of bytes (@sd->total_len)
494  *    have been copied.  It returns a positive number (one) if the
495  *    pipe needs to be filled with more data, zero if the required
496  *    number of bytes have been copied and -errno on error.
497  *
498  *    This, together with splice_from_pipe_{begin,end,next}, may be
499  *    used to implement the functionality of __splice_from_pipe() when
500  *    locking is required around copying the pipe buffers to the
501  *    destination.
502  */
503 static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
504 			  splice_actor *actor)
505 {
506 	int ret;
507 
508 	while (pipe->nrbufs) {
509 		struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
510 
511 		sd->len = buf->len;
512 		if (sd->len > sd->total_len)
513 			sd->len = sd->total_len;
514 
515 		ret = pipe_buf_confirm(pipe, buf);
516 		if (unlikely(ret)) {
517 			if (ret == -ENODATA)
518 				ret = 0;
519 			return ret;
520 		}
521 
522 		ret = actor(pipe, buf, sd);
523 		if (ret <= 0)
524 			return ret;
525 
526 		buf->offset += ret;
527 		buf->len -= ret;
528 
529 		sd->num_spliced += ret;
530 		sd->len -= ret;
531 		sd->pos += ret;
532 		sd->total_len -= ret;
533 
534 		if (!buf->len) {
535 			pipe_buf_release(pipe, buf);
536 			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
537 			pipe->nrbufs--;
538 			if (pipe->files)
539 				sd->need_wakeup = true;
540 		}
541 
542 		if (!sd->total_len)
543 			return 0;
544 	}
545 
546 	return 1;
547 }
548 
549 /**
550  * splice_from_pipe_next - wait for some data to splice from
551  * @pipe:	pipe to splice from
552  * @sd:		information about the splice operation
553  *
554  * Description:
555  *    This function will wait for some data and return a positive
556  *    value (one) if pipe buffers are available.  It will return zero
557  *    or -errno if no more data needs to be spliced.
558  */
559 static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
560 {
561 	/*
562 	 * Check for signal early to make process killable when there are
563 	 * always buffers available
564 	 */
565 	if (signal_pending(current))
566 		return -ERESTARTSYS;
567 
568 	while (!pipe->nrbufs) {
569 		if (!pipe->writers)
570 			return 0;
571 
572 		if (!pipe->waiting_writers && sd->num_spliced)
573 			return 0;
574 
575 		if (sd->flags & SPLICE_F_NONBLOCK)
576 			return -EAGAIN;
577 
578 		if (signal_pending(current))
579 			return -ERESTARTSYS;
580 
581 		if (sd->need_wakeup) {
582 			wakeup_pipe_writers(pipe);
583 			sd->need_wakeup = false;
584 		}
585 
586 		pipe_wait(pipe);
587 	}
588 
589 	return 1;
590 }
591 
592 /**
593  * splice_from_pipe_begin - start splicing from pipe
594  * @sd:		information about the splice operation
595  *
596  * Description:
597  *    This function should be called before a loop containing
598  *    splice_from_pipe_next() and splice_from_pipe_feed() to
599  *    initialize the necessary fields of @sd.
600  */
601 static void splice_from_pipe_begin(struct splice_desc *sd)
602 {
603 	sd->num_spliced = 0;
604 	sd->need_wakeup = false;
605 }
606 
607 /**
608  * splice_from_pipe_end - finish splicing from pipe
609  * @pipe:	pipe to splice from
610  * @sd:		information about the splice operation
611  *
612  * Description:
613  *    This function will wake up pipe writers if necessary.  It should
614  *    be called after a loop containing splice_from_pipe_next() and
615  *    splice_from_pipe_feed().
616  */
617 static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
618 {
619 	if (sd->need_wakeup)
620 		wakeup_pipe_writers(pipe);
621 }
622 
623 /**
624  * __splice_from_pipe - splice data from a pipe to given actor
625  * @pipe:	pipe to splice from
626  * @sd:		information to @actor
627  * @actor:	handler that splices the data
628  *
629  * Description:
630  *    This function does little more than loop over the pipe and call
631  *    @actor to do the actual moving of a single struct pipe_buffer to
632  *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
633  *    pipe_to_user.
634  *
635  */
636 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
637 			   splice_actor *actor)
638 {
639 	int ret;
640 
641 	splice_from_pipe_begin(sd);
642 	do {
643 		cond_resched();
644 		ret = splice_from_pipe_next(pipe, sd);
645 		if (ret > 0)
646 			ret = splice_from_pipe_feed(pipe, sd, actor);
647 	} while (ret > 0);
648 	splice_from_pipe_end(pipe, sd);
649 
650 	return sd->num_spliced ? sd->num_spliced : ret;
651 }
652 EXPORT_SYMBOL(__splice_from_pipe);
653 
654 /**
655  * splice_from_pipe - splice data from a pipe to a file
656  * @pipe:	pipe to splice from
657  * @out:	file to splice to
658  * @ppos:	position in @out
659  * @len:	how many bytes to splice
660  * @flags:	splice modifier flags
661  * @actor:	handler that splices the data
662  *
663  * Description:
664  *    See __splice_from_pipe. This function locks the pipe inode,
665  *    otherwise it's identical to __splice_from_pipe().
666  *
667  */
668 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
669 			 loff_t *ppos, size_t len, unsigned int flags,
670 			 splice_actor *actor)
671 {
672 	ssize_t ret;
673 	struct splice_desc sd = {
674 		.total_len = len,
675 		.flags = flags,
676 		.pos = *ppos,
677 		.u.file = out,
678 	};
679 
680 	pipe_lock(pipe);
681 	ret = __splice_from_pipe(pipe, &sd, actor);
682 	pipe_unlock(pipe);
683 
684 	return ret;
685 }
686 
687 /**
688  * iter_file_splice_write - splice data from a pipe to a file
689  * @pipe:	pipe info
690  * @out:	file to write to
691  * @ppos:	position in @out
692  * @len:	number of bytes to splice
693  * @flags:	splice modifier flags
694  *
695  * Description:
696  *    Will either move or copy pages (determined by @flags options) from
697  *    the given pipe inode to the given file.
698  *    This one is ->write_iter-based.
699  *
700  */
701 ssize_t
702 iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
703 			  loff_t *ppos, size_t len, unsigned int flags)
704 {
705 	struct splice_desc sd = {
706 		.total_len = len,
707 		.flags = flags,
708 		.pos = *ppos,
709 		.u.file = out,
710 	};
711 	int nbufs = pipe->buffers;
712 	struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
713 					GFP_KERNEL);
714 	ssize_t ret;
715 
716 	if (unlikely(!array))
717 		return -ENOMEM;
718 
719 	pipe_lock(pipe);
720 
721 	splice_from_pipe_begin(&sd);
722 	while (sd.total_len) {
723 		struct iov_iter from;
724 		size_t left;
725 		int n, idx;
726 
727 		ret = splice_from_pipe_next(pipe, &sd);
728 		if (ret <= 0)
729 			break;
730 
731 		if (unlikely(nbufs < pipe->buffers)) {
732 			kfree(array);
733 			nbufs = pipe->buffers;
734 			array = kcalloc(nbufs, sizeof(struct bio_vec),
735 					GFP_KERNEL);
736 			if (!array) {
737 				ret = -ENOMEM;
738 				break;
739 			}
740 		}
741 
742 		/* build the vector */
743 		left = sd.total_len;
744 		for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) {
745 			struct pipe_buffer *buf = pipe->bufs + idx;
746 			size_t this_len = buf->len;
747 
748 			if (this_len > left)
749 				this_len = left;
750 
751 			if (idx == pipe->buffers - 1)
752 				idx = -1;
753 
754 			ret = pipe_buf_confirm(pipe, buf);
755 			if (unlikely(ret)) {
756 				if (ret == -ENODATA)
757 					ret = 0;
758 				goto done;
759 			}
760 
761 			array[n].bv_page = buf->page;
762 			array[n].bv_len = this_len;
763 			array[n].bv_offset = buf->offset;
764 			left -= this_len;
765 		}
766 
767 		iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n,
768 			      sd.total_len - left);
769 		ret = vfs_iter_write(out, &from, &sd.pos);
770 		if (ret <= 0)
771 			break;
772 
773 		sd.num_spliced += ret;
774 		sd.total_len -= ret;
775 		*ppos = sd.pos;
776 
777 		/* dismiss the fully eaten buffers, adjust the partial one */
778 		while (ret) {
779 			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
780 			if (ret >= buf->len) {
781 				ret -= buf->len;
782 				buf->len = 0;
783 				pipe_buf_release(pipe, buf);
784 				pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
785 				pipe->nrbufs--;
786 				if (pipe->files)
787 					sd.need_wakeup = true;
788 			} else {
789 				buf->offset += ret;
790 				buf->len -= ret;
791 				ret = 0;
792 			}
793 		}
794 	}
795 done:
796 	kfree(array);
797 	splice_from_pipe_end(pipe, &sd);
798 
799 	pipe_unlock(pipe);
800 
801 	if (sd.num_spliced)
802 		ret = sd.num_spliced;
803 
804 	return ret;
805 }
806 
807 EXPORT_SYMBOL(iter_file_splice_write);
808 
809 static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
810 			  struct splice_desc *sd)
811 {
812 	int ret;
813 	void *data;
814 	loff_t tmp = sd->pos;
815 
816 	data = kmap(buf->page);
817 	ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);
818 	kunmap(buf->page);
819 
820 	return ret;
821 }
822 
823 static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
824 					 struct file *out, loff_t *ppos,
825 					 size_t len, unsigned int flags)
826 {
827 	ssize_t ret;
828 
829 	ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
830 	if (ret > 0)
831 		*ppos += ret;
832 
833 	return ret;
834 }
835 
836 /**
837  * generic_splice_sendpage - splice data from a pipe to a socket
838  * @pipe:	pipe to splice from
839  * @out:	socket to write to
840  * @ppos:	position in @out
841  * @len:	number of bytes to splice
842  * @flags:	splice modifier flags
843  *
844  * Description:
845  *    Will send @len bytes from the pipe to a network socket. No data copying
846  *    is involved.
847  *
848  */
849 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
850 				loff_t *ppos, size_t len, unsigned int flags)
851 {
852 	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
853 }
854 
855 EXPORT_SYMBOL(generic_splice_sendpage);
856 
857 /*
858  * Attempt to initiate a splice from pipe to file.
859  */
860 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
861 			   loff_t *ppos, size_t len, unsigned int flags)
862 {
863 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
864 				loff_t *, size_t, unsigned int);
865 
866 	if (out->f_op->splice_write)
867 		splice_write = out->f_op->splice_write;
868 	else
869 		splice_write = default_file_splice_write;
870 
871 	return splice_write(pipe, out, ppos, len, flags);
872 }
873 
874 /*
875  * Attempt to initiate a splice from a file to a pipe.
876  */
877 static long do_splice_to(struct file *in, loff_t *ppos,
878 			 struct pipe_inode_info *pipe, size_t len,
879 			 unsigned int flags)
880 {
881 	ssize_t (*splice_read)(struct file *, loff_t *,
882 			       struct pipe_inode_info *, size_t, unsigned int);
883 	int ret;
884 
885 	if (unlikely(!(in->f_mode & FMODE_READ)))
886 		return -EBADF;
887 
888 	ret = rw_verify_area(READ, in, ppos, len);
889 	if (unlikely(ret < 0))
890 		return ret;
891 
892 	if (unlikely(len > MAX_RW_COUNT))
893 		len = MAX_RW_COUNT;
894 
895 	if (in->f_op->splice_read)
896 		splice_read = in->f_op->splice_read;
897 	else
898 		splice_read = default_file_splice_read;
899 
900 	return splice_read(in, ppos, pipe, len, flags);
901 }
902 
/**
 * splice_direct_to_actor - splices data directly between two non-pipes
 * @in:		file to splice from
 * @sd:		actor information on where to splice to
 * @actor:	handles the data splicing
 *
 * Description:
 *    This is a special case helper to splice directly between two
 *    points, without requiring an explicit pipe. Internally an allocated
 *    pipe is cached in the process (current->splice_pipe), and reused on
 *    later calls.
 *
 *    Returns the number of bytes the actor consumed in total.  If that
 *    is zero, the last status from do_splice_to() or @actor is returned
 *    instead (zero or -errno).  -EINVAL is returned when @in is neither
 *    a regular file nor a block device, -ENOMEM when the internal pipe
 *    cannot be allocated.  @sd->pos is left at the input position up to
 *    which data was consumed by @actor.
 *
 */
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
			       splice_direct_actor *actor)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	umode_t i_mode;
	size_t len;
	int i, flags, more;

	/*
	 * We require the input being a regular file (or a block device),
	 * as we don't want to randomly drop data for eg socket -> socket
	 * splicing. Use the piped splicing for that!
	 */
	i_mode = file_inode(in)->i_mode;
	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
		return -EINVAL;

	/*
	 * neither in nor out is a pipe, setup an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	ret = 0;
	bytes = 0;
	len = sd->total_len;
	/* Keep the caller's flags (including SPLICE_F_NONBLOCK) for the input side. */
	flags = sd->flags;

	/*
	 * Don't block on output, we have to drain the direct pipe.
	 */
	sd->flags &= ~SPLICE_F_NONBLOCK;
	/* Remember whether the caller asked for SPLICE_F_MORE itself. */
	more = sd->flags & SPLICE_F_MORE;

	while (len) {
		size_t read_len;
		loff_t pos = sd->pos, prev_pos = pos;

		/* Fill the internal pipe from the input file. */
		ret = do_splice_to(in, &pos, pipe, len, flags);
		if (unlikely(ret <= 0))
			goto out_release;

		read_len = ret;
		/* The actor is asked to consume exactly what was just read. */
		sd->total_len = read_len;

		/*
		 * If more data is pending, set SPLICE_F_MORE
		 * If this is the last data and SPLICE_F_MORE was not set
		 * initially, clears it.
		 */
		if (read_len < len)
			sd->flags |= SPLICE_F_MORE;
		else if (!more)
			sd->flags &= ~SPLICE_F_MORE;
		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = actor(pipe, sd);
		if (unlikely(ret <= 0)) {
			/* Nothing was consumed: rewind to where this round started. */
			sd->pos = prev_pos;
			goto out_release;
		}

		bytes += ret;
		len -= ret;
		sd->pos = pos;

		/* Short output: only account for the part that was consumed. */
		if (ret < read_len) {
			sd->pos = prev_pos + ret;
			goto out_release;
		}
	}

done:
	/* Reset the internal pipe to empty before returning. */
	pipe->nrbufs = pipe->curbuf = 0;
	file_accessed(in);
	return bytes;

out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < pipe->buffers; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;

		if (buf->ops)
			pipe_buf_release(pipe, buf);
	}

	/* Nothing transferred at all: report the last status instead. */
	if (!bytes)
		bytes = ret;

	goto done;
}
EXPORT_SYMBOL(splice_direct_to_actor);
1032 
1033 static int direct_splice_actor(struct pipe_inode_info *pipe,
1034 			       struct splice_desc *sd)
1035 {
1036 	struct file *file = sd->u.file;
1037 
1038 	return do_splice_from(pipe, file, sd->opos, sd->total_len,
1039 			      sd->flags);
1040 }
1041 
1042 /**
1043  * do_splice_direct - splices data directly between two files
1044  * @in:		file to splice from
1045  * @ppos:	input file offset
1046  * @out:	file to splice to
1047  * @opos:	output file offset
1048  * @len:	number of bytes to splice
1049  * @flags:	splice modifier flags
1050  *
1051  * Description:
1052  *    For use by do_sendfile(). splice can easily emulate sendfile, but
1053  *    doing it in the application would incur an extra system call
1054  *    (splice in + splice out, as compared to just sendfile()). So this helper
1055  *    can splice directly through a process-private pipe.
1056  *
1057  */
1058 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1059 		      loff_t *opos, size_t len, unsigned int flags)
1060 {
1061 	struct splice_desc sd = {
1062 		.len		= len,
1063 		.total_len	= len,
1064 		.flags		= flags,
1065 		.pos		= *ppos,
1066 		.u.file		= out,
1067 		.opos		= opos,
1068 	};
1069 	long ret;
1070 
1071 	if (unlikely(!(out->f_mode & FMODE_WRITE)))
1072 		return -EBADF;
1073 
1074 	if (unlikely(out->f_flags & O_APPEND))
1075 		return -EINVAL;
1076 
1077 	ret = rw_verify_area(WRITE, out, opos, len);
1078 	if (unlikely(ret < 0))
1079 		return ret;
1080 
1081 	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1082 	if (ret > 0)
1083 		*ppos = sd.pos;
1084 
1085 	return ret;
1086 }
1087 EXPORT_SYMBOL(do_splice_direct);
1088 
1089 static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
1090 {
1091 	for (;;) {
1092 		if (unlikely(!pipe->readers)) {
1093 			send_sig(SIGPIPE, current, 0);
1094 			return -EPIPE;
1095 		}
1096 		if (pipe->nrbufs != pipe->buffers)
1097 			return 0;
1098 		if (flags & SPLICE_F_NONBLOCK)
1099 			return -EAGAIN;
1100 		if (signal_pending(current))
1101 			return -ERESTARTSYS;
1102 		pipe->waiting_writers++;
1103 		pipe_wait(pipe);
1104 		pipe->waiting_writers--;
1105 	}
1106 }
1107 
/*
 * Forward declaration: do_splice() below calls this when both ends are
 * pipes.
 */
static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
			       struct pipe_inode_info *opipe,
			       size_t len, unsigned int flags);
1111 
1112 /*
1113  * Determine where to splice to/from.
1114  */
1115 static long do_splice(struct file *in, loff_t __user *off_in,
1116 		      struct file *out, loff_t __user *off_out,
1117 		      size_t len, unsigned int flags)
1118 {
1119 	struct pipe_inode_info *ipipe;
1120 	struct pipe_inode_info *opipe;
1121 	loff_t offset;
1122 	long ret;
1123 
1124 	ipipe = get_pipe_info(in);
1125 	opipe = get_pipe_info(out);
1126 
1127 	if (ipipe && opipe) {
1128 		if (off_in || off_out)
1129 			return -ESPIPE;
1130 
1131 		if (!(in->f_mode & FMODE_READ))
1132 			return -EBADF;
1133 
1134 		if (!(out->f_mode & FMODE_WRITE))
1135 			return -EBADF;
1136 
1137 		/* Splicing to self would be fun, but... */
1138 		if (ipipe == opipe)
1139 			return -EINVAL;
1140 
1141 		return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1142 	}
1143 
1144 	if (ipipe) {
1145 		if (off_in)
1146 			return -ESPIPE;
1147 		if (off_out) {
1148 			if (!(out->f_mode & FMODE_PWRITE))
1149 				return -EINVAL;
1150 			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1151 				return -EFAULT;
1152 		} else {
1153 			offset = out->f_pos;
1154 		}
1155 
1156 		if (unlikely(!(out->f_mode & FMODE_WRITE)))
1157 			return -EBADF;
1158 
1159 		if (unlikely(out->f_flags & O_APPEND))
1160 			return -EINVAL;
1161 
1162 		ret = rw_verify_area(WRITE, out, &offset, len);
1163 		if (unlikely(ret < 0))
1164 			return ret;
1165 
1166 		file_start_write(out);
1167 		ret = do_splice_from(ipipe, out, &offset, len, flags);
1168 		file_end_write(out);
1169 
1170 		if (!off_out)
1171 			out->f_pos = offset;
1172 		else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
1173 			ret = -EFAULT;
1174 
1175 		return ret;
1176 	}
1177 
1178 	if (opipe) {
1179 		if (off_out)
1180 			return -ESPIPE;
1181 		if (off_in) {
1182 			if (!(in->f_mode & FMODE_PREAD))
1183 				return -EINVAL;
1184 			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1185 				return -EFAULT;
1186 		} else {
1187 			offset = in->f_pos;
1188 		}
1189 
1190 		pipe_lock(opipe);
1191 		ret = wait_for_space(opipe, flags);
1192 		if (!ret)
1193 			ret = do_splice_to(in, &offset, opipe, len, flags);
1194 		pipe_unlock(opipe);
1195 		if (ret > 0)
1196 			wakeup_pipe_readers(opipe);
1197 		if (!off_in)
1198 			in->f_pos = offset;
1199 		else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
1200 			ret = -EFAULT;
1201 
1202 		return ret;
1203 	}
1204 
1205 	return -EINVAL;
1206 }
1207 
1208 static int iter_to_pipe(struct iov_iter *from,
1209 			struct pipe_inode_info *pipe,
1210 			unsigned flags)
1211 {
1212 	struct pipe_buffer buf = {
1213 		.ops = &user_page_pipe_buf_ops,
1214 		.flags = flags
1215 	};
1216 	size_t total = 0;
1217 	int ret = 0;
1218 	bool failed = false;
1219 
1220 	while (iov_iter_count(from) && !failed) {
1221 		struct page *pages[16];
1222 		ssize_t copied;
1223 		size_t start;
1224 		int n;
1225 
1226 		copied = iov_iter_get_pages(from, pages, ~0UL, 16, &start);
1227 		if (copied <= 0) {
1228 			ret = copied;
1229 			break;
1230 		}
1231 
1232 		for (n = 0; copied; n++, start = 0) {
1233 			int size = min_t(int, copied, PAGE_SIZE - start);
1234 			if (!failed) {
1235 				buf.page = pages[n];
1236 				buf.offset = start;
1237 				buf.len = size;
1238 				ret = add_to_pipe(pipe, &buf);
1239 				if (unlikely(ret < 0)) {
1240 					failed = true;
1241 				} else {
1242 					iov_iter_advance(from, ret);
1243 					total += ret;
1244 				}
1245 			} else {
1246 				put_page(pages[n]);
1247 			}
1248 			copied -= size;
1249 		}
1250 	}
1251 	return total ? total : ret;
1252 }
1253 
1254 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1255 			struct splice_desc *sd)
1256 {
1257 	int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1258 	return n == sd->len ? n : -EFAULT;
1259 }
1260 
1261 /*
1262  * For lack of a better implementation, implement vmsplice() to userspace
1263  * as a simple copy of the pipes pages to the user iov.
1264  */
1265 static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
1266 			     unsigned long nr_segs, unsigned int flags)
1267 {
1268 	struct pipe_inode_info *pipe;
1269 	struct splice_desc sd;
1270 	long ret;
1271 	struct iovec iovstack[UIO_FASTIOV];
1272 	struct iovec *iov = iovstack;
1273 	struct iov_iter iter;
1274 
1275 	pipe = get_pipe_info(file);
1276 	if (!pipe)
1277 		return -EBADF;
1278 
1279 	ret = import_iovec(READ, uiov, nr_segs,
1280 			   ARRAY_SIZE(iovstack), &iov, &iter);
1281 	if (ret < 0)
1282 		return ret;
1283 
1284 	sd.total_len = iov_iter_count(&iter);
1285 	sd.len = 0;
1286 	sd.flags = flags;
1287 	sd.u.data = &iter;
1288 	sd.pos = 0;
1289 
1290 	if (sd.total_len) {
1291 		pipe_lock(pipe);
1292 		ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1293 		pipe_unlock(pipe);
1294 	}
1295 
1296 	kfree(iov);
1297 	return ret;
1298 }
1299 
1300 /*
1301  * vmsplice splices a user address range into a pipe. It can be thought of
1302  * as splice-from-memory, where the regular splice is splice-from-file (or
1303  * to file). In both cases the output is a pipe, naturally.
1304  */
1305 static long vmsplice_to_pipe(struct file *file, const struct iovec __user *uiov,
1306 			     unsigned long nr_segs, unsigned int flags)
1307 {
1308 	struct pipe_inode_info *pipe;
1309 	struct iovec iovstack[UIO_FASTIOV];
1310 	struct iovec *iov = iovstack;
1311 	struct iov_iter from;
1312 	long ret;
1313 	unsigned buf_flag = 0;
1314 
1315 	if (flags & SPLICE_F_GIFT)
1316 		buf_flag = PIPE_BUF_FLAG_GIFT;
1317 
1318 	pipe = get_pipe_info(file);
1319 	if (!pipe)
1320 		return -EBADF;
1321 
1322 	ret = import_iovec(WRITE, uiov, nr_segs,
1323 			   ARRAY_SIZE(iovstack), &iov, &from);
1324 	if (ret < 0)
1325 		return ret;
1326 
1327 	pipe_lock(pipe);
1328 	ret = wait_for_space(pipe, flags);
1329 	if (!ret)
1330 		ret = iter_to_pipe(&from, pipe, buf_flag);
1331 	pipe_unlock(pipe);
1332 	if (ret > 0)
1333 		wakeup_pipe_readers(pipe);
1334 	kfree(iov);
1335 	return ret;
1336 }
1337 
1338 /*
1339  * Note that vmsplice only really supports true splicing _from_ user memory
1340  * to a pipe, not the other way around. Splicing from user memory is a simple
1341  * operation that can be supported without any funky alignment restrictions
1342  * or nasty vm tricks. We simply map in the user memory and fill them into
1343  * a pipe. The reverse isn't quite as easy, though. There are two possible
1344  * solutions for that:
1345  *
1346  *	- memcpy() the data internally, at which point we might as well just
1347  *	  do a regular read() on the buffer anyway.
1348  *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
1349  *	  has restriction limitations on both ends of the pipe).
1350  *
1351  * Currently we punt and implement it as a normal copy, see pipe_to_user().
1352  *
1353  */
1354 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
1355 		unsigned long, nr_segs, unsigned int, flags)
1356 {
1357 	struct fd f;
1358 	long error;
1359 
1360 	if (unlikely(nr_segs > UIO_MAXIOV))
1361 		return -EINVAL;
1362 	else if (unlikely(!nr_segs))
1363 		return 0;
1364 
1365 	error = -EBADF;
1366 	f = fdget(fd);
1367 	if (f.file) {
1368 		if (f.file->f_mode & FMODE_WRITE)
1369 			error = vmsplice_to_pipe(f.file, iov, nr_segs, flags);
1370 		else if (f.file->f_mode & FMODE_READ)
1371 			error = vmsplice_to_user(f.file, iov, nr_segs, flags);
1372 
1373 		fdput(f);
1374 	}
1375 
1376 	return error;
1377 }
1378 
#ifdef CONFIG_COMPAT
/*
 * Compat entry point for vmsplice().
 *
 * Each struct compat_iovec in @iov32 is read field by field and written
 * out again as a native struct iovec into a user-space scratch area
 * obtained from compat_alloc_user_space(). The native array is then
 * passed to sys_vmsplice().
 *
 * Returns -EINVAL for more than UIO_MAXIOV segments, -EFAULT if any user
 * access fails, otherwise the sys_vmsplice() result.
 */
COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32,
		    unsigned int, nr_segs, unsigned int, flags)
{
	struct iovec __user *native;
	unsigned idx;

	if (nr_segs > UIO_MAXIOV)
		return -EINVAL;

	native = compat_alloc_user_space(nr_segs * sizeof(struct iovec));

	for (idx = 0; idx < nr_segs; idx++) {
		struct compat_iovec ent;

		/* Read the 32-bit entry ... */
		if (get_user(ent.iov_base, &iov32[idx].iov_base))
			return -EFAULT;
		if (get_user(ent.iov_len, &iov32[idx].iov_len))
			return -EFAULT;

		/* ... and store it widened into the native array. */
		if (put_user(compat_ptr(ent.iov_base), &native[idx].iov_base))
			return -EFAULT;
		if (put_user(ent.iov_len, &native[idx].iov_len))
			return -EFAULT;
	}

	return sys_vmsplice(fd, native, nr_segs, flags);
}
#endif
1399 
1400 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1401 		int, fd_out, loff_t __user *, off_out,
1402 		size_t, len, unsigned int, flags)
1403 {
1404 	struct fd in, out;
1405 	long error;
1406 
1407 	if (unlikely(!len))
1408 		return 0;
1409 
1410 	error = -EBADF;
1411 	in = fdget(fd_in);
1412 	if (in.file) {
1413 		if (in.file->f_mode & FMODE_READ) {
1414 			out = fdget(fd_out);
1415 			if (out.file) {
1416 				if (out.file->f_mode & FMODE_WRITE)
1417 					error = do_splice(in.file, off_in,
1418 							  out.file, off_out,
1419 							  len, flags);
1420 				fdput(out);
1421 			}
1422 		}
1423 		fdput(in);
1424 	}
1425 	return error;
1426 }
1427 
1428 /*
1429  * Make sure there's data to read. Wait for input if we can, otherwise
1430  * return an appropriate error.
1431  */
1432 static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1433 {
1434 	int ret;
1435 
1436 	/*
1437 	 * Check ->nrbufs without the inode lock first. This function
1438 	 * is speculative anyways, so missing one is ok.
1439 	 */
1440 	if (pipe->nrbufs)
1441 		return 0;
1442 
1443 	ret = 0;
1444 	pipe_lock(pipe);
1445 
1446 	while (!pipe->nrbufs) {
1447 		if (signal_pending(current)) {
1448 			ret = -ERESTARTSYS;
1449 			break;
1450 		}
1451 		if (!pipe->writers)
1452 			break;
1453 		if (!pipe->waiting_writers) {
1454 			if (flags & SPLICE_F_NONBLOCK) {
1455 				ret = -EAGAIN;
1456 				break;
1457 			}
1458 		}
1459 		pipe_wait(pipe);
1460 	}
1461 
1462 	pipe_unlock(pipe);
1463 	return ret;
1464 }
1465 
1466 /*
1467  * Make sure there's writeable room. Wait for room if we can, otherwise
1468  * return an appropriate error.
1469  */
1470 static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1471 {
1472 	int ret;
1473 
1474 	/*
1475 	 * Check ->nrbufs without the inode lock first. This function
1476 	 * is speculative anyways, so missing one is ok.
1477 	 */
1478 	if (pipe->nrbufs < pipe->buffers)
1479 		return 0;
1480 
1481 	ret = 0;
1482 	pipe_lock(pipe);
1483 
1484 	while (pipe->nrbufs >= pipe->buffers) {
1485 		if (!pipe->readers) {
1486 			send_sig(SIGPIPE, current, 0);
1487 			ret = -EPIPE;
1488 			break;
1489 		}
1490 		if (flags & SPLICE_F_NONBLOCK) {
1491 			ret = -EAGAIN;
1492 			break;
1493 		}
1494 		if (signal_pending(current)) {
1495 			ret = -ERESTARTSYS;
1496 			break;
1497 		}
1498 		pipe->waiting_writers++;
1499 		pipe_wait(pipe);
1500 		pipe->waiting_writers--;
1501 	}
1502 
1503 	pipe_unlock(pipe);
1504 	return ret;
1505 }
1506 
1507 /*
1508  * Splice contents of ipipe to opipe.
1509  */
1510 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1511 			       struct pipe_inode_info *opipe,
1512 			       size_t len, unsigned int flags)
1513 {
1514 	struct pipe_buffer *ibuf, *obuf;
1515 	int ret = 0, nbuf;
1516 	bool input_wakeup = false;
1517 
1518 
1519 retry:
1520 	ret = ipipe_prep(ipipe, flags);
1521 	if (ret)
1522 		return ret;
1523 
1524 	ret = opipe_prep(opipe, flags);
1525 	if (ret)
1526 		return ret;
1527 
1528 	/*
1529 	 * Potential ABBA deadlock, work around it by ordering lock
1530 	 * grabbing by pipe info address. Otherwise two different processes
1531 	 * could deadlock (one doing tee from A -> B, the other from B -> A).
1532 	 */
1533 	pipe_double_lock(ipipe, opipe);
1534 
1535 	do {
1536 		if (!opipe->readers) {
1537 			send_sig(SIGPIPE, current, 0);
1538 			if (!ret)
1539 				ret = -EPIPE;
1540 			break;
1541 		}
1542 
1543 		if (!ipipe->nrbufs && !ipipe->writers)
1544 			break;
1545 
1546 		/*
1547 		 * Cannot make any progress, because either the input
1548 		 * pipe is empty or the output pipe is full.
1549 		 */
1550 		if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
1551 			/* Already processed some buffers, break */
1552 			if (ret)
1553 				break;
1554 
1555 			if (flags & SPLICE_F_NONBLOCK) {
1556 				ret = -EAGAIN;
1557 				break;
1558 			}
1559 
1560 			/*
1561 			 * We raced with another reader/writer and haven't
1562 			 * managed to process any buffers.  A zero return
1563 			 * value means EOF, so retry instead.
1564 			 */
1565 			pipe_unlock(ipipe);
1566 			pipe_unlock(opipe);
1567 			goto retry;
1568 		}
1569 
1570 		ibuf = ipipe->bufs + ipipe->curbuf;
1571 		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1572 		obuf = opipe->bufs + nbuf;
1573 
1574 		if (len >= ibuf->len) {
1575 			/*
1576 			 * Simply move the whole buffer from ipipe to opipe
1577 			 */
1578 			*obuf = *ibuf;
1579 			ibuf->ops = NULL;
1580 			opipe->nrbufs++;
1581 			ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
1582 			ipipe->nrbufs--;
1583 			input_wakeup = true;
1584 		} else {
1585 			/*
1586 			 * Get a reference to this pipe buffer,
1587 			 * so we can copy the contents over.
1588 			 */
1589 			pipe_buf_get(ipipe, ibuf);
1590 			*obuf = *ibuf;
1591 
1592 			/*
1593 			 * Don't inherit the gift flag, we need to
1594 			 * prevent multiple steals of this page.
1595 			 */
1596 			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1597 
1598 			obuf->len = len;
1599 			opipe->nrbufs++;
1600 			ibuf->offset += obuf->len;
1601 			ibuf->len -= obuf->len;
1602 		}
1603 		ret += obuf->len;
1604 		len -= obuf->len;
1605 	} while (len);
1606 
1607 	pipe_unlock(ipipe);
1608 	pipe_unlock(opipe);
1609 
1610 	/*
1611 	 * If we put data in the output pipe, wakeup any potential readers.
1612 	 */
1613 	if (ret > 0)
1614 		wakeup_pipe_readers(opipe);
1615 
1616 	if (input_wakeup)
1617 		wakeup_pipe_writers(ipipe);
1618 
1619 	return ret;
1620 }
1621 
1622 /*
1623  * Link contents of ipipe to opipe.
1624  */
1625 static int link_pipe(struct pipe_inode_info *ipipe,
1626 		     struct pipe_inode_info *opipe,
1627 		     size_t len, unsigned int flags)
1628 {
1629 	struct pipe_buffer *ibuf, *obuf;
1630 	int ret = 0, i = 0, nbuf;
1631 
1632 	/*
1633 	 * Potential ABBA deadlock, work around it by ordering lock
1634 	 * grabbing by pipe info address. Otherwise two different processes
1635 	 * could deadlock (one doing tee from A -> B, the other from B -> A).
1636 	 */
1637 	pipe_double_lock(ipipe, opipe);
1638 
1639 	do {
1640 		if (!opipe->readers) {
1641 			send_sig(SIGPIPE, current, 0);
1642 			if (!ret)
1643 				ret = -EPIPE;
1644 			break;
1645 		}
1646 
1647 		/*
1648 		 * If we have iterated all input buffers or ran out of
1649 		 * output room, break.
1650 		 */
1651 		if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
1652 			break;
1653 
1654 		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
1655 		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1656 
1657 		/*
1658 		 * Get a reference to this pipe buffer,
1659 		 * so we can copy the contents over.
1660 		 */
1661 		pipe_buf_get(ipipe, ibuf);
1662 
1663 		obuf = opipe->bufs + nbuf;
1664 		*obuf = *ibuf;
1665 
1666 		/*
1667 		 * Don't inherit the gift flag, we need to
1668 		 * prevent multiple steals of this page.
1669 		 */
1670 		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1671 
1672 		if (obuf->len > len)
1673 			obuf->len = len;
1674 
1675 		opipe->nrbufs++;
1676 		ret += obuf->len;
1677 		len -= obuf->len;
1678 		i++;
1679 	} while (len);
1680 
1681 	/*
1682 	 * return EAGAIN if we have the potential of some data in the
1683 	 * future, otherwise just return 0
1684 	 */
1685 	if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
1686 		ret = -EAGAIN;
1687 
1688 	pipe_unlock(ipipe);
1689 	pipe_unlock(opipe);
1690 
1691 	/*
1692 	 * If we put data in the output pipe, wakeup any potential readers.
1693 	 */
1694 	if (ret > 0)
1695 		wakeup_pipe_readers(opipe);
1696 
1697 	return ret;
1698 }
1699 
1700 /*
1701  * This is a tee(1) implementation that works on pipes. It doesn't copy
1702  * any data, it simply references the 'in' pages on the 'out' pipe.
1703  * The 'flags' used are the SPLICE_F_* variants, currently the only
1704  * applicable one is SPLICE_F_NONBLOCK.
1705  */
1706 static long do_tee(struct file *in, struct file *out, size_t len,
1707 		   unsigned int flags)
1708 {
1709 	struct pipe_inode_info *ipipe = get_pipe_info(in);
1710 	struct pipe_inode_info *opipe = get_pipe_info(out);
1711 	int ret = -EINVAL;
1712 
1713 	/*
1714 	 * Duplicate the contents of ipipe to opipe without actually
1715 	 * copying the data.
1716 	 */
1717 	if (ipipe && opipe && ipipe != opipe) {
1718 		/*
1719 		 * Keep going, unless we encounter an error. The ipipe/opipe
1720 		 * ordering doesn't really matter.
1721 		 */
1722 		ret = ipipe_prep(ipipe, flags);
1723 		if (!ret) {
1724 			ret = opipe_prep(opipe, flags);
1725 			if (!ret)
1726 				ret = link_pipe(ipipe, opipe, len, flags);
1727 		}
1728 	}
1729 
1730 	return ret;
1731 }
1732 
1733 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1734 {
1735 	struct fd in;
1736 	int error;
1737 
1738 	if (unlikely(!len))
1739 		return 0;
1740 
1741 	error = -EBADF;
1742 	in = fdget(fdin);
1743 	if (in.file) {
1744 		if (in.file->f_mode & FMODE_READ) {
1745 			struct fd out = fdget(fdout);
1746 			if (out.file) {
1747 				if (out.file->f_mode & FMODE_WRITE)
1748 					error = do_tee(in.file, out.file,
1749 							len, flags);
1750 				fdput(out);
1751 			}
1752 		}
1753  		fdput(in);
1754  	}
1755 
1756 	return error;
1757 }
1758