xref: /openbmc/linux/fs/splice.c (revision c51d39010a1bccc9c1294e2d7c00005aefeb2b5c)
1 /*
2  * "splice": joining two ropes together by interweaving their strands.
3  *
4  * This is the "extended pipe" functionality, where a pipe is used as
5  * an arbitrary in-memory buffer. Think of a pipe as a small kernel
6  * buffer that you can use to transfer data from one end to the other.
7  *
8  * The traditional unix read/write is extended with a "splice()" operation
9  * that transfers data buffers to or from a pipe buffer.
10  *
11  * Named by Larry McVoy, original implementation from Linus, extended by
12  * Jens to support splicing to files, network, direct splicing, etc and
13  * fixing lots of bugs.
14  *
15  * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
16  * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
17  * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
18  *
19  */
20 #include <linux/fs.h>
21 #include <linux/file.h>
22 #include <linux/pagemap.h>
23 #include <linux/splice.h>
24 #include <linux/memcontrol.h>
25 #include <linux/mm_inline.h>
26 #include <linux/swap.h>
27 #include <linux/writeback.h>
28 #include <linux/export.h>
29 #include <linux/syscalls.h>
30 #include <linux/uio.h>
31 #include <linux/security.h>
32 #include <linux/gfp.h>
33 #include <linux/socket.h>
34 #include <linux/compat.h>
35 #include "internal.h"
36 
/*
 * Attempt to steal a page from a pipe buffer. This should perhaps go into
 * a vm helper function, it's already simplified quite a bit by the
 * addition of remove_mapping(). If success is returned, the caller may
 * attempt to reuse this page for another destination.
 *
 * Returns 0 on success with the page locked, removed from its mapping and
 * PIPE_BUF_FLAG_LRU set; returns 1 on failure with the page unlocked and
 * still in the page cache.
 */
static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	struct address_space *mapping;

	lock_page(page);

	/* A NULL mapping means the page got truncated while we waited. */
	mapping = page_mapping(page);
	if (mapping) {
		WARN_ON(!PageUptodate(page));

		/*
		 * At least for ext2 with nobh option, we need to wait on
		 * writeback completing on this page, since we'll remove it
		 * from the pagecache.  Otherwise truncate won't wait on the
		 * page, allowing the disk blocks to be reused by someone else
		 * before we actually wrote our data to them. fs corruption
		 * ensues.
		 */
		wait_on_page_writeback(page);

		/* Private (buffer-head etc.) data must be dropped first. */
		if (page_has_private(page) &&
		    !try_to_release_page(page, GFP_KERNEL))
			goto out_unlock;

		/*
		 * If we succeeded in removing the mapping, set LRU flag
		 * and return good.
		 */
		if (remove_mapping(mapping, page)) {
			buf->flags |= PIPE_BUF_FLAG_LRU;
			return 0;
		}
	}

	/*
	 * Raced with truncate or failed to remove page from current
	 * address space, unlock and return failure.
	 */
out_unlock:
	unlock_page(page);
	return 1;
}
87 
88 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
89 					struct pipe_buffer *buf)
90 {
91 	put_page(buf->page);
92 	buf->flags &= ~PIPE_BUF_FLAG_LRU;
93 }
94 
/*
 * Check whether the contents of buf is OK to access. Since the content
 * is a page cache page, IO may be in flight.
 *
 * Returns 0 if the page is uptodate, -ENODATA if it was truncated while
 * in the pipe, or -EIO on a read error. The page is unlocked in all cases.
 */
static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
				       struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	int err;

	if (!PageUptodate(page)) {
		/* Lock the page to serialize against IO completion/truncate. */
		lock_page(page);

		/*
		 * Page got truncated/unhashed. This will cause a 0-byte
		 * splice, if this is the first page.
		 */
		if (!page->mapping) {
			err = -ENODATA;
			goto error;
		}

		/*
		 * Uh oh, read-error from disk.
		 */
		if (!PageUptodate(page)) {
			err = -EIO;
			goto error;
		}

		/*
		 * Page is ok afterall, we are done.
		 */
		unlock_page(page);
	}

	return 0;
error:
	unlock_page(page);
	return err;
}
136 
/*
 * Operations for pipe buffers whose pages come straight from the page
 * cache (used by generic_file_splice_read() and friends). Merging new
 * writes into such a buffer is never allowed since the page is shared
 * with the file's mapping.
 */
const struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.can_merge = 0,
	.confirm = page_cache_pipe_buf_confirm,	/* may wait for read IO */
	.release = page_cache_pipe_buf_release,
	.steal = page_cache_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};
144 
145 static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe,
146 				    struct pipe_buffer *buf)
147 {
148 	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
149 		return 1;
150 
151 	buf->flags |= PIPE_BUF_FLAG_LRU;
152 	return generic_pipe_buf_steal(pipe, buf);
153 }
154 
/*
 * Operations for pipe buffers backed by user pages (vmsplice). Stealing
 * is restricted to gifted pages, see user_page_pipe_buf_steal().
 */
static const struct pipe_buf_operations user_page_pipe_buf_ops = {
	.can_merge = 0,
	.confirm = generic_pipe_buf_confirm,
	.release = page_cache_pipe_buf_release,
	.steal = user_page_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};
162 
/*
 * Wake up any readers sleeping on the pipe and notify async watchers.
 * The full barrier orders the caller's pipe state updates before the
 * lockless waitqueue_active() check.
 */
static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
{
	smp_mb();
	if (waitqueue_active(&pipe->wait))
		wake_up_interruptible(&pipe->wait);
	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
}
170 
/**
 * splice_to_pipe - fill passed data into a pipe
 * @pipe:	pipe to fill
 * @spd:	data to fill
 *
 * Description:
 *    @spd contains a map of pages and len/offset tuples, along with
 *    the struct pipe_buf_operations associated with these pages. This
 *    function will link that data to the pipe.
 *
 *    Returns the number of bytes linked into the pipe, -EPIPE if the
 *    pipe has no readers, or -EAGAIN if the pipe was already full.
 *    Pages that could not be linked are handed to @spd->spd_release.
 */
ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
		       struct splice_pipe_desc *spd)
{
	unsigned int spd_pages = spd->nr_pages;
	int ret = 0, page_nr = 0;

	if (!spd_pages)
		return 0;

	if (unlikely(!pipe->readers)) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	while (pipe->nrbufs < pipe->buffers) {
		/* ring index; pipe->buffers is a power of two, so mask wraps */
		int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
		struct pipe_buffer *buf = pipe->bufs + newbuf;

		buf->page = spd->pages[page_nr];
		buf->offset = spd->partial[page_nr].offset;
		buf->len = spd->partial[page_nr].len;
		buf->private = spd->partial[page_nr].private;
		buf->ops = spd->ops;

		pipe->nrbufs++;
		page_nr++;
		ret += buf->len;

		if (!--spd->nr_pages)
			break;
	}

	/* pipe was full before anything could be added */
	if (!ret)
		ret = -EAGAIN;

out:
	/* release the pages we did not manage to link into the pipe */
	while (page_nr < spd_pages)
		spd->spd_release(spd, page_nr++);

	return ret;
}
EXPORT_SYMBOL_GPL(splice_to_pipe);
225 
226 ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
227 {
228 	int ret;
229 
230 	if (unlikely(!pipe->readers)) {
231 		send_sig(SIGPIPE, current, 0);
232 		ret = -EPIPE;
233 	} else if (pipe->nrbufs == pipe->buffers) {
234 		ret = -EAGAIN;
235 	} else {
236 		int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
237 		pipe->bufs[newbuf] = *buf;
238 		pipe->nrbufs++;
239 		return buf->len;
240 	}
241 	pipe_buf_release(pipe, buf);
242 	return ret;
243 }
244 EXPORT_SYMBOL(add_to_pipe);
245 
/* Default ->spd_release() callback: drop one reference to page @i. */
void spd_release_page(struct splice_pipe_desc *spd, unsigned int i)
{
	put_page(spd->pages[i]);
}
250 
251 /*
252  * Check if we need to grow the arrays holding pages and partial page
253  * descriptions.
254  */
255 int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
256 {
257 	unsigned int buffers = ACCESS_ONCE(pipe->buffers);
258 
259 	spd->nr_pages_max = buffers;
260 	if (buffers <= PIPE_DEF_BUFFERS)
261 		return 0;
262 
263 	spd->pages = kmalloc(buffers * sizeof(struct page *), GFP_KERNEL);
264 	spd->partial = kmalloc(buffers * sizeof(struct partial_page), GFP_KERNEL);
265 
266 	if (spd->pages && spd->partial)
267 		return 0;
268 
269 	kfree(spd->pages);
270 	kfree(spd->partial);
271 	return -ENOMEM;
272 }
273 
274 void splice_shrink_spd(struct splice_pipe_desc *spd)
275 {
276 	if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
277 		return;
278 
279 	kfree(spd->pages);
280 	kfree(spd->partial);
281 }
282 
/**
 * generic_file_splice_read - splice data from file to a pipe
 * @in:		file to splice from
 * @ppos:	position in @in
 * @pipe:	pipe to splice to
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will read pages from given file and fill them into a pipe. Can be
 *    used as long as it has more or less sane ->read_iter().
 *
 */
ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
				 struct pipe_inode_info *pipe, size_t len,
				 unsigned int flags)
{
	struct iov_iter to;
	struct kiocb kiocb;
	int idx, ret;

	/* Pipe-backed iterator: ->read_iter() output lands in the pipe. */
	iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len);
	idx = to.idx;	/* remember the starting slot for error rollback */
	init_sync_kiocb(&kiocb, in);
	kiocb.ki_pos = *ppos;
	ret = in->f_op->read_iter(&kiocb, &to);
	if (ret > 0) {
		*ppos = kiocb.ki_pos;
		file_accessed(in);
	} else if (ret < 0) {
		/* rewind the iterator so the partial output is discarded */
		to.idx = idx;
		to.iov_offset = 0;
		iov_iter_advance(&to, 0); /* to free what was emitted */
		/*
		 * callers of ->splice_read() expect -EAGAIN on
		 * "can't put anything in there", rather than -EFAULT.
		 */
		if (ret == -EFAULT)
			ret = -EAGAIN;
	}

	return ret;
}
EXPORT_SYMBOL(generic_file_splice_read);
327 
/*
 * Generic pipe buffer ops for buffers backed by plain kernel pages
 * (used by the default splice_read fallback path, among others).
 */
const struct pipe_buf_operations default_pipe_buf_ops = {
	.can_merge = 0,
	.confirm = generic_pipe_buf_confirm,
	.release = generic_pipe_buf_release,
	.steal = generic_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};
335 
/*
 * ->steal() implementation that always refuses: the page can never be
 * taken over by the caller (returning 1 signals failure).
 */
static int generic_pipe_buf_nosteal(struct pipe_inode_info *pipe,
				    struct pipe_buffer *buf)
{
	return 1;
}
341 
/*
 * Pipe buffer operations for a socket and similar: identical to the
 * default ops except that pages may never be stolen.
 */
const struct pipe_buf_operations nosteal_pipe_buf_ops = {
	.can_merge = 0,
	.confirm = generic_pipe_buf_confirm,
	.release = generic_pipe_buf_release,
	.steal = generic_pipe_buf_nosteal,
	.get = generic_pipe_buf_get,
};
EXPORT_SYMBOL(nosteal_pipe_buf_ops);
351 
/*
 * Read into kernel-space buffers via vfs_readv(). The user/kernel
 * address limit is temporarily lifted with set_fs() so the kernel kvec
 * pointers pass the usual access checks, then restored before return.
 */
static ssize_t kernel_readv(struct file *file, const struct kvec *vec,
			    unsigned long vlen, loff_t offset)
{
	mm_segment_t old_fs;
	loff_t pos = offset;
	ssize_t res;

	old_fs = get_fs();
	set_fs(get_ds());
	/* The cast to a user pointer is valid due to the set_fs() */
	res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos, 0);
	set_fs(old_fs);

	return res;
}
367 
/*
 * Write a kernel-space buffer to @file at @pos via vfs_write(), lifting
 * the address limit for the duration (see kernel_readv() above).
 * Returns the number of bytes written or a negative errno.
 */
ssize_t kernel_write(struct file *file, const char *buf, size_t count,
			    loff_t pos)
{
	mm_segment_t old_fs;
	ssize_t res;

	old_fs = get_fs();
	set_fs(get_ds());
	/* The cast to a user pointer is valid due to the set_fs() */
	res = vfs_write(file, (__force const char __user *)buf, count, &pos);
	set_fs(old_fs);

	return res;
}
EXPORT_SYMBOL(kernel_write);
383 
/*
 * Fallback ->splice_read() for files without a usable read_iter-based
 * path: allocate pipe pages up front, read into them with kernel_readv(),
 * then let the iterator trim/discard whatever was not filled.
 */
static ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
				 struct pipe_inode_info *pipe, size_t len,
				 unsigned int flags)
{
	struct kvec *vec, __vec[PIPE_DEF_BUFFERS];
	struct iov_iter to;
	struct page **pages;
	unsigned int nr_pages;
	size_t offset, dummy, copied = 0;
	ssize_t res;
	int i;

	if (pipe->nrbufs == pipe->buffers)
		return -EAGAIN;

	/*
	 * Try to keep page boundaries matching to source pagecache ones -
	 * it probably won't be much help, but...
	 */
	offset = *ppos & ~PAGE_MASK;

	iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len + offset);

	/* Allocate and pin the pipe pages the read will fill. */
	res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &dummy);
	if (res <= 0)
		return -ENOMEM;

	BUG_ON(dummy);
	nr_pages = DIV_ROUND_UP(res, PAGE_SIZE);

	/* Use the on-stack kvec array unless the pipe is larger. */
	vec = __vec;
	if (nr_pages > PIPE_DEF_BUFFERS) {
		vec = kmalloc(nr_pages * sizeof(struct kvec), GFP_KERNEL);
		if (unlikely(!vec)) {
			res = -ENOMEM;
			goto out;
		}
	}

	/* First buffer starts mid-page; shift it to honour the alignment. */
	pipe->bufs[to.idx].offset = offset;
	pipe->bufs[to.idx].len -= offset;

	/* Build a kvec covering len bytes starting at offset. */
	for (i = 0; i < nr_pages; i++) {
		size_t this_len = min_t(size_t, len, PAGE_SIZE - offset);
		vec[i].iov_base = page_address(pages[i]) + offset;
		vec[i].iov_len = this_len;
		len -= this_len;
		offset = 0;	/* only the first page is offset */
	}

	res = kernel_readv(in, vec, nr_pages, *ppos);
	if (res > 0) {
		copied = res;
		*ppos += res;
	}

	if (vec != __vec)
		kfree(vec);
out:
	/* Drop the extra page references from iov_iter_get_pages_alloc(). */
	for (i = 0; i < nr_pages; i++)
		put_page(pages[i]);
	kvfree(pages);
	iov_iter_advance(&to, copied);	/* truncates and discards */
	return res;
}
449 
/*
 * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
 * using sendpage(). Return the number of bytes sent, or a negative errno
 * (-EINVAL if the target has no ->sendpage()).
 */
static int pipe_to_sendpage(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, struct splice_desc *sd)
{
	struct file *file = sd->u.file;
	loff_t pos = sd->pos;
	int more;

	if (!likely(file->f_op->sendpage))
		return -EINVAL;

	more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;

	/* More pipe data pending: tell the protocol not to push yet. */
	if (sd->len < sd->total_len && pipe->nrbufs > 1)
		more |= MSG_SENDPAGE_NOTLAST;

	return file->f_op->sendpage(file, buf->page, buf->offset,
				    sd->len, &pos, more);
}
472 
/*
 * Wake up any writers sleeping on the pipe and notify async watchers.
 * The barrier orders the caller's pipe state updates before the
 * lockless waitqueue_active() check (mirrors wakeup_pipe_readers()).
 */
static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
{
	smp_mb();
	if (waitqueue_active(&pipe->wait))
		wake_up_interruptible(&pipe->wait);
	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
}
480 
/**
 * splice_from_pipe_feed - feed available data from a pipe to a file
 * @pipe:	pipe to splice from
 * @sd:		information to @actor
 * @actor:	handler that splices the data
 *
 * Description:
 *    This function loops over the pipe and calls @actor to do the
 *    actual moving of a single struct pipe_buffer to the desired
 *    destination.  It returns when there's no more buffers left in
 *    the pipe or if the requested number of bytes (@sd->total_len)
 *    have been copied.  It returns a positive number (one) if the
 *    pipe needs to be filled with more data, zero if the required
 *    number of bytes have been copied and -errno on error.
 *
 *    This, together with splice_from_pipe_{begin,end,next}, may be
 *    used to implement the functionality of __splice_from_pipe() when
 *    locking is required around copying the pipe buffers to the
 *    destination.
 */
static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
			  splice_actor *actor)
{
	int ret;

	while (pipe->nrbufs) {
		struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;

		/* Never offer the actor more than was requested. */
		sd->len = buf->len;
		if (sd->len > sd->total_len)
			sd->len = sd->total_len;

		/* Make sure the buffer contents are ready (IO completed). */
		ret = pipe_buf_confirm(pipe, buf);
		if (unlikely(ret)) {
			if (ret == -ENODATA)
				ret = 0;	/* truncated page, not an error */
			return ret;
		}

		ret = actor(pipe, buf, sd);
		if (ret <= 0)
			return ret;

		/* The actor consumed 'ret' bytes; advance every cursor. */
		buf->offset += ret;
		buf->len -= ret;

		sd->num_spliced += ret;
		sd->len -= ret;
		sd->pos += ret;
		sd->total_len -= ret;

		/* Buffer fully consumed: release it and advance the ring. */
		if (!buf->len) {
			pipe_buf_release(pipe, buf);
			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
			pipe->nrbufs--;
			if (pipe->files)
				sd->need_wakeup = true;
		}

		if (!sd->total_len)
			return 0;
	}

	return 1;
}
546 
/**
 * splice_from_pipe_next - wait for some data to splice from
 * @pipe:	pipe to splice from
 * @sd:		information about the splice operation
 *
 * Description:
 *    This function will wait for some data and return a positive
 *    value (one) if pipe buffers are available.  It will return zero
 *    or -errno if no more data needs to be spliced.
 */
static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
	/*
	 * Check for signal early to make process killable when there are
	 * always buffers available
	 */
	if (signal_pending(current))
		return -ERESTARTSYS;

	while (!pipe->nrbufs) {
		/* No writers left: nothing more will ever arrive. */
		if (!pipe->writers)
			return 0;

		/* We already moved something and nobody is mid-write: done. */
		if (!pipe->waiting_writers && sd->num_spliced)
			return 0;

		if (sd->flags & SPLICE_F_NONBLOCK)
			return -EAGAIN;

		if (signal_pending(current))
			return -ERESTARTSYS;

		/* Let blocked writers refill the pipe before we sleep. */
		if (sd->need_wakeup) {
			wakeup_pipe_writers(pipe);
			sd->need_wakeup = false;
		}

		pipe_wait(pipe);
	}

	return 1;
}
589 
/**
 * splice_from_pipe_begin - start splicing from pipe
 * @sd:		information about the splice operation
 *
 * Description:
 *    This function should be called before a loop containing
 *    splice_from_pipe_next() and splice_from_pipe_feed() to
 *    initialize the necessary fields of @sd.
 */
static void splice_from_pipe_begin(struct splice_desc *sd)
{
	sd->num_spliced = 0;
	sd->need_wakeup = false;
}
604 
/**
 * splice_from_pipe_end - finish splicing from pipe
 * @pipe:	pipe to splice from
 * @sd:		information about the splice operation
 *
 * Description:
 *    This function will wake up pipe writers if necessary.  It should
 *    be called after a loop containing splice_from_pipe_next() and
 *    splice_from_pipe_feed().
 */
static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
{
	if (sd->need_wakeup)
		wakeup_pipe_writers(pipe);
}
620 
621 /**
622  * __splice_from_pipe - splice data from a pipe to given actor
623  * @pipe:	pipe to splice from
624  * @sd:		information to @actor
625  * @actor:	handler that splices the data
626  *
627  * Description:
628  *    This function does little more than loop over the pipe and call
629  *    @actor to do the actual moving of a single struct pipe_buffer to
630  *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
631  *    pipe_to_user.
632  *
633  */
634 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
635 			   splice_actor *actor)
636 {
637 	int ret;
638 
639 	splice_from_pipe_begin(sd);
640 	do {
641 		cond_resched();
642 		ret = splice_from_pipe_next(pipe, sd);
643 		if (ret > 0)
644 			ret = splice_from_pipe_feed(pipe, sd, actor);
645 	} while (ret > 0);
646 	splice_from_pipe_end(pipe, sd);
647 
648 	return sd->num_spliced ? sd->num_spliced : ret;
649 }
650 EXPORT_SYMBOL(__splice_from_pipe);
651 
/**
 * splice_from_pipe - splice data from a pipe to a file
 * @pipe:	pipe to splice from
 * @out:	file to splice to
 * @ppos:	position in @out
 * @len:	how many bytes to splice
 * @flags:	splice modifier flags
 * @actor:	handler that splices the data
 *
 * Description:
 *    See __splice_from_pipe. This function locks the pipe inode,
 *    otherwise it's identical to __splice_from_pipe().
 *
 */
ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags,
			 splice_actor *actor)
{
	ssize_t ret;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};

	pipe_lock(pipe);
	ret = __splice_from_pipe(pipe, &sd, actor);
	pipe_unlock(pipe);

	return ret;
}
684 
/**
 * iter_file_splice_write - splice data from a pipe to a file
 * @pipe:	pipe info
 * @out:	file to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file.
 *    This one is ->write_iter-based.
 *
 */
ssize_t
iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
			  loff_t *ppos, size_t len, unsigned int flags)
{
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};
	int nbufs = pipe->buffers;
	struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
					GFP_KERNEL);
	ssize_t ret;

	if (unlikely(!array))
		return -ENOMEM;

	pipe_lock(pipe);

	splice_from_pipe_begin(&sd);
	while (sd.total_len) {
		struct iov_iter from;
		size_t left;
		int n, idx;

		/* Wait for data (or discover there is nothing left to do). */
		ret = splice_from_pipe_next(pipe, &sd);
		if (ret <= 0)
			break;

		/* The pipe may have been resized while we slept; regrow. */
		if (unlikely(nbufs < pipe->buffers)) {
			kfree(array);
			nbufs = pipe->buffers;
			array = kcalloc(nbufs, sizeof(struct bio_vec),
					GFP_KERNEL);
			if (!array) {
				ret = -ENOMEM;
				break;
			}
		}

		/* build the vector */
		left = sd.total_len;
		for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) {
			struct pipe_buffer *buf = pipe->bufs + idx;
			size_t this_len = buf->len;

			if (this_len > left)
				this_len = left;

			/* wrap the ring index for the next iteration */
			if (idx == pipe->buffers - 1)
				idx = -1;

			ret = pipe_buf_confirm(pipe, buf);
			if (unlikely(ret)) {
				if (ret == -ENODATA)
					ret = 0;
				goto done;
			}

			array[n].bv_page = buf->page;
			array[n].bv_len = this_len;
			array[n].bv_offset = buf->offset;
			left -= this_len;
		}

		/* Write the whole batch with a single ->write_iter() call. */
		iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n,
			      sd.total_len - left);
		ret = vfs_iter_write(out, &from, &sd.pos);
		if (ret <= 0)
			break;

		sd.num_spliced += ret;
		sd.total_len -= ret;
		*ppos = sd.pos;

		/* dismiss the fully eaten buffers, adjust the partial one */
		while (ret) {
			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
			if (ret >= buf->len) {
				ret -= buf->len;
				buf->len = 0;
				pipe_buf_release(pipe, buf);
				pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
				pipe->nrbufs--;
				if (pipe->files)
					sd.need_wakeup = true;
			} else {
				buf->offset += ret;
				buf->len -= ret;
				ret = 0;
			}
		}
	}
done:
	kfree(array);
	splice_from_pipe_end(pipe, &sd);

	pipe_unlock(pipe);

	/* Report bytes moved if any; otherwise propagate the error. */
	if (sd.num_spliced)
		ret = sd.num_spliced;

	return ret;
}

EXPORT_SYMBOL(iter_file_splice_write);
806 
/*
 * splice_actor that writes one pipe buffer to a file with
 * __kernel_write(), mapping the page into kernel address space for the
 * duration of the copy. Returns bytes written or a negative errno.
 */
static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			  struct splice_desc *sd)
{
	int ret;
	void *data;
	loff_t tmp = sd->pos;

	data = kmap(buf->page);
	ret = __kernel_write(sd->u.file, data + buf->offset, sd->len, &tmp);
	kunmap(buf->page);

	return ret;
}
820 
821 static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
822 					 struct file *out, loff_t *ppos,
823 					 size_t len, unsigned int flags)
824 {
825 	ssize_t ret;
826 
827 	ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
828 	if (ret > 0)
829 		*ppos += ret;
830 
831 	return ret;
832 }
833 
/**
 * generic_splice_sendpage - splice data from a pipe to a socket
 * @pipe:	pipe to splice from
 * @out:	socket to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will send @len bytes from the pipe to a network socket. No data copying
 *    is involved.
 *
 */
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
				loff_t *ppos, size_t len, unsigned int flags)
{
	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_splice_sendpage);
854 
855 /*
856  * Attempt to initiate a splice from pipe to file.
857  */
858 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
859 			   loff_t *ppos, size_t len, unsigned int flags)
860 {
861 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
862 				loff_t *, size_t, unsigned int);
863 
864 	if (out->f_op->splice_write)
865 		splice_write = out->f_op->splice_write;
866 	else
867 		splice_write = default_file_splice_write;
868 
869 	return splice_write(pipe, out, ppos, len, flags);
870 }
871 
872 /*
873  * Attempt to initiate a splice from a file to a pipe.
874  */
875 static long do_splice_to(struct file *in, loff_t *ppos,
876 			 struct pipe_inode_info *pipe, size_t len,
877 			 unsigned int flags)
878 {
879 	ssize_t (*splice_read)(struct file *, loff_t *,
880 			       struct pipe_inode_info *, size_t, unsigned int);
881 	int ret;
882 
883 	if (unlikely(!(in->f_mode & FMODE_READ)))
884 		return -EBADF;
885 
886 	ret = rw_verify_area(READ, in, ppos, len);
887 	if (unlikely(ret < 0))
888 		return ret;
889 
890 	if (unlikely(len > MAX_RW_COUNT))
891 		len = MAX_RW_COUNT;
892 
893 	if (in->f_op->splice_read)
894 		splice_read = in->f_op->splice_read;
895 	else
896 		splice_read = default_file_splice_read;
897 
898 	return splice_read(in, ppos, pipe, len, flags);
899 }
900 
/**
 * splice_direct_to_actor - splices data directly between two non-pipes
 * @in:		file to splice from
 * @sd:		actor information on where to splice to
 * @actor:	handles the data splicing
 *
 * Description:
 *    This is a special case helper to splice directly between two
 *    points, without requiring an explicit pipe. Internally an allocated
 *    pipe is cached in the process, and reused during the lifetime of
 *    that process.
 *
 */
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
			       splice_direct_actor *actor)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	umode_t i_mode;
	size_t len;
	int i, flags, more;

	/*
	 * We require the input being a regular file, as we don't want to
	 * randomly drop data for eg socket -> socket splicing. Use the
	 * piped splicing for that!
	 */
	i_mode = file_inode(in)->i_mode;
	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
		return -EINVAL;

	/*
	 * neither in nor out is a pipe, setup an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		/* cache the pipe on the task for reuse by later calls */
		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	ret = 0;
	bytes = 0;
	len = sd->total_len;
	flags = sd->flags;

	/*
	 * Don't block on output, we have to drain the direct pipe.
	 */
	sd->flags &= ~SPLICE_F_NONBLOCK;
	more = sd->flags & SPLICE_F_MORE;

	while (len) {
		size_t read_len;
		loff_t pos = sd->pos, prev_pos = pos;

		/* Fill the internal pipe from the input file. */
		ret = do_splice_to(in, &pos, pipe, len, flags);
		if (unlikely(ret <= 0))
			goto out_release;

		read_len = ret;
		sd->total_len = read_len;

		/*
		 * If more data is pending, set SPLICE_F_MORE
		 * If this is the last data and SPLICE_F_MORE was not set
		 * initially, clears it.
		 */
		if (read_len < len)
			sd->flags |= SPLICE_F_MORE;
		else if (!more)
			sd->flags &= ~SPLICE_F_MORE;
		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = actor(pipe, sd);
		if (unlikely(ret <= 0)) {
			/* nothing written: rewind the input position */
			sd->pos = prev_pos;
			goto out_release;
		}

		bytes += ret;
		len -= ret;
		sd->pos = pos;

		/* short write: rewind input past only what was consumed */
		if (ret < read_len) {
			sd->pos = prev_pos + ret;
			goto out_release;
		}
	}

done:
	/* reset the (cached) pipe to empty for the next user */
	pipe->nrbufs = pipe->curbuf = 0;
	file_accessed(in);
	return bytes;

out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < pipe->buffers; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;

		if (buf->ops)
			pipe_buf_release(pipe, buf);
	}

	/* report partial progress in preference to the error */
	if (!bytes)
		bytes = ret;

	goto done;
}
EXPORT_SYMBOL(splice_direct_to_actor);
1030 
/*
 * Actor for splice_direct_to_actor()/do_splice_direct(): drains the
 * internal pipe into the destination file at *sd->opos.
 */
static int direct_splice_actor(struct pipe_inode_info *pipe,
			       struct splice_desc *sd)
{
	struct file *file = sd->u.file;

	return do_splice_from(pipe, file, sd->opos, sd->total_len,
			      sd->flags);
}
1039 
/**
 * do_splice_direct - splices data directly between two files
 * @in:		file to splice from
 * @ppos:	input file offset
 * @out:	file to splice to
 * @opos:	output file offset
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    For use by do_sendfile(). splice can easily emulate sendfile, but
 *    doing it in the application would incur an extra system call
 *    (splice in + splice out, as compared to just sendfile()). So this helper
 *    can splice directly through a process-private pipe.
 *
 *    Returns the number of bytes spliced, or a negative errno. On
 *    success *ppos is advanced past the consumed input.
 */
long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
		      loff_t *opos, size_t len, unsigned int flags)
{
	struct splice_desc sd = {
		.len		= len,
		.total_len	= len,
		.flags		= flags,
		.pos		= *ppos,
		.u.file		= out,
		.opos		= opos,
	};
	long ret;

	if (unlikely(!(out->f_mode & FMODE_WRITE)))
		return -EBADF;

	/* splicing to an explicit offset makes no sense with O_APPEND */
	if (unlikely(out->f_flags & O_APPEND))
		return -EINVAL;

	ret = rw_verify_area(WRITE, out, opos, len);
	if (unlikely(ret < 0))
		return ret;

	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
	if (ret > 0)
		*ppos = sd.pos;

	return ret;
}
EXPORT_SYMBOL(do_splice_direct);
1086 
1087 static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
1088 {
1089 	while (pipe->nrbufs == pipe->buffers) {
1090 		if (flags & SPLICE_F_NONBLOCK)
1091 			return -EAGAIN;
1092 		if (signal_pending(current))
1093 			return -ERESTARTSYS;
1094 		pipe->waiting_writers++;
1095 		pipe_wait(pipe);
1096 		pipe->waiting_writers--;
1097 	}
1098 	return 0;
1099 }
1100 
1101 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1102 			       struct pipe_inode_info *opipe,
1103 			       size_t len, unsigned int flags);
1104 
/*
 * Determine where to splice to/from: handles pipe->pipe, pipe->file and
 * file->pipe, with the user-supplied offsets (if any) copied in/out
 * around the transfer. At least one side must be a pipe.
 */
static long do_splice(struct file *in, loff_t __user *off_in,
		      struct file *out, loff_t __user *off_out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *ipipe;
	struct pipe_inode_info *opipe;
	loff_t offset;
	long ret;

	ipipe = get_pipe_info(in);
	opipe = get_pipe_info(out);

	/* pipe -> pipe */
	if (ipipe && opipe) {
		/* pipes have no file position, offsets are meaningless */
		if (off_in || off_out)
			return -ESPIPE;

		if (!(in->f_mode & FMODE_READ))
			return -EBADF;

		if (!(out->f_mode & FMODE_WRITE))
			return -EBADF;

		/* Splicing to self would be fun, but... */
		if (ipipe == opipe)
			return -EINVAL;

		return splice_pipe_to_pipe(ipipe, opipe, len, flags);
	}

	/* pipe -> file */
	if (ipipe) {
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			if (!(out->f_mode & FMODE_PWRITE))
				return -EINVAL;
			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
				return -EFAULT;
		} else {
			offset = out->f_pos;
		}

		if (unlikely(!(out->f_mode & FMODE_WRITE)))
			return -EBADF;

		if (unlikely(out->f_flags & O_APPEND))
			return -EINVAL;

		ret = rw_verify_area(WRITE, out, &offset, len);
		if (unlikely(ret < 0))
			return ret;

		file_start_write(out);
		ret = do_splice_from(ipipe, out, &offset, len, flags);
		file_end_write(out);

		/* hand the resulting offset back to the right place */
		if (!off_out)
			out->f_pos = offset;
		else if (copy_to_user(off_out, &offset, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	/* file -> pipe */
	if (opipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			if (!(in->f_mode & FMODE_PREAD))
				return -EINVAL;
			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
				return -EFAULT;
		} else {
			offset = in->f_pos;
		}

		pipe_lock(opipe);
		ret = wait_for_space(opipe, flags);
		if (!ret)
			ret = do_splice_to(in, &offset, opipe, len, flags);
		pipe_unlock(opipe);
		if (ret > 0)
			wakeup_pipe_readers(opipe);
		if (!off_in)
			in->f_pos = offset;
		else if (copy_to_user(off_in, &offset, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	/* neither side is a pipe */
	return -EINVAL;
}
1200 
/*
 * Pin the user pages described by @from and queue them on @pipe as
 * user_page_pipe_buf_ops buffers, tagged with @flags
 * (e.g. PIPE_BUF_FLAG_GIFT).
 *
 * Returns the number of bytes queued; if nothing was queued, returns
 * the first error (or 0 on an empty iterator).  On a mid-batch failure
 * the references on the remaining already-pinned pages are dropped and
 * the partial byte count is returned.
 */
static int iter_to_pipe(struct iov_iter *from,
			struct pipe_inode_info *pipe,
			unsigned flags)
{
	struct pipe_buffer buf = {
		.ops = &user_page_pipe_buf_ops,
		.flags = flags
	};
	size_t total = 0;
	int ret = 0;
	bool failed = false;

	while (iov_iter_count(from) && !failed) {
		struct page *pages[16];
		ssize_t copied;
		size_t start;
		int n;

		/* Pin up to 16 pages; start is the offset into pages[0] */
		copied = iov_iter_get_pages(from, pages, ~0UL, 16, &start);
		if (copied <= 0) {
			ret = copied;
			break;
		}

		/* Only the first page uses start; the rest begin at 0 */
		for (n = 0; copied; n++, start = 0) {
			int size = min_t(int, copied, PAGE_SIZE - start);
			if (!failed) {
				buf.page = pages[n];
				buf.offset = start;
				buf.len = size;
				/* On success the pipe owns the page ref */
				ret = add_to_pipe(pipe, &buf);
				if (unlikely(ret < 0)) {
					failed = true;
				} else {
					iov_iter_advance(from, ret);
					total += ret;
				}
			} else {
				/* Failed earlier: drop the remaining refs */
				put_page(pages[n]);
			}
			copied -= size;
		}
	}
	return total ? total : ret;
}
1246 
1247 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1248 			struct splice_desc *sd)
1249 {
1250 	int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1251 	return n == sd->len ? n : -EFAULT;
1252 }
1253 
1254 /*
1255  * For lack of a better implementation, implement vmsplice() to userspace
1256  * as a simple copy of the pipes pages to the user iov.
1257  */
1258 static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
1259 			     unsigned long nr_segs, unsigned int flags)
1260 {
1261 	struct pipe_inode_info *pipe;
1262 	struct splice_desc sd;
1263 	long ret;
1264 	struct iovec iovstack[UIO_FASTIOV];
1265 	struct iovec *iov = iovstack;
1266 	struct iov_iter iter;
1267 
1268 	pipe = get_pipe_info(file);
1269 	if (!pipe)
1270 		return -EBADF;
1271 
1272 	ret = import_iovec(READ, uiov, nr_segs,
1273 			   ARRAY_SIZE(iovstack), &iov, &iter);
1274 	if (ret < 0)
1275 		return ret;
1276 
1277 	sd.total_len = iov_iter_count(&iter);
1278 	sd.len = 0;
1279 	sd.flags = flags;
1280 	sd.u.data = &iter;
1281 	sd.pos = 0;
1282 
1283 	if (sd.total_len) {
1284 		pipe_lock(pipe);
1285 		ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1286 		pipe_unlock(pipe);
1287 	}
1288 
1289 	kfree(iov);
1290 	return ret;
1291 }
1292 
1293 /*
1294  * vmsplice splices a user address range into a pipe. It can be thought of
1295  * as splice-from-memory, where the regular splice is splice-from-file (or
1296  * to file). In both cases the output is a pipe, naturally.
1297  */
1298 static long vmsplice_to_pipe(struct file *file, const struct iovec __user *uiov,
1299 			     unsigned long nr_segs, unsigned int flags)
1300 {
1301 	struct pipe_inode_info *pipe;
1302 	struct iovec iovstack[UIO_FASTIOV];
1303 	struct iovec *iov = iovstack;
1304 	struct iov_iter from;
1305 	long ret;
1306 	unsigned buf_flag = 0;
1307 
1308 	if (flags & SPLICE_F_GIFT)
1309 		buf_flag = PIPE_BUF_FLAG_GIFT;
1310 
1311 	pipe = get_pipe_info(file);
1312 	if (!pipe)
1313 		return -EBADF;
1314 
1315 	ret = import_iovec(WRITE, uiov, nr_segs,
1316 			   ARRAY_SIZE(iovstack), &iov, &from);
1317 	if (ret < 0)
1318 		return ret;
1319 
1320 	pipe_lock(pipe);
1321 	ret = wait_for_space(pipe, flags);
1322 	if (!ret)
1323 		ret = iter_to_pipe(&from, pipe, buf_flag);
1324 	pipe_unlock(pipe);
1325 	if (ret > 0)
1326 		wakeup_pipe_readers(pipe);
1327 	kfree(iov);
1328 	return ret;
1329 }
1330 
1331 /*
1332  * Note that vmsplice only really supports true splicing _from_ user memory
1333  * to a pipe, not the other way around. Splicing from user memory is a simple
1334  * operation that can be supported without any funky alignment restrictions
1335  * or nasty vm tricks. We simply map in the user memory and fill them into
1336  * a pipe. The reverse isn't quite as easy, though. There are two possible
1337  * solutions for that:
1338  *
1339  *	- memcpy() the data internally, at which point we might as well just
1340  *	  do a regular read() on the buffer anyway.
1341  *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
1342  *	  has restriction limitations on both ends of the pipe).
1343  *
1344  * Currently we punt and implement it as a normal copy, see pipe_to_user().
1345  *
1346  */
1347 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
1348 		unsigned long, nr_segs, unsigned int, flags)
1349 {
1350 	struct fd f;
1351 	long error;
1352 
1353 	if (unlikely(nr_segs > UIO_MAXIOV))
1354 		return -EINVAL;
1355 	else if (unlikely(!nr_segs))
1356 		return 0;
1357 
1358 	error = -EBADF;
1359 	f = fdget(fd);
1360 	if (f.file) {
1361 		if (f.file->f_mode & FMODE_WRITE)
1362 			error = vmsplice_to_pipe(f.file, iov, nr_segs, flags);
1363 		else if (f.file->f_mode & FMODE_READ)
1364 			error = vmsplice_to_user(f.file, iov, nr_segs, flags);
1365 
1366 		fdput(f);
1367 	}
1368 
1369 	return error;
1370 }
1371 
1372 #ifdef CONFIG_COMPAT
/*
 * 32-bit compat entry: widen the compat_iovec array into native iovecs
 * in compat user-space scratch area, then call the native syscall.
 */
COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32,
		    unsigned int, nr_segs, unsigned int, flags)
{
	unsigned i;
	struct iovec __user *iov;
	if (nr_segs > UIO_MAXIOV)
		return -EINVAL;
	/* nr_segs <= UIO_MAXIOV, so this multiplication cannot overflow */
	iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec));
	for (i = 0; i < nr_segs; i++) {
		struct compat_iovec v;
		if (get_user(v.iov_base, &iov32[i].iov_base) ||
		    get_user(v.iov_len, &iov32[i].iov_len) ||
		    put_user(compat_ptr(v.iov_base), &iov[i].iov_base) ||
		    put_user(v.iov_len, &iov[i].iov_len))
			return -EFAULT;
	}
	return sys_vmsplice(fd, iov, nr_segs, flags);
}
1391 #endif
1392 
1393 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1394 		int, fd_out, loff_t __user *, off_out,
1395 		size_t, len, unsigned int, flags)
1396 {
1397 	struct fd in, out;
1398 	long error;
1399 
1400 	if (unlikely(!len))
1401 		return 0;
1402 
1403 	error = -EBADF;
1404 	in = fdget(fd_in);
1405 	if (in.file) {
1406 		if (in.file->f_mode & FMODE_READ) {
1407 			out = fdget(fd_out);
1408 			if (out.file) {
1409 				if (out.file->f_mode & FMODE_WRITE)
1410 					error = do_splice(in.file, off_in,
1411 							  out.file, off_out,
1412 							  len, flags);
1413 				fdput(out);
1414 			}
1415 		}
1416 		fdput(in);
1417 	}
1418 	return error;
1419 }
1420 
1421 /*
1422  * Make sure there's data to read. Wait for input if we can, otherwise
1423  * return an appropriate error.
1424  */
1425 static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1426 {
1427 	int ret;
1428 
1429 	/*
1430 	 * Check ->nrbufs without the inode lock first. This function
1431 	 * is speculative anyways, so missing one is ok.
1432 	 */
1433 	if (pipe->nrbufs)
1434 		return 0;
1435 
1436 	ret = 0;
1437 	pipe_lock(pipe);
1438 
1439 	while (!pipe->nrbufs) {
1440 		if (signal_pending(current)) {
1441 			ret = -ERESTARTSYS;
1442 			break;
1443 		}
1444 		if (!pipe->writers)
1445 			break;
1446 		if (!pipe->waiting_writers) {
1447 			if (flags & SPLICE_F_NONBLOCK) {
1448 				ret = -EAGAIN;
1449 				break;
1450 			}
1451 		}
1452 		pipe_wait(pipe);
1453 	}
1454 
1455 	pipe_unlock(pipe);
1456 	return ret;
1457 }
1458 
1459 /*
1460  * Make sure there's writeable room. Wait for room if we can, otherwise
1461  * return an appropriate error.
1462  */
1463 static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1464 {
1465 	int ret;
1466 
1467 	/*
1468 	 * Check ->nrbufs without the inode lock first. This function
1469 	 * is speculative anyways, so missing one is ok.
1470 	 */
1471 	if (pipe->nrbufs < pipe->buffers)
1472 		return 0;
1473 
1474 	ret = 0;
1475 	pipe_lock(pipe);
1476 
1477 	while (pipe->nrbufs >= pipe->buffers) {
1478 		if (!pipe->readers) {
1479 			send_sig(SIGPIPE, current, 0);
1480 			ret = -EPIPE;
1481 			break;
1482 		}
1483 		if (flags & SPLICE_F_NONBLOCK) {
1484 			ret = -EAGAIN;
1485 			break;
1486 		}
1487 		if (signal_pending(current)) {
1488 			ret = -ERESTARTSYS;
1489 			break;
1490 		}
1491 		pipe->waiting_writers++;
1492 		pipe_wait(pipe);
1493 		pipe->waiting_writers--;
1494 	}
1495 
1496 	pipe_unlock(pipe);
1497 	return ret;
1498 }
1499 
1500 /*
1501  * Splice contents of ipipe to opipe.
1502  */
1503 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1504 			       struct pipe_inode_info *opipe,
1505 			       size_t len, unsigned int flags)
1506 {
1507 	struct pipe_buffer *ibuf, *obuf;
1508 	int ret = 0, nbuf;
1509 	bool input_wakeup = false;
1510 
1511 
1512 retry:
1513 	ret = ipipe_prep(ipipe, flags);
1514 	if (ret)
1515 		return ret;
1516 
1517 	ret = opipe_prep(opipe, flags);
1518 	if (ret)
1519 		return ret;
1520 
1521 	/*
1522 	 * Potential ABBA deadlock, work around it by ordering lock
1523 	 * grabbing by pipe info address. Otherwise two different processes
1524 	 * could deadlock (one doing tee from A -> B, the other from B -> A).
1525 	 */
1526 	pipe_double_lock(ipipe, opipe);
1527 
1528 	do {
1529 		if (!opipe->readers) {
1530 			send_sig(SIGPIPE, current, 0);
1531 			if (!ret)
1532 				ret = -EPIPE;
1533 			break;
1534 		}
1535 
1536 		if (!ipipe->nrbufs && !ipipe->writers)
1537 			break;
1538 
1539 		/*
1540 		 * Cannot make any progress, because either the input
1541 		 * pipe is empty or the output pipe is full.
1542 		 */
1543 		if (!ipipe->nrbufs || opipe->nrbufs >= opipe->buffers) {
1544 			/* Already processed some buffers, break */
1545 			if (ret)
1546 				break;
1547 
1548 			if (flags & SPLICE_F_NONBLOCK) {
1549 				ret = -EAGAIN;
1550 				break;
1551 			}
1552 
1553 			/*
1554 			 * We raced with another reader/writer and haven't
1555 			 * managed to process any buffers.  A zero return
1556 			 * value means EOF, so retry instead.
1557 			 */
1558 			pipe_unlock(ipipe);
1559 			pipe_unlock(opipe);
1560 			goto retry;
1561 		}
1562 
1563 		ibuf = ipipe->bufs + ipipe->curbuf;
1564 		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1565 		obuf = opipe->bufs + nbuf;
1566 
1567 		if (len >= ibuf->len) {
1568 			/*
1569 			 * Simply move the whole buffer from ipipe to opipe
1570 			 */
1571 			*obuf = *ibuf;
1572 			ibuf->ops = NULL;
1573 			opipe->nrbufs++;
1574 			ipipe->curbuf = (ipipe->curbuf + 1) & (ipipe->buffers - 1);
1575 			ipipe->nrbufs--;
1576 			input_wakeup = true;
1577 		} else {
1578 			/*
1579 			 * Get a reference to this pipe buffer,
1580 			 * so we can copy the contents over.
1581 			 */
1582 			pipe_buf_get(ipipe, ibuf);
1583 			*obuf = *ibuf;
1584 
1585 			/*
1586 			 * Don't inherit the gift flag, we need to
1587 			 * prevent multiple steals of this page.
1588 			 */
1589 			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1590 
1591 			obuf->len = len;
1592 			opipe->nrbufs++;
1593 			ibuf->offset += obuf->len;
1594 			ibuf->len -= obuf->len;
1595 		}
1596 		ret += obuf->len;
1597 		len -= obuf->len;
1598 	} while (len);
1599 
1600 	pipe_unlock(ipipe);
1601 	pipe_unlock(opipe);
1602 
1603 	/*
1604 	 * If we put data in the output pipe, wakeup any potential readers.
1605 	 */
1606 	if (ret > 0)
1607 		wakeup_pipe_readers(opipe);
1608 
1609 	if (input_wakeup)
1610 		wakeup_pipe_writers(ipipe);
1611 
1612 	return ret;
1613 }
1614 
1615 /*
1616  * Link contents of ipipe to opipe.
1617  */
1618 static int link_pipe(struct pipe_inode_info *ipipe,
1619 		     struct pipe_inode_info *opipe,
1620 		     size_t len, unsigned int flags)
1621 {
1622 	struct pipe_buffer *ibuf, *obuf;
1623 	int ret = 0, i = 0, nbuf;
1624 
1625 	/*
1626 	 * Potential ABBA deadlock, work around it by ordering lock
1627 	 * grabbing by pipe info address. Otherwise two different processes
1628 	 * could deadlock (one doing tee from A -> B, the other from B -> A).
1629 	 */
1630 	pipe_double_lock(ipipe, opipe);
1631 
1632 	do {
1633 		if (!opipe->readers) {
1634 			send_sig(SIGPIPE, current, 0);
1635 			if (!ret)
1636 				ret = -EPIPE;
1637 			break;
1638 		}
1639 
1640 		/*
1641 		 * If we have iterated all input buffers or ran out of
1642 		 * output room, break.
1643 		 */
1644 		if (i >= ipipe->nrbufs || opipe->nrbufs >= opipe->buffers)
1645 			break;
1646 
1647 		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (ipipe->buffers-1));
1648 		nbuf = (opipe->curbuf + opipe->nrbufs) & (opipe->buffers - 1);
1649 
1650 		/*
1651 		 * Get a reference to this pipe buffer,
1652 		 * so we can copy the contents over.
1653 		 */
1654 		pipe_buf_get(ipipe, ibuf);
1655 
1656 		obuf = opipe->bufs + nbuf;
1657 		*obuf = *ibuf;
1658 
1659 		/*
1660 		 * Don't inherit the gift flag, we need to
1661 		 * prevent multiple steals of this page.
1662 		 */
1663 		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1664 
1665 		if (obuf->len > len)
1666 			obuf->len = len;
1667 
1668 		opipe->nrbufs++;
1669 		ret += obuf->len;
1670 		len -= obuf->len;
1671 		i++;
1672 	} while (len);
1673 
1674 	/*
1675 	 * return EAGAIN if we have the potential of some data in the
1676 	 * future, otherwise just return 0
1677 	 */
1678 	if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
1679 		ret = -EAGAIN;
1680 
1681 	pipe_unlock(ipipe);
1682 	pipe_unlock(opipe);
1683 
1684 	/*
1685 	 * If we put data in the output pipe, wakeup any potential readers.
1686 	 */
1687 	if (ret > 0)
1688 		wakeup_pipe_readers(opipe);
1689 
1690 	return ret;
1691 }
1692 
1693 /*
1694  * This is a tee(1) implementation that works on pipes. It doesn't copy
1695  * any data, it simply references the 'in' pages on the 'out' pipe.
1696  * The 'flags' used are the SPLICE_F_* variants, currently the only
1697  * applicable one is SPLICE_F_NONBLOCK.
1698  */
1699 static long do_tee(struct file *in, struct file *out, size_t len,
1700 		   unsigned int flags)
1701 {
1702 	struct pipe_inode_info *ipipe = get_pipe_info(in);
1703 	struct pipe_inode_info *opipe = get_pipe_info(out);
1704 	int ret = -EINVAL;
1705 
1706 	/*
1707 	 * Duplicate the contents of ipipe to opipe without actually
1708 	 * copying the data.
1709 	 */
1710 	if (ipipe && opipe && ipipe != opipe) {
1711 		/*
1712 		 * Keep going, unless we encounter an error. The ipipe/opipe
1713 		 * ordering doesn't really matter.
1714 		 */
1715 		ret = ipipe_prep(ipipe, flags);
1716 		if (!ret) {
1717 			ret = opipe_prep(opipe, flags);
1718 			if (!ret)
1719 				ret = link_pipe(ipipe, opipe, len, flags);
1720 		}
1721 	}
1722 
1723 	return ret;
1724 }
1725 
1726 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1727 {
1728 	struct fd in;
1729 	int error;
1730 
1731 	if (unlikely(!len))
1732 		return 0;
1733 
1734 	error = -EBADF;
1735 	in = fdget(fdin);
1736 	if (in.file) {
1737 		if (in.file->f_mode & FMODE_READ) {
1738 			struct fd out = fdget(fdout);
1739 			if (out.file) {
1740 				if (out.file->f_mode & FMODE_WRITE)
1741 					error = do_tee(in.file, out.file,
1742 							len, flags);
1743 				fdput(out);
1744 			}
1745 		}
1746  		fdput(in);
1747  	}
1748 
1749 	return error;
1750 }
1751