xref: /openbmc/linux/fs/splice.c (revision 724ba6751532055db75992fc6ae21c3e322e94a7)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * "splice": joining two ropes together by interweaving their strands.
4  *
5  * This is the "extended pipe" functionality, where a pipe is used as
6  * an arbitrary in-memory buffer. Think of a pipe as a small kernel
7  * buffer that you can use to transfer data from one end to the other.
8  *
9  * The traditional unix read/write is extended with a "splice()" operation
10  * that transfers data buffers to or from a pipe buffer.
11  *
12  * Named by Larry McVoy, original implementation from Linus, extended by
13  * Jens to support splicing to files, network, direct splicing, etc and
14  * fixing lots of bugs.
15  *
16  * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk>
17  * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org>
18  * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu>
19  *
20  */
21 #include <linux/bvec.h>
22 #include <linux/fs.h>
23 #include <linux/file.h>
24 #include <linux/pagemap.h>
25 #include <linux/splice.h>
26 #include <linux/memcontrol.h>
27 #include <linux/mm_inline.h>
28 #include <linux/swap.h>
29 #include <linux/writeback.h>
30 #include <linux/export.h>
31 #include <linux/syscalls.h>
32 #include <linux/uio.h>
33 #include <linux/fsnotify.h>
34 #include <linux/security.h>
35 #include <linux/gfp.h>
36 #include <linux/socket.h>
37 #include <linux/sched/signal.h>
38 
39 #include "internal.h"
40 
41 /*
42  * Splice doesn't support FMODE_NOWAIT. Since pipes may set this flag to
43  * indicate they support non-blocking reads or writes, we must clear it
44  * here if set to avoid blocking other users of this pipe if splice is
45  * being done on it.
46  */
47 static noinline void noinline pipe_clear_nowait(struct file *file)
48 {
49 	fmode_t fmode = READ_ONCE(file->f_mode);
50 
51 	do {
52 		if (!(fmode & FMODE_NOWAIT))
53 			break;
54 	} while (!try_cmpxchg(&file->f_mode, &fmode, fmode & ~FMODE_NOWAIT));
55 }
56 
57 /*
58  * Attempt to steal a page from a pipe buffer. This should perhaps go into
59  * a vm helper function, it's already simplified quite a bit by the
60  * addition of remove_mapping(). If success is returned, the caller may
61  * attempt to reuse this page for another destination.
62  */
63 static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe,
64 		struct pipe_buffer *buf)
65 {
66 	struct folio *folio = page_folio(buf->page);
67 	struct address_space *mapping;
68 
69 	folio_lock(folio);
70 
71 	mapping = folio_mapping(folio);
72 	if (mapping) {
73 		WARN_ON(!folio_test_uptodate(folio));
74 
75 		/*
76 		 * At least for ext2 with nobh option, we need to wait on
77 		 * writeback completing on this folio, since we'll remove it
78 		 * from the pagecache.  Otherwise truncate wont wait on the
79 		 * folio, allowing the disk blocks to be reused by someone else
80 		 * before we actually wrote our data to them. fs corruption
81 		 * ensues.
82 		 */
83 		folio_wait_writeback(folio);
84 
85 		if (folio_has_private(folio) &&
86 		    !filemap_release_folio(folio, GFP_KERNEL))
87 			goto out_unlock;
88 
89 		/*
90 		 * If we succeeded in removing the mapping, set LRU flag
91 		 * and return good.
92 		 */
93 		if (remove_mapping(mapping, folio)) {
94 			buf->flags |= PIPE_BUF_FLAG_LRU;
95 			return true;
96 		}
97 	}
98 
99 	/*
100 	 * Raced with truncate or failed to remove folio from current
101 	 * address space, unlock and return failure.
102 	 */
103 out_unlock:
104 	folio_unlock(folio);
105 	return false;
106 }
107 
108 static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe,
109 					struct pipe_buffer *buf)
110 {
111 	put_page(buf->page);
112 	buf->flags &= ~PIPE_BUF_FLAG_LRU;
113 }
114 
115 /*
116  * Check whether the contents of buf is OK to access. Since the content
117  * is a page cache page, IO may be in flight.
118  */
119 static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
120 				       struct pipe_buffer *buf)
121 {
122 	struct page *page = buf->page;
123 	int err;
124 
125 	if (!PageUptodate(page)) {
126 		lock_page(page);
127 
128 		/*
129 		 * Page got truncated/unhashed. This will cause a 0-byte
130 		 * splice, if this is the first page.
131 		 */
132 		if (!page->mapping) {
133 			err = -ENODATA;
134 			goto error;
135 		}
136 
137 		/*
138 		 * Uh oh, read-error from disk.
139 		 */
140 		if (!PageUptodate(page)) {
141 			err = -EIO;
142 			goto error;
143 		}
144 
145 		/*
146 		 * Page is ok afterall, we are done.
147 		 */
148 		unlock_page(page);
149 	}
150 
151 	return 0;
152 error:
153 	unlock_page(page);
154 	return err;
155 }
156 
157 const struct pipe_buf_operations page_cache_pipe_buf_ops = {
158 	.confirm	= page_cache_pipe_buf_confirm,
159 	.release	= page_cache_pipe_buf_release,
160 	.try_steal	= page_cache_pipe_buf_try_steal,
161 	.get		= generic_pipe_buf_get,
162 };
163 
164 static bool user_page_pipe_buf_try_steal(struct pipe_inode_info *pipe,
165 		struct pipe_buffer *buf)
166 {
167 	if (!(buf->flags & PIPE_BUF_FLAG_GIFT))
168 		return false;
169 
170 	buf->flags |= PIPE_BUF_FLAG_LRU;
171 	return generic_pipe_buf_try_steal(pipe, buf);
172 }
173 
174 static const struct pipe_buf_operations user_page_pipe_buf_ops = {
175 	.release	= page_cache_pipe_buf_release,
176 	.try_steal	= user_page_pipe_buf_try_steal,
177 	.get		= generic_pipe_buf_get,
178 };
179 
180 static void wakeup_pipe_readers(struct pipe_inode_info *pipe)
181 {
182 	smp_mb();
183 	if (waitqueue_active(&pipe->rd_wait))
184 		wake_up_interruptible(&pipe->rd_wait);
185 	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
186 }
187 
188 /**
189  * splice_to_pipe - fill passed data into a pipe
190  * @pipe:	pipe to fill
191  * @spd:	data to fill
192  *
193  * Description:
194  *    @spd contains a map of pages and len/offset tuples, along with
195  *    the struct pipe_buf_operations associated with these pages. This
196  *    function will link that data to the pipe.
197  *
198  */
199 ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
200 		       struct splice_pipe_desc *spd)
201 {
202 	unsigned int spd_pages = spd->nr_pages;
203 	unsigned int tail = pipe->tail;
204 	unsigned int head = pipe->head;
205 	unsigned int mask = pipe->ring_size - 1;
206 	int ret = 0, page_nr = 0;
207 
208 	if (!spd_pages)
209 		return 0;
210 
211 	if (unlikely(!pipe->readers)) {
212 		send_sig(SIGPIPE, current, 0);
213 		ret = -EPIPE;
214 		goto out;
215 	}
216 
217 	while (!pipe_full(head, tail, pipe->max_usage)) {
218 		struct pipe_buffer *buf = &pipe->bufs[head & mask];
219 
220 		buf->page = spd->pages[page_nr];
221 		buf->offset = spd->partial[page_nr].offset;
222 		buf->len = spd->partial[page_nr].len;
223 		buf->private = spd->partial[page_nr].private;
224 		buf->ops = spd->ops;
225 		buf->flags = 0;
226 
227 		head++;
228 		pipe->head = head;
229 		page_nr++;
230 		ret += buf->len;
231 
232 		if (!--spd->nr_pages)
233 			break;
234 	}
235 
236 	if (!ret)
237 		ret = -EAGAIN;
238 
239 out:
240 	while (page_nr < spd_pages)
241 		spd->spd_release(spd, page_nr++);
242 
243 	return ret;
244 }
245 EXPORT_SYMBOL_GPL(splice_to_pipe);
246 
247 ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
248 {
249 	unsigned int head = pipe->head;
250 	unsigned int tail = pipe->tail;
251 	unsigned int mask = pipe->ring_size - 1;
252 	int ret;
253 
254 	if (unlikely(!pipe->readers)) {
255 		send_sig(SIGPIPE, current, 0);
256 		ret = -EPIPE;
257 	} else if (pipe_full(head, tail, pipe->max_usage)) {
258 		ret = -EAGAIN;
259 	} else {
260 		pipe->bufs[head & mask] = *buf;
261 		pipe->head = head + 1;
262 		return buf->len;
263 	}
264 	pipe_buf_release(pipe, buf);
265 	return ret;
266 }
267 EXPORT_SYMBOL(add_to_pipe);
268 
269 /*
270  * Check if we need to grow the arrays holding pages and partial page
271  * descriptions.
272  */
273 int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd)
274 {
275 	unsigned int max_usage = READ_ONCE(pipe->max_usage);
276 
277 	spd->nr_pages_max = max_usage;
278 	if (max_usage <= PIPE_DEF_BUFFERS)
279 		return 0;
280 
281 	spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL);
282 	spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page),
283 				     GFP_KERNEL);
284 
285 	if (spd->pages && spd->partial)
286 		return 0;
287 
288 	kfree(spd->pages);
289 	kfree(spd->partial);
290 	return -ENOMEM;
291 }
292 
293 void splice_shrink_spd(struct splice_pipe_desc *spd)
294 {
295 	if (spd->nr_pages_max <= PIPE_DEF_BUFFERS)
296 		return;
297 
298 	kfree(spd->pages);
299 	kfree(spd->partial);
300 }
301 
302 /*
303  * Splice data from an O_DIRECT file into pages and then add them to the output
304  * pipe.
305  */
306 ssize_t direct_splice_read(struct file *in, loff_t *ppos,
307 			   struct pipe_inode_info *pipe,
308 			   size_t len, unsigned int flags)
309 {
310 	struct iov_iter to;
311 	struct bio_vec *bv;
312 	struct kiocb kiocb;
313 	struct page **pages;
314 	ssize_t ret;
315 	size_t used, npages, chunk, remain, reclaim;
316 	int i;
317 
318 	/* Work out how much data we can actually add into the pipe */
319 	used = pipe_occupancy(pipe->head, pipe->tail);
320 	npages = max_t(ssize_t, pipe->max_usage - used, 0);
321 	len = min_t(size_t, len, npages * PAGE_SIZE);
322 	npages = DIV_ROUND_UP(len, PAGE_SIZE);
323 
324 	bv = kzalloc(array_size(npages, sizeof(bv[0])) +
325 		     array_size(npages, sizeof(struct page *)), GFP_KERNEL);
326 	if (!bv)
327 		return -ENOMEM;
328 
329 	pages = (void *)(bv + npages);
330 	npages = alloc_pages_bulk_array(GFP_USER, npages, pages);
331 	if (!npages) {
332 		kfree(bv);
333 		return -ENOMEM;
334 	}
335 
336 	remain = len = min_t(size_t, len, npages * PAGE_SIZE);
337 
338 	for (i = 0; i < npages; i++) {
339 		chunk = min_t(size_t, PAGE_SIZE, remain);
340 		bv[i].bv_page = pages[i];
341 		bv[i].bv_offset = 0;
342 		bv[i].bv_len = chunk;
343 		remain -= chunk;
344 	}
345 
346 	/* Do the I/O */
347 	iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
348 	init_sync_kiocb(&kiocb, in);
349 	kiocb.ki_pos = *ppos;
350 	ret = call_read_iter(in, &kiocb, &to);
351 
352 	reclaim = npages * PAGE_SIZE;
353 	remain = 0;
354 	if (ret > 0) {
355 		reclaim -= ret;
356 		remain = ret;
357 		*ppos = kiocb.ki_pos;
358 		file_accessed(in);
359 	} else if (ret < 0) {
360 		/*
361 		 * callers of ->splice_read() expect -EAGAIN on
362 		 * "can't put anything in there", rather than -EFAULT.
363 		 */
364 		if (ret == -EFAULT)
365 			ret = -EAGAIN;
366 	}
367 
368 	/* Free any pages that didn't get touched at all. */
369 	reclaim /= PAGE_SIZE;
370 	if (reclaim) {
371 		npages -= reclaim;
372 		release_pages(pages + npages, reclaim);
373 	}
374 
375 	/* Push the remaining pages into the pipe. */
376 	for (i = 0; i < npages; i++) {
377 		struct pipe_buffer *buf = pipe_head_buf(pipe);
378 
379 		chunk = min_t(size_t, remain, PAGE_SIZE);
380 		*buf = (struct pipe_buffer) {
381 			.ops	= &default_pipe_buf_ops,
382 			.page	= bv[i].bv_page,
383 			.offset	= 0,
384 			.len	= chunk,
385 		};
386 		pipe->head++;
387 		remain -= chunk;
388 	}
389 
390 	kfree(bv);
391 	return ret;
392 }
393 EXPORT_SYMBOL(direct_splice_read);
394 
395 /**
396  * generic_file_splice_read - splice data from file to a pipe
397  * @in:		file to splice from
398  * @ppos:	position in @in
399  * @pipe:	pipe to splice to
400  * @len:	number of bytes to splice
401  * @flags:	splice modifier flags
402  *
403  * Description:
404  *    Will read pages from given file and fill them into a pipe. Can be
405  *    used as long as it has more or less sane ->read_iter().
406  *
407  */
408 ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
409 				 struct pipe_inode_info *pipe, size_t len,
410 				 unsigned int flags)
411 {
412 	struct iov_iter to;
413 	struct kiocb kiocb;
414 	int ret;
415 
416 	iov_iter_pipe(&to, ITER_DEST, pipe, len);
417 	init_sync_kiocb(&kiocb, in);
418 	kiocb.ki_pos = *ppos;
419 	ret = call_read_iter(in, &kiocb, &to);
420 	if (ret > 0) {
421 		*ppos = kiocb.ki_pos;
422 		file_accessed(in);
423 	} else if (ret < 0) {
424 		/* free what was emitted */
425 		pipe_discard_from(pipe, to.start_head);
426 		/*
427 		 * callers of ->splice_read() expect -EAGAIN on
428 		 * "can't put anything in there", rather than -EFAULT.
429 		 */
430 		if (ret == -EFAULT)
431 			ret = -EAGAIN;
432 	}
433 
434 	return ret;
435 }
436 EXPORT_SYMBOL(generic_file_splice_read);
437 
438 const struct pipe_buf_operations default_pipe_buf_ops = {
439 	.release	= generic_pipe_buf_release,
440 	.try_steal	= generic_pipe_buf_try_steal,
441 	.get		= generic_pipe_buf_get,
442 };
443 
444 /* Pipe buffer operations for a socket and similar. */
445 const struct pipe_buf_operations nosteal_pipe_buf_ops = {
446 	.release	= generic_pipe_buf_release,
447 	.get		= generic_pipe_buf_get,
448 };
449 EXPORT_SYMBOL(nosteal_pipe_buf_ops);
450 
451 /*
452  * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
453  * using sendpage(). Return the number of bytes sent.
454  */
455 static int pipe_to_sendpage(struct pipe_inode_info *pipe,
456 			    struct pipe_buffer *buf, struct splice_desc *sd)
457 {
458 	struct file *file = sd->u.file;
459 	loff_t pos = sd->pos;
460 	int more;
461 
462 	if (!likely(file->f_op->sendpage))
463 		return -EINVAL;
464 
465 	more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0;
466 
467 	if (sd->len < sd->total_len &&
468 	    pipe_occupancy(pipe->head, pipe->tail) > 1)
469 		more |= MSG_SENDPAGE_NOTLAST;
470 
471 	return file->f_op->sendpage(file, buf->page, buf->offset,
472 				    sd->len, &pos, more);
473 }
474 
475 static void wakeup_pipe_writers(struct pipe_inode_info *pipe)
476 {
477 	smp_mb();
478 	if (waitqueue_active(&pipe->wr_wait))
479 		wake_up_interruptible(&pipe->wr_wait);
480 	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
481 }
482 
483 /**
484  * splice_from_pipe_feed - feed available data from a pipe to a file
485  * @pipe:	pipe to splice from
486  * @sd:		information to @actor
487  * @actor:	handler that splices the data
488  *
489  * Description:
490  *    This function loops over the pipe and calls @actor to do the
491  *    actual moving of a single struct pipe_buffer to the desired
492  *    destination.  It returns when there's no more buffers left in
493  *    the pipe or if the requested number of bytes (@sd->total_len)
494  *    have been copied.  It returns a positive number (one) if the
495  *    pipe needs to be filled with more data, zero if the required
496  *    number of bytes have been copied and -errno on error.
497  *
498  *    This, together with splice_from_pipe_{begin,end,next}, may be
499  *    used to implement the functionality of __splice_from_pipe() when
500  *    locking is required around copying the pipe buffers to the
501  *    destination.
502  */
503 static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd,
504 			  splice_actor *actor)
505 {
506 	unsigned int head = pipe->head;
507 	unsigned int tail = pipe->tail;
508 	unsigned int mask = pipe->ring_size - 1;
509 	int ret;
510 
511 	while (!pipe_empty(head, tail)) {
512 		struct pipe_buffer *buf = &pipe->bufs[tail & mask];
513 
514 		sd->len = buf->len;
515 		if (sd->len > sd->total_len)
516 			sd->len = sd->total_len;
517 
518 		ret = pipe_buf_confirm(pipe, buf);
519 		if (unlikely(ret)) {
520 			if (ret == -ENODATA)
521 				ret = 0;
522 			return ret;
523 		}
524 
525 		ret = actor(pipe, buf, sd);
526 		if (ret <= 0)
527 			return ret;
528 
529 		buf->offset += ret;
530 		buf->len -= ret;
531 
532 		sd->num_spliced += ret;
533 		sd->len -= ret;
534 		sd->pos += ret;
535 		sd->total_len -= ret;
536 
537 		if (!buf->len) {
538 			pipe_buf_release(pipe, buf);
539 			tail++;
540 			pipe->tail = tail;
541 			if (pipe->files)
542 				sd->need_wakeup = true;
543 		}
544 
545 		if (!sd->total_len)
546 			return 0;
547 	}
548 
549 	return 1;
550 }
551 
552 /* We know we have a pipe buffer, but maybe it's empty? */
553 static inline bool eat_empty_buffer(struct pipe_inode_info *pipe)
554 {
555 	unsigned int tail = pipe->tail;
556 	unsigned int mask = pipe->ring_size - 1;
557 	struct pipe_buffer *buf = &pipe->bufs[tail & mask];
558 
559 	if (unlikely(!buf->len)) {
560 		pipe_buf_release(pipe, buf);
561 		pipe->tail = tail+1;
562 		return true;
563 	}
564 
565 	return false;
566 }
567 
568 /**
569  * splice_from_pipe_next - wait for some data to splice from
570  * @pipe:	pipe to splice from
571  * @sd:		information about the splice operation
572  *
573  * Description:
574  *    This function will wait for some data and return a positive
575  *    value (one) if pipe buffers are available.  It will return zero
576  *    or -errno if no more data needs to be spliced.
577  */
578 static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_desc *sd)
579 {
580 	/*
581 	 * Check for signal early to make process killable when there are
582 	 * always buffers available
583 	 */
584 	if (signal_pending(current))
585 		return -ERESTARTSYS;
586 
587 repeat:
588 	while (pipe_empty(pipe->head, pipe->tail)) {
589 		if (!pipe->writers)
590 			return 0;
591 
592 		if (sd->num_spliced)
593 			return 0;
594 
595 		if (sd->flags & SPLICE_F_NONBLOCK)
596 			return -EAGAIN;
597 
598 		if (signal_pending(current))
599 			return -ERESTARTSYS;
600 
601 		if (sd->need_wakeup) {
602 			wakeup_pipe_writers(pipe);
603 			sd->need_wakeup = false;
604 		}
605 
606 		pipe_wait_readable(pipe);
607 	}
608 
609 	if (eat_empty_buffer(pipe))
610 		goto repeat;
611 
612 	return 1;
613 }
614 
615 /**
616  * splice_from_pipe_begin - start splicing from pipe
617  * @sd:		information about the splice operation
618  *
619  * Description:
620  *    This function should be called before a loop containing
621  *    splice_from_pipe_next() and splice_from_pipe_feed() to
622  *    initialize the necessary fields of @sd.
623  */
624 static void splice_from_pipe_begin(struct splice_desc *sd)
625 {
626 	sd->num_spliced = 0;
627 	sd->need_wakeup = false;
628 }
629 
630 /**
631  * splice_from_pipe_end - finish splicing from pipe
632  * @pipe:	pipe to splice from
633  * @sd:		information about the splice operation
634  *
635  * Description:
636  *    This function will wake up pipe writers if necessary.  It should
637  *    be called after a loop containing splice_from_pipe_next() and
638  *    splice_from_pipe_feed().
639  */
640 static void splice_from_pipe_end(struct pipe_inode_info *pipe, struct splice_desc *sd)
641 {
642 	if (sd->need_wakeup)
643 		wakeup_pipe_writers(pipe);
644 }
645 
646 /**
647  * __splice_from_pipe - splice data from a pipe to given actor
648  * @pipe:	pipe to splice from
649  * @sd:		information to @actor
650  * @actor:	handler that splices the data
651  *
652  * Description:
653  *    This function does little more than loop over the pipe and call
654  *    @actor to do the actual moving of a single struct pipe_buffer to
655  *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
656  *    pipe_to_user.
657  *
658  */
659 ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
660 			   splice_actor *actor)
661 {
662 	int ret;
663 
664 	splice_from_pipe_begin(sd);
665 	do {
666 		cond_resched();
667 		ret = splice_from_pipe_next(pipe, sd);
668 		if (ret > 0)
669 			ret = splice_from_pipe_feed(pipe, sd, actor);
670 	} while (ret > 0);
671 	splice_from_pipe_end(pipe, sd);
672 
673 	return sd->num_spliced ? sd->num_spliced : ret;
674 }
675 EXPORT_SYMBOL(__splice_from_pipe);
676 
677 /**
678  * splice_from_pipe - splice data from a pipe to a file
679  * @pipe:	pipe to splice from
680  * @out:	file to splice to
681  * @ppos:	position in @out
682  * @len:	how many bytes to splice
683  * @flags:	splice modifier flags
684  * @actor:	handler that splices the data
685  *
686  * Description:
687  *    See __splice_from_pipe. This function locks the pipe inode,
688  *    otherwise it's identical to __splice_from_pipe().
689  *
690  */
691 ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
692 			 loff_t *ppos, size_t len, unsigned int flags,
693 			 splice_actor *actor)
694 {
695 	ssize_t ret;
696 	struct splice_desc sd = {
697 		.total_len = len,
698 		.flags = flags,
699 		.pos = *ppos,
700 		.u.file = out,
701 	};
702 
703 	pipe_lock(pipe);
704 	ret = __splice_from_pipe(pipe, &sd, actor);
705 	pipe_unlock(pipe);
706 
707 	return ret;
708 }
709 
710 /**
711  * iter_file_splice_write - splice data from a pipe to a file
712  * @pipe:	pipe info
713  * @out:	file to write to
714  * @ppos:	position in @out
715  * @len:	number of bytes to splice
716  * @flags:	splice modifier flags
717  *
718  * Description:
719  *    Will either move or copy pages (determined by @flags options) from
720  *    the given pipe inode to the given file.
721  *    This one is ->write_iter-based.
722  *
723  */
724 ssize_t
725 iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
726 			  loff_t *ppos, size_t len, unsigned int flags)
727 {
728 	struct splice_desc sd = {
729 		.total_len = len,
730 		.flags = flags,
731 		.pos = *ppos,
732 		.u.file = out,
733 	};
734 	int nbufs = pipe->max_usage;
735 	struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
736 					GFP_KERNEL);
737 	ssize_t ret;
738 
739 	if (unlikely(!array))
740 		return -ENOMEM;
741 
742 	pipe_lock(pipe);
743 
744 	splice_from_pipe_begin(&sd);
745 	while (sd.total_len) {
746 		struct iov_iter from;
747 		unsigned int head, tail, mask;
748 		size_t left;
749 		int n;
750 
751 		ret = splice_from_pipe_next(pipe, &sd);
752 		if (ret <= 0)
753 			break;
754 
755 		if (unlikely(nbufs < pipe->max_usage)) {
756 			kfree(array);
757 			nbufs = pipe->max_usage;
758 			array = kcalloc(nbufs, sizeof(struct bio_vec),
759 					GFP_KERNEL);
760 			if (!array) {
761 				ret = -ENOMEM;
762 				break;
763 			}
764 		}
765 
766 		head = pipe->head;
767 		tail = pipe->tail;
768 		mask = pipe->ring_size - 1;
769 
770 		/* build the vector */
771 		left = sd.total_len;
772 		for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) {
773 			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
774 			size_t this_len = buf->len;
775 
776 			/* zero-length bvecs are not supported, skip them */
777 			if (!this_len)
778 				continue;
779 			this_len = min(this_len, left);
780 
781 			ret = pipe_buf_confirm(pipe, buf);
782 			if (unlikely(ret)) {
783 				if (ret == -ENODATA)
784 					ret = 0;
785 				goto done;
786 			}
787 
788 			bvec_set_page(&array[n], buf->page, this_len,
789 				      buf->offset);
790 			left -= this_len;
791 			n++;
792 		}
793 
794 		iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left);
795 		ret = vfs_iter_write(out, &from, &sd.pos, 0);
796 		if (ret <= 0)
797 			break;
798 
799 		sd.num_spliced += ret;
800 		sd.total_len -= ret;
801 		*ppos = sd.pos;
802 
803 		/* dismiss the fully eaten buffers, adjust the partial one */
804 		tail = pipe->tail;
805 		while (ret) {
806 			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
807 			if (ret >= buf->len) {
808 				ret -= buf->len;
809 				buf->len = 0;
810 				pipe_buf_release(pipe, buf);
811 				tail++;
812 				pipe->tail = tail;
813 				if (pipe->files)
814 					sd.need_wakeup = true;
815 			} else {
816 				buf->offset += ret;
817 				buf->len -= ret;
818 				ret = 0;
819 			}
820 		}
821 	}
822 done:
823 	kfree(array);
824 	splice_from_pipe_end(pipe, &sd);
825 
826 	pipe_unlock(pipe);
827 
828 	if (sd.num_spliced)
829 		ret = sd.num_spliced;
830 
831 	return ret;
832 }
833 
834 EXPORT_SYMBOL(iter_file_splice_write);
835 
836 /**
837  * generic_splice_sendpage - splice data from a pipe to a socket
838  * @pipe:	pipe to splice from
839  * @out:	socket to write to
840  * @ppos:	position in @out
841  * @len:	number of bytes to splice
842  * @flags:	splice modifier flags
843  *
844  * Description:
845  *    Will send @len bytes from the pipe to a network socket. No data copying
846  *    is involved.
847  *
848  */
849 ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
850 				loff_t *ppos, size_t len, unsigned int flags)
851 {
852 	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
853 }
854 
855 EXPORT_SYMBOL(generic_splice_sendpage);
856 
857 static int warn_unsupported(struct file *file, const char *op)
858 {
859 	pr_debug_ratelimited(
860 		"splice %s not supported for file %pD4 (pid: %d comm: %.20s)\n",
861 		op, file, current->pid, current->comm);
862 	return -EINVAL;
863 }
864 
865 /*
866  * Attempt to initiate a splice from pipe to file.
867  */
868 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
869 			   loff_t *ppos, size_t len, unsigned int flags)
870 {
871 	if (unlikely(!out->f_op->splice_write))
872 		return warn_unsupported(out, "write");
873 	return out->f_op->splice_write(pipe, out, ppos, len, flags);
874 }
875 
876 /*
877  * Attempt to initiate a splice from a file to a pipe.
878  */
879 static long do_splice_to(struct file *in, loff_t *ppos,
880 			 struct pipe_inode_info *pipe, size_t len,
881 			 unsigned int flags)
882 {
883 	unsigned int p_space;
884 	int ret;
885 
886 	if (unlikely(!(in->f_mode & FMODE_READ)))
887 		return -EBADF;
888 
889 	/* Don't try to read more the pipe has space for. */
890 	p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail);
891 	len = min_t(size_t, len, p_space << PAGE_SHIFT);
892 
893 	ret = rw_verify_area(READ, in, ppos, len);
894 	if (unlikely(ret < 0))
895 		return ret;
896 
897 	if (unlikely(len > MAX_RW_COUNT))
898 		len = MAX_RW_COUNT;
899 
900 	if (unlikely(!in->f_op->splice_read))
901 		return warn_unsupported(in, "read");
902 	return in->f_op->splice_read(in, ppos, pipe, len, flags);
903 }
904 
905 /**
906  * splice_direct_to_actor - splices data directly between two non-pipes
907  * @in:		file to splice from
908  * @sd:		actor information on where to splice to
909  * @actor:	handles the data splicing
910  *
911  * Description:
912  *    This is a special case helper to splice directly between two
913  *    points, without requiring an explicit pipe. Internally an allocated
914  *    pipe is cached in the process, and reused during the lifetime of
915  *    that process.
916  *
917  */
918 ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
919 			       splice_direct_actor *actor)
920 {
921 	struct pipe_inode_info *pipe;
922 	long ret, bytes;
923 	size_t len;
924 	int i, flags, more;
925 
926 	/*
927 	 * We require the input to be seekable, as we don't want to randomly
928 	 * drop data for eg socket -> socket splicing. Use the piped splicing
929 	 * for that!
930 	 */
931 	if (unlikely(!(in->f_mode & FMODE_LSEEK)))
932 		return -EINVAL;
933 
934 	/*
935 	 * neither in nor out is a pipe, setup an internal pipe attached to
936 	 * 'out' and transfer the wanted data from 'in' to 'out' through that
937 	 */
938 	pipe = current->splice_pipe;
939 	if (unlikely(!pipe)) {
940 		pipe = alloc_pipe_info();
941 		if (!pipe)
942 			return -ENOMEM;
943 
944 		/*
945 		 * We don't have an immediate reader, but we'll read the stuff
946 		 * out of the pipe right after the splice_to_pipe(). So set
947 		 * PIPE_READERS appropriately.
948 		 */
949 		pipe->readers = 1;
950 
951 		current->splice_pipe = pipe;
952 	}
953 
954 	/*
955 	 * Do the splice.
956 	 */
957 	bytes = 0;
958 	len = sd->total_len;
959 	flags = sd->flags;
960 
961 	/*
962 	 * Don't block on output, we have to drain the direct pipe.
963 	 */
964 	sd->flags &= ~SPLICE_F_NONBLOCK;
965 	more = sd->flags & SPLICE_F_MORE;
966 
967 	WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail));
968 
969 	while (len) {
970 		size_t read_len;
971 		loff_t pos = sd->pos, prev_pos = pos;
972 
973 		ret = do_splice_to(in, &pos, pipe, len, flags);
974 		if (unlikely(ret <= 0))
975 			goto out_release;
976 
977 		read_len = ret;
978 		sd->total_len = read_len;
979 
980 		/*
981 		 * If more data is pending, set SPLICE_F_MORE
982 		 * If this is the last data and SPLICE_F_MORE was not set
983 		 * initially, clears it.
984 		 */
985 		if (read_len < len)
986 			sd->flags |= SPLICE_F_MORE;
987 		else if (!more)
988 			sd->flags &= ~SPLICE_F_MORE;
989 		/*
990 		 * NOTE: nonblocking mode only applies to the input. We
991 		 * must not do the output in nonblocking mode as then we
992 		 * could get stuck data in the internal pipe:
993 		 */
994 		ret = actor(pipe, sd);
995 		if (unlikely(ret <= 0)) {
996 			sd->pos = prev_pos;
997 			goto out_release;
998 		}
999 
1000 		bytes += ret;
1001 		len -= ret;
1002 		sd->pos = pos;
1003 
1004 		if (ret < read_len) {
1005 			sd->pos = prev_pos + ret;
1006 			goto out_release;
1007 		}
1008 	}
1009 
1010 done:
1011 	pipe->tail = pipe->head = 0;
1012 	file_accessed(in);
1013 	return bytes;
1014 
1015 out_release:
1016 	/*
1017 	 * If we did an incomplete transfer we must release
1018 	 * the pipe buffers in question:
1019 	 */
1020 	for (i = 0; i < pipe->ring_size; i++) {
1021 		struct pipe_buffer *buf = &pipe->bufs[i];
1022 
1023 		if (buf->ops)
1024 			pipe_buf_release(pipe, buf);
1025 	}
1026 
1027 	if (!bytes)
1028 		bytes = ret;
1029 
1030 	goto done;
1031 }
1032 EXPORT_SYMBOL(splice_direct_to_actor);
1033 
1034 static int direct_splice_actor(struct pipe_inode_info *pipe,
1035 			       struct splice_desc *sd)
1036 {
1037 	struct file *file = sd->u.file;
1038 
1039 	return do_splice_from(pipe, file, sd->opos, sd->total_len,
1040 			      sd->flags);
1041 }
1042 
1043 /**
1044  * do_splice_direct - splices data directly between two files
1045  * @in:		file to splice from
1046  * @ppos:	input file offset
1047  * @out:	file to splice to
1048  * @opos:	output file offset
1049  * @len:	number of bytes to splice
1050  * @flags:	splice modifier flags
1051  *
1052  * Description:
1053  *    For use by do_sendfile(). splice can easily emulate sendfile, but
1054  *    doing it in the application would incur an extra system call
1055  *    (splice in + splice out, as compared to just sendfile()). So this helper
1056  *    can splice directly through a process-private pipe.
1057  *
1058  */
1059 long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
1060 		      loff_t *opos, size_t len, unsigned int flags)
1061 {
1062 	struct splice_desc sd = {
1063 		.len		= len,
1064 		.total_len	= len,
1065 		.flags		= flags,
1066 		.pos		= *ppos,
1067 		.u.file		= out,
1068 		.opos		= opos,
1069 	};
1070 	long ret;
1071 
1072 	if (unlikely(!(out->f_mode & FMODE_WRITE)))
1073 		return -EBADF;
1074 
1075 	if (unlikely(out->f_flags & O_APPEND))
1076 		return -EINVAL;
1077 
1078 	ret = rw_verify_area(WRITE, out, opos, len);
1079 	if (unlikely(ret < 0))
1080 		return ret;
1081 
1082 	ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
1083 	if (ret > 0)
1084 		*ppos = sd.pos;
1085 
1086 	return ret;
1087 }
1088 EXPORT_SYMBOL(do_splice_direct);
1089 
1090 static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
1091 {
1092 	for (;;) {
1093 		if (unlikely(!pipe->readers)) {
1094 			send_sig(SIGPIPE, current, 0);
1095 			return -EPIPE;
1096 		}
1097 		if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
1098 			return 0;
1099 		if (flags & SPLICE_F_NONBLOCK)
1100 			return -EAGAIN;
1101 		if (signal_pending(current))
1102 			return -ERESTARTSYS;
1103 		pipe_wait_writable(pipe);
1104 	}
1105 }
1106 
1107 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1108 			       struct pipe_inode_info *opipe,
1109 			       size_t len, unsigned int flags);
1110 
1111 long splice_file_to_pipe(struct file *in,
1112 			 struct pipe_inode_info *opipe,
1113 			 loff_t *offset,
1114 			 size_t len, unsigned int flags)
1115 {
1116 	long ret;
1117 
1118 	pipe_lock(opipe);
1119 	ret = wait_for_space(opipe, flags);
1120 	if (!ret)
1121 		ret = do_splice_to(in, offset, opipe, len, flags);
1122 	pipe_unlock(opipe);
1123 	if (ret > 0)
1124 		wakeup_pipe_readers(opipe);
1125 	return ret;
1126 }
1127 
1128 /*
1129  * Determine where to splice to/from.
1130  */
1131 long do_splice(struct file *in, loff_t *off_in, struct file *out,
1132 	       loff_t *off_out, size_t len, unsigned int flags)
1133 {
1134 	struct pipe_inode_info *ipipe;
1135 	struct pipe_inode_info *opipe;
1136 	loff_t offset;
1137 	long ret;
1138 
1139 	if (unlikely(!(in->f_mode & FMODE_READ) ||
1140 		     !(out->f_mode & FMODE_WRITE)))
1141 		return -EBADF;
1142 
1143 	ipipe = get_pipe_info(in, true);
1144 	opipe = get_pipe_info(out, true);
1145 
1146 	if (ipipe && opipe) {
1147 		if (off_in || off_out)
1148 			return -ESPIPE;
1149 
1150 		/* Splicing to self would be fun, but... */
1151 		if (ipipe == opipe)
1152 			return -EINVAL;
1153 
1154 		if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1155 			flags |= SPLICE_F_NONBLOCK;
1156 
1157 		return splice_pipe_to_pipe(ipipe, opipe, len, flags);
1158 	}
1159 
1160 	if (ipipe) {
1161 		if (off_in)
1162 			return -ESPIPE;
1163 		if (off_out) {
1164 			if (!(out->f_mode & FMODE_PWRITE))
1165 				return -EINVAL;
1166 			offset = *off_out;
1167 		} else {
1168 			offset = out->f_pos;
1169 		}
1170 
1171 		if (unlikely(out->f_flags & O_APPEND))
1172 			return -EINVAL;
1173 
1174 		ret = rw_verify_area(WRITE, out, &offset, len);
1175 		if (unlikely(ret < 0))
1176 			return ret;
1177 
1178 		if (in->f_flags & O_NONBLOCK)
1179 			flags |= SPLICE_F_NONBLOCK;
1180 
1181 		file_start_write(out);
1182 		ret = do_splice_from(ipipe, out, &offset, len, flags);
1183 		file_end_write(out);
1184 
1185 		if (ret > 0)
1186 			fsnotify_modify(out);
1187 
1188 		if (!off_out)
1189 			out->f_pos = offset;
1190 		else
1191 			*off_out = offset;
1192 
1193 		return ret;
1194 	}
1195 
1196 	if (opipe) {
1197 		if (off_out)
1198 			return -ESPIPE;
1199 		if (off_in) {
1200 			if (!(in->f_mode & FMODE_PREAD))
1201 				return -EINVAL;
1202 			offset = *off_in;
1203 		} else {
1204 			offset = in->f_pos;
1205 		}
1206 
1207 		if (out->f_flags & O_NONBLOCK)
1208 			flags |= SPLICE_F_NONBLOCK;
1209 
1210 		ret = splice_file_to_pipe(in, opipe, &offset, len, flags);
1211 
1212 		if (ret > 0)
1213 			fsnotify_access(in);
1214 
1215 		if (!off_in)
1216 			in->f_pos = offset;
1217 		else
1218 			*off_in = offset;
1219 
1220 		return ret;
1221 	}
1222 
1223 	return -EINVAL;
1224 }
1225 
1226 static long __do_splice(struct file *in, loff_t __user *off_in,
1227 			struct file *out, loff_t __user *off_out,
1228 			size_t len, unsigned int flags)
1229 {
1230 	struct pipe_inode_info *ipipe;
1231 	struct pipe_inode_info *opipe;
1232 	loff_t offset, *__off_in = NULL, *__off_out = NULL;
1233 	long ret;
1234 
1235 	ipipe = get_pipe_info(in, true);
1236 	opipe = get_pipe_info(out, true);
1237 
1238 	if (ipipe) {
1239 		if (off_in)
1240 			return -ESPIPE;
1241 		pipe_clear_nowait(in);
1242 	}
1243 	if (opipe) {
1244 		if (off_out)
1245 			return -ESPIPE;
1246 		pipe_clear_nowait(out);
1247 	}
1248 
1249 	if (off_out) {
1250 		if (copy_from_user(&offset, off_out, sizeof(loff_t)))
1251 			return -EFAULT;
1252 		__off_out = &offset;
1253 	}
1254 	if (off_in) {
1255 		if (copy_from_user(&offset, off_in, sizeof(loff_t)))
1256 			return -EFAULT;
1257 		__off_in = &offset;
1258 	}
1259 
1260 	ret = do_splice(in, __off_in, out, __off_out, len, flags);
1261 	if (ret < 0)
1262 		return ret;
1263 
1264 	if (__off_out && copy_to_user(off_out, __off_out, sizeof(loff_t)))
1265 		return -EFAULT;
1266 	if (__off_in && copy_to_user(off_in, __off_in, sizeof(loff_t)))
1267 		return -EFAULT;
1268 
1269 	return ret;
1270 }
1271 
1272 static int iter_to_pipe(struct iov_iter *from,
1273 			struct pipe_inode_info *pipe,
1274 			unsigned flags)
1275 {
1276 	struct pipe_buffer buf = {
1277 		.ops = &user_page_pipe_buf_ops,
1278 		.flags = flags
1279 	};
1280 	size_t total = 0;
1281 	int ret = 0;
1282 
1283 	while (iov_iter_count(from)) {
1284 		struct page *pages[16];
1285 		ssize_t left;
1286 		size_t start;
1287 		int i, n;
1288 
1289 		left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start);
1290 		if (left <= 0) {
1291 			ret = left;
1292 			break;
1293 		}
1294 
1295 		n = DIV_ROUND_UP(left + start, PAGE_SIZE);
1296 		for (i = 0; i < n; i++) {
1297 			int size = min_t(int, left, PAGE_SIZE - start);
1298 
1299 			buf.page = pages[i];
1300 			buf.offset = start;
1301 			buf.len = size;
1302 			ret = add_to_pipe(pipe, &buf);
1303 			if (unlikely(ret < 0)) {
1304 				iov_iter_revert(from, left);
1305 				// this one got dropped by add_to_pipe()
1306 				while (++i < n)
1307 					put_page(pages[i]);
1308 				goto out;
1309 			}
1310 			total += ret;
1311 			left -= size;
1312 			start = 0;
1313 		}
1314 	}
1315 out:
1316 	return total ? total : ret;
1317 }
1318 
1319 static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
1320 			struct splice_desc *sd)
1321 {
1322 	int n = copy_page_to_iter(buf->page, buf->offset, sd->len, sd->u.data);
1323 	return n == sd->len ? n : -EFAULT;
1324 }
1325 
1326 /*
1327  * For lack of a better implementation, implement vmsplice() to userspace
1328  * as a simple copy of the pipes pages to the user iov.
1329  */
1330 static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
1331 			     unsigned int flags)
1332 {
1333 	struct pipe_inode_info *pipe = get_pipe_info(file, true);
1334 	struct splice_desc sd = {
1335 		.total_len = iov_iter_count(iter),
1336 		.flags = flags,
1337 		.u.data = iter
1338 	};
1339 	long ret = 0;
1340 
1341 	if (!pipe)
1342 		return -EBADF;
1343 
1344 	pipe_clear_nowait(file);
1345 
1346 	if (sd.total_len) {
1347 		pipe_lock(pipe);
1348 		ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
1349 		pipe_unlock(pipe);
1350 	}
1351 
1352 	return ret;
1353 }
1354 
1355 /*
1356  * vmsplice splices a user address range into a pipe. It can be thought of
1357  * as splice-from-memory, where the regular splice is splice-from-file (or
1358  * to file). In both cases the output is a pipe, naturally.
1359  */
1360 static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
1361 			     unsigned int flags)
1362 {
1363 	struct pipe_inode_info *pipe;
1364 	long ret = 0;
1365 	unsigned buf_flag = 0;
1366 
1367 	if (flags & SPLICE_F_GIFT)
1368 		buf_flag = PIPE_BUF_FLAG_GIFT;
1369 
1370 	pipe = get_pipe_info(file, true);
1371 	if (!pipe)
1372 		return -EBADF;
1373 
1374 	pipe_clear_nowait(file);
1375 
1376 	pipe_lock(pipe);
1377 	ret = wait_for_space(pipe, flags);
1378 	if (!ret)
1379 		ret = iter_to_pipe(iter, pipe, buf_flag);
1380 	pipe_unlock(pipe);
1381 	if (ret > 0)
1382 		wakeup_pipe_readers(pipe);
1383 	return ret;
1384 }
1385 
1386 static int vmsplice_type(struct fd f, int *type)
1387 {
1388 	if (!f.file)
1389 		return -EBADF;
1390 	if (f.file->f_mode & FMODE_WRITE) {
1391 		*type = ITER_SOURCE;
1392 	} else if (f.file->f_mode & FMODE_READ) {
1393 		*type = ITER_DEST;
1394 	} else {
1395 		fdput(f);
1396 		return -EBADF;
1397 	}
1398 	return 0;
1399 }
1400 
1401 /*
1402  * Note that vmsplice only really supports true splicing _from_ user memory
1403  * to a pipe, not the other way around. Splicing from user memory is a simple
1404  * operation that can be supported without any funky alignment restrictions
1405  * or nasty vm tricks. We simply map in the user memory and fill them into
1406  * a pipe. The reverse isn't quite as easy, though. There are two possible
1407  * solutions for that:
1408  *
1409  *	- memcpy() the data internally, at which point we might as well just
1410  *	  do a regular read() on the buffer anyway.
1411  *	- Lots of nasty vm tricks, that are neither fast nor flexible (it
1412  *	  has restriction limitations on both ends of the pipe).
1413  *
1414  * Currently we punt and implement it as a normal copy, see pipe_to_user().
1415  *
1416  */
1417 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
1418 		unsigned long, nr_segs, unsigned int, flags)
1419 {
1420 	struct iovec iovstack[UIO_FASTIOV];
1421 	struct iovec *iov = iovstack;
1422 	struct iov_iter iter;
1423 	ssize_t error;
1424 	struct fd f;
1425 	int type;
1426 
1427 	if (unlikely(flags & ~SPLICE_F_ALL))
1428 		return -EINVAL;
1429 
1430 	f = fdget(fd);
1431 	error = vmsplice_type(f, &type);
1432 	if (error)
1433 		return error;
1434 
1435 	error = import_iovec(type, uiov, nr_segs,
1436 			     ARRAY_SIZE(iovstack), &iov, &iter);
1437 	if (error < 0)
1438 		goto out_fdput;
1439 
1440 	if (!iov_iter_count(&iter))
1441 		error = 0;
1442 	else if (type == ITER_SOURCE)
1443 		error = vmsplice_to_pipe(f.file, &iter, flags);
1444 	else
1445 		error = vmsplice_to_user(f.file, &iter, flags);
1446 
1447 	kfree(iov);
1448 out_fdput:
1449 	fdput(f);
1450 	return error;
1451 }
1452 
1453 SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
1454 		int, fd_out, loff_t __user *, off_out,
1455 		size_t, len, unsigned int, flags)
1456 {
1457 	struct fd in, out;
1458 	long error;
1459 
1460 	if (unlikely(!len))
1461 		return 0;
1462 
1463 	if (unlikely(flags & ~SPLICE_F_ALL))
1464 		return -EINVAL;
1465 
1466 	error = -EBADF;
1467 	in = fdget(fd_in);
1468 	if (in.file) {
1469 		out = fdget(fd_out);
1470 		if (out.file) {
1471 			error = __do_splice(in.file, off_in, out.file, off_out,
1472 						len, flags);
1473 			fdput(out);
1474 		}
1475 		fdput(in);
1476 	}
1477 	return error;
1478 }
1479 
1480 /*
1481  * Make sure there's data to read. Wait for input if we can, otherwise
1482  * return an appropriate error.
1483  */
1484 static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1485 {
1486 	int ret;
1487 
1488 	/*
1489 	 * Check the pipe occupancy without the inode lock first. This function
1490 	 * is speculative anyways, so missing one is ok.
1491 	 */
1492 	if (!pipe_empty(pipe->head, pipe->tail))
1493 		return 0;
1494 
1495 	ret = 0;
1496 	pipe_lock(pipe);
1497 
1498 	while (pipe_empty(pipe->head, pipe->tail)) {
1499 		if (signal_pending(current)) {
1500 			ret = -ERESTARTSYS;
1501 			break;
1502 		}
1503 		if (!pipe->writers)
1504 			break;
1505 		if (flags & SPLICE_F_NONBLOCK) {
1506 			ret = -EAGAIN;
1507 			break;
1508 		}
1509 		pipe_wait_readable(pipe);
1510 	}
1511 
1512 	pipe_unlock(pipe);
1513 	return ret;
1514 }
1515 
1516 /*
1517  * Make sure there's writeable room. Wait for room if we can, otherwise
1518  * return an appropriate error.
1519  */
1520 static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
1521 {
1522 	int ret;
1523 
1524 	/*
1525 	 * Check pipe occupancy without the inode lock first. This function
1526 	 * is speculative anyways, so missing one is ok.
1527 	 */
1528 	if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage))
1529 		return 0;
1530 
1531 	ret = 0;
1532 	pipe_lock(pipe);
1533 
1534 	while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
1535 		if (!pipe->readers) {
1536 			send_sig(SIGPIPE, current, 0);
1537 			ret = -EPIPE;
1538 			break;
1539 		}
1540 		if (flags & SPLICE_F_NONBLOCK) {
1541 			ret = -EAGAIN;
1542 			break;
1543 		}
1544 		if (signal_pending(current)) {
1545 			ret = -ERESTARTSYS;
1546 			break;
1547 		}
1548 		pipe_wait_writable(pipe);
1549 	}
1550 
1551 	pipe_unlock(pipe);
1552 	return ret;
1553 }
1554 
1555 /*
1556  * Splice contents of ipipe to opipe.
1557  */
1558 static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
1559 			       struct pipe_inode_info *opipe,
1560 			       size_t len, unsigned int flags)
1561 {
1562 	struct pipe_buffer *ibuf, *obuf;
1563 	unsigned int i_head, o_head;
1564 	unsigned int i_tail, o_tail;
1565 	unsigned int i_mask, o_mask;
1566 	int ret = 0;
1567 	bool input_wakeup = false;
1568 
1569 
1570 retry:
1571 	ret = ipipe_prep(ipipe, flags);
1572 	if (ret)
1573 		return ret;
1574 
1575 	ret = opipe_prep(opipe, flags);
1576 	if (ret)
1577 		return ret;
1578 
1579 	/*
1580 	 * Potential ABBA deadlock, work around it by ordering lock
1581 	 * grabbing by pipe info address. Otherwise two different processes
1582 	 * could deadlock (one doing tee from A -> B, the other from B -> A).
1583 	 */
1584 	pipe_double_lock(ipipe, opipe);
1585 
1586 	i_tail = ipipe->tail;
1587 	i_mask = ipipe->ring_size - 1;
1588 	o_head = opipe->head;
1589 	o_mask = opipe->ring_size - 1;
1590 
1591 	do {
1592 		size_t o_len;
1593 
1594 		if (!opipe->readers) {
1595 			send_sig(SIGPIPE, current, 0);
1596 			if (!ret)
1597 				ret = -EPIPE;
1598 			break;
1599 		}
1600 
1601 		i_head = ipipe->head;
1602 		o_tail = opipe->tail;
1603 
1604 		if (pipe_empty(i_head, i_tail) && !ipipe->writers)
1605 			break;
1606 
1607 		/*
1608 		 * Cannot make any progress, because either the input
1609 		 * pipe is empty or the output pipe is full.
1610 		 */
1611 		if (pipe_empty(i_head, i_tail) ||
1612 		    pipe_full(o_head, o_tail, opipe->max_usage)) {
1613 			/* Already processed some buffers, break */
1614 			if (ret)
1615 				break;
1616 
1617 			if (flags & SPLICE_F_NONBLOCK) {
1618 				ret = -EAGAIN;
1619 				break;
1620 			}
1621 
1622 			/*
1623 			 * We raced with another reader/writer and haven't
1624 			 * managed to process any buffers.  A zero return
1625 			 * value means EOF, so retry instead.
1626 			 */
1627 			pipe_unlock(ipipe);
1628 			pipe_unlock(opipe);
1629 			goto retry;
1630 		}
1631 
1632 		ibuf = &ipipe->bufs[i_tail & i_mask];
1633 		obuf = &opipe->bufs[o_head & o_mask];
1634 
1635 		if (len >= ibuf->len) {
1636 			/*
1637 			 * Simply move the whole buffer from ipipe to opipe
1638 			 */
1639 			*obuf = *ibuf;
1640 			ibuf->ops = NULL;
1641 			i_tail++;
1642 			ipipe->tail = i_tail;
1643 			input_wakeup = true;
1644 			o_len = obuf->len;
1645 			o_head++;
1646 			opipe->head = o_head;
1647 		} else {
1648 			/*
1649 			 * Get a reference to this pipe buffer,
1650 			 * so we can copy the contents over.
1651 			 */
1652 			if (!pipe_buf_get(ipipe, ibuf)) {
1653 				if (ret == 0)
1654 					ret = -EFAULT;
1655 				break;
1656 			}
1657 			*obuf = *ibuf;
1658 
1659 			/*
1660 			 * Don't inherit the gift and merge flags, we need to
1661 			 * prevent multiple steals of this page.
1662 			 */
1663 			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1664 			obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1665 
1666 			obuf->len = len;
1667 			ibuf->offset += len;
1668 			ibuf->len -= len;
1669 			o_len = len;
1670 			o_head++;
1671 			opipe->head = o_head;
1672 		}
1673 		ret += o_len;
1674 		len -= o_len;
1675 	} while (len);
1676 
1677 	pipe_unlock(ipipe);
1678 	pipe_unlock(opipe);
1679 
1680 	/*
1681 	 * If we put data in the output pipe, wakeup any potential readers.
1682 	 */
1683 	if (ret > 0)
1684 		wakeup_pipe_readers(opipe);
1685 
1686 	if (input_wakeup)
1687 		wakeup_pipe_writers(ipipe);
1688 
1689 	return ret;
1690 }
1691 
1692 /*
1693  * Link contents of ipipe to opipe.
1694  */
1695 static int link_pipe(struct pipe_inode_info *ipipe,
1696 		     struct pipe_inode_info *opipe,
1697 		     size_t len, unsigned int flags)
1698 {
1699 	struct pipe_buffer *ibuf, *obuf;
1700 	unsigned int i_head, o_head;
1701 	unsigned int i_tail, o_tail;
1702 	unsigned int i_mask, o_mask;
1703 	int ret = 0;
1704 
1705 	/*
1706 	 * Potential ABBA deadlock, work around it by ordering lock
1707 	 * grabbing by pipe info address. Otherwise two different processes
1708 	 * could deadlock (one doing tee from A -> B, the other from B -> A).
1709 	 */
1710 	pipe_double_lock(ipipe, opipe);
1711 
1712 	i_tail = ipipe->tail;
1713 	i_mask = ipipe->ring_size - 1;
1714 	o_head = opipe->head;
1715 	o_mask = opipe->ring_size - 1;
1716 
1717 	do {
1718 		if (!opipe->readers) {
1719 			send_sig(SIGPIPE, current, 0);
1720 			if (!ret)
1721 				ret = -EPIPE;
1722 			break;
1723 		}
1724 
1725 		i_head = ipipe->head;
1726 		o_tail = opipe->tail;
1727 
1728 		/*
1729 		 * If we have iterated all input buffers or run out of
1730 		 * output room, break.
1731 		 */
1732 		if (pipe_empty(i_head, i_tail) ||
1733 		    pipe_full(o_head, o_tail, opipe->max_usage))
1734 			break;
1735 
1736 		ibuf = &ipipe->bufs[i_tail & i_mask];
1737 		obuf = &opipe->bufs[o_head & o_mask];
1738 
1739 		/*
1740 		 * Get a reference to this pipe buffer,
1741 		 * so we can copy the contents over.
1742 		 */
1743 		if (!pipe_buf_get(ipipe, ibuf)) {
1744 			if (ret == 0)
1745 				ret = -EFAULT;
1746 			break;
1747 		}
1748 
1749 		*obuf = *ibuf;
1750 
1751 		/*
1752 		 * Don't inherit the gift and merge flag, we need to prevent
1753 		 * multiple steals of this page.
1754 		 */
1755 		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1756 		obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE;
1757 
1758 		if (obuf->len > len)
1759 			obuf->len = len;
1760 		ret += obuf->len;
1761 		len -= obuf->len;
1762 
1763 		o_head++;
1764 		opipe->head = o_head;
1765 		i_tail++;
1766 	} while (len);
1767 
1768 	pipe_unlock(ipipe);
1769 	pipe_unlock(opipe);
1770 
1771 	/*
1772 	 * If we put data in the output pipe, wakeup any potential readers.
1773 	 */
1774 	if (ret > 0)
1775 		wakeup_pipe_readers(opipe);
1776 
1777 	return ret;
1778 }
1779 
1780 /*
1781  * This is a tee(1) implementation that works on pipes. It doesn't copy
1782  * any data, it simply references the 'in' pages on the 'out' pipe.
1783  * The 'flags' used are the SPLICE_F_* variants, currently the only
1784  * applicable one is SPLICE_F_NONBLOCK.
1785  */
1786 long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags)
1787 {
1788 	struct pipe_inode_info *ipipe = get_pipe_info(in, true);
1789 	struct pipe_inode_info *opipe = get_pipe_info(out, true);
1790 	int ret = -EINVAL;
1791 
1792 	if (unlikely(!(in->f_mode & FMODE_READ) ||
1793 		     !(out->f_mode & FMODE_WRITE)))
1794 		return -EBADF;
1795 
1796 	/*
1797 	 * Duplicate the contents of ipipe to opipe without actually
1798 	 * copying the data.
1799 	 */
1800 	if (ipipe && opipe && ipipe != opipe) {
1801 		if ((in->f_flags | out->f_flags) & O_NONBLOCK)
1802 			flags |= SPLICE_F_NONBLOCK;
1803 
1804 		/*
1805 		 * Keep going, unless we encounter an error. The ipipe/opipe
1806 		 * ordering doesn't really matter.
1807 		 */
1808 		ret = ipipe_prep(ipipe, flags);
1809 		if (!ret) {
1810 			ret = opipe_prep(opipe, flags);
1811 			if (!ret)
1812 				ret = link_pipe(ipipe, opipe, len, flags);
1813 		}
1814 	}
1815 
1816 	return ret;
1817 }
1818 
1819 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
1820 {
1821 	struct fd in, out;
1822 	int error;
1823 
1824 	if (unlikely(flags & ~SPLICE_F_ALL))
1825 		return -EINVAL;
1826 
1827 	if (unlikely(!len))
1828 		return 0;
1829 
1830 	error = -EBADF;
1831 	in = fdget(fdin);
1832 	if (in.file) {
1833 		out = fdget(fdout);
1834 		if (out.file) {
1835 			error = do_tee(in.file, out.file, len, flags);
1836 			fdput(out);
1837 		}
1838  		fdput(in);
1839  	}
1840 
1841 	return error;
1842 }
1843