xref: /openbmc/linux/fs/pipe.c (revision f4b00eab5004e823f28a268580ae4ed16df9fabf)
1b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0
21da177e4SLinus Torvalds /*
31da177e4SLinus Torvalds  *  linux/fs/pipe.c
41da177e4SLinus Torvalds  *
51da177e4SLinus Torvalds  *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
61da177e4SLinus Torvalds  */
71da177e4SLinus Torvalds 
81da177e4SLinus Torvalds #include <linux/mm.h>
91da177e4SLinus Torvalds #include <linux/file.h>
101da177e4SLinus Torvalds #include <linux/poll.h>
111da177e4SLinus Torvalds #include <linux/slab.h>
121da177e4SLinus Torvalds #include <linux/module.h>
131da177e4SLinus Torvalds #include <linux/init.h>
141da177e4SLinus Torvalds #include <linux/fs.h>
1535f3d14dSJens Axboe #include <linux/log2.h>
161da177e4SLinus Torvalds #include <linux/mount.h>
174fa7ec5dSDavid Howells #include <linux/pseudo_fs.h>
18b502bd11SMuthu Kumar #include <linux/magic.h>
191da177e4SLinus Torvalds #include <linux/pipe_fs_i.h>
201da177e4SLinus Torvalds #include <linux/uio.h>
211da177e4SLinus Torvalds #include <linux/highmem.h>
225274f052SJens Axboe #include <linux/pagemap.h>
23db349509SAl Viro #include <linux/audit.h>
24ba719baeSUlrich Drepper #include <linux/syscalls.h>
25b492e95bSJens Axboe #include <linux/fcntl.h>
26d86133bdSVladimir Davydov #include <linux/memcontrol.h>
271da177e4SLinus Torvalds 
287c0f6ba6SLinus Torvalds #include <linux/uaccess.h>
291da177e4SLinus Torvalds #include <asm/ioctls.h>
301da177e4SLinus Torvalds 
31599a0ac1SAl Viro #include "internal.h"
32599a0ac1SAl Viro 
331da177e4SLinus Torvalds /*
34b492e95bSJens Axboe  * The max size that a non-root user is allowed to grow the pipe. Can
35ff9da691SJens Axboe  * be set by root in /proc/sys/fs/pipe-max-size
36b492e95bSJens Axboe  */
37ff9da691SJens Axboe unsigned int pipe_max_size = 1048576;
38ff9da691SJens Axboe 
39759c0114SWilly Tarreau /* Maximum allocatable pages per user. Hard limit is unset by default, soft
40759c0114SWilly Tarreau  * matches default values.
41759c0114SWilly Tarreau  */
42759c0114SWilly Tarreau unsigned long pipe_user_pages_hard;
43759c0114SWilly Tarreau unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
44759c0114SWilly Tarreau 
45b492e95bSJens Axboe /*
468cefc107SDavid Howells  * We use head and tail indices that aren't masked off, except at the point of
478cefc107SDavid Howells  * dereference, but rather they're allowed to wrap naturally.  This means there
488cefc107SDavid Howells  * isn't a dead spot in the buffer, but the ring has to be a power of two and
498cefc107SDavid Howells  * <= 2^31.
508cefc107SDavid Howells  * -- David Howells 2019-09-23.
511da177e4SLinus Torvalds  *
521da177e4SLinus Torvalds  * Reads with count = 0 should always return 0.
531da177e4SLinus Torvalds  * -- Julian Bradfield 1999-06-07.
541da177e4SLinus Torvalds  *
551da177e4SLinus Torvalds  * FIFOs and Pipes now generate SIGIO for both readers and writers.
561da177e4SLinus Torvalds  * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
571da177e4SLinus Torvalds  *
581da177e4SLinus Torvalds  * pipe_read & write cleanup
591da177e4SLinus Torvalds  * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
601da177e4SLinus Torvalds  */
611da177e4SLinus Torvalds 
6261e0d47cSMiklos Szeredi static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
6361e0d47cSMiklos Szeredi {
646447a3cfSAl Viro 	if (pipe->files)
6572b0d9aaSAl Viro 		mutex_lock_nested(&pipe->mutex, subclass);
6661e0d47cSMiklos Szeredi }
6761e0d47cSMiklos Szeredi 
6861e0d47cSMiklos Szeredi void pipe_lock(struct pipe_inode_info *pipe)
6961e0d47cSMiklos Szeredi {
7061e0d47cSMiklos Szeredi 	/*
7161e0d47cSMiklos Szeredi 	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
7261e0d47cSMiklos Szeredi 	 */
7361e0d47cSMiklos Szeredi 	pipe_lock_nested(pipe, I_MUTEX_PARENT);
7461e0d47cSMiklos Szeredi }
7561e0d47cSMiklos Szeredi EXPORT_SYMBOL(pipe_lock);
7661e0d47cSMiklos Szeredi 
7761e0d47cSMiklos Szeredi void pipe_unlock(struct pipe_inode_info *pipe)
7861e0d47cSMiklos Szeredi {
796447a3cfSAl Viro 	if (pipe->files)
8072b0d9aaSAl Viro 		mutex_unlock(&pipe->mutex);
8161e0d47cSMiklos Szeredi }
8261e0d47cSMiklos Szeredi EXPORT_SYMBOL(pipe_unlock);
8361e0d47cSMiklos Szeredi 
84ebec73f4SAl Viro static inline void __pipe_lock(struct pipe_inode_info *pipe)
85ebec73f4SAl Viro {
86ebec73f4SAl Viro 	mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
87ebec73f4SAl Viro }
88ebec73f4SAl Viro 
89ebec73f4SAl Viro static inline void __pipe_unlock(struct pipe_inode_info *pipe)
90ebec73f4SAl Viro {
91ebec73f4SAl Viro 	mutex_unlock(&pipe->mutex);
92ebec73f4SAl Viro }
93ebec73f4SAl Viro 
9461e0d47cSMiklos Szeredi void pipe_double_lock(struct pipe_inode_info *pipe1,
9561e0d47cSMiklos Szeredi 		      struct pipe_inode_info *pipe2)
9661e0d47cSMiklos Szeredi {
9761e0d47cSMiklos Szeredi 	BUG_ON(pipe1 == pipe2);
9861e0d47cSMiklos Szeredi 
9961e0d47cSMiklos Szeredi 	if (pipe1 < pipe2) {
10061e0d47cSMiklos Szeredi 		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
10161e0d47cSMiklos Szeredi 		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
10261e0d47cSMiklos Szeredi 	} else {
103023d43c7SPeter Zijlstra 		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
104023d43c7SPeter Zijlstra 		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
10561e0d47cSMiklos Szeredi 	}
10661e0d47cSMiklos Szeredi }
10761e0d47cSMiklos Szeredi 
1081da177e4SLinus Torvalds /* Drop the inode semaphore and wait for a pipe event, atomically */
1093a326a2cSIngo Molnar void pipe_wait(struct pipe_inode_info *pipe)
1101da177e4SLinus Torvalds {
1110ddad21dSLinus Torvalds 	DEFINE_WAIT(rdwait);
1120ddad21dSLinus Torvalds 	DEFINE_WAIT(wrwait);
1131da177e4SLinus Torvalds 
114d79fc0fcSIngo Molnar 	/*
115d79fc0fcSIngo Molnar 	 * Pipes are system-local resources, so sleeping on them
116d79fc0fcSIngo Molnar 	 * is considered a noninteractive wait:
117d79fc0fcSIngo Molnar 	 */
1180ddad21dSLinus Torvalds 	prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
1190ddad21dSLinus Torvalds 	prepare_to_wait(&pipe->wr_wait, &wrwait, TASK_INTERRUPTIBLE);
12061e0d47cSMiklos Szeredi 	pipe_unlock(pipe);
1211da177e4SLinus Torvalds 	schedule();
1220ddad21dSLinus Torvalds 	finish_wait(&pipe->rd_wait, &rdwait);
1230ddad21dSLinus Torvalds 	finish_wait(&pipe->wr_wait, &wrwait);
12461e0d47cSMiklos Szeredi 	pipe_lock(pipe);
1251da177e4SLinus Torvalds }
1261da177e4SLinus Torvalds 
127341b446bSIngo Molnar static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
128341b446bSIngo Molnar 				  struct pipe_buffer *buf)
1291da177e4SLinus Torvalds {
1301da177e4SLinus Torvalds 	struct page *page = buf->page;
1311da177e4SLinus Torvalds 
1325274f052SJens Axboe 	/*
1335274f052SJens Axboe 	 * If nobody else uses this page, and we don't already have a
1345274f052SJens Axboe 	 * temporary page, let's keep track of it as a one-deep
135341b446bSIngo Molnar 	 * allocation cache. (Otherwise just release our reference to it)
1365274f052SJens Axboe 	 */
137341b446bSIngo Molnar 	if (page_count(page) == 1 && !pipe->tmp_page)
138923f4f23SIngo Molnar 		pipe->tmp_page = page;
139341b446bSIngo Molnar 	else
14009cbfeafSKirill A. Shutemov 		put_page(page);
1411da177e4SLinus Torvalds }
1421da177e4SLinus Torvalds 
143d86133bdSVladimir Davydov static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
144d86133bdSVladimir Davydov 			       struct pipe_buffer *buf)
145d86133bdSVladimir Davydov {
146d86133bdSVladimir Davydov 	struct page *page = buf->page;
147d86133bdSVladimir Davydov 
148d86133bdSVladimir Davydov 	if (page_count(page) == 1) {
149*f4b00eabSRoman Gushchin 		memcg_kmem_uncharge_page(page, 0);
150d86133bdSVladimir Davydov 		__SetPageLocked(page);
151d86133bdSVladimir Davydov 		return 0;
152d86133bdSVladimir Davydov 	}
153d86133bdSVladimir Davydov 	return 1;
154d86133bdSVladimir Davydov }
155d86133bdSVladimir Davydov 
1560845718dSJens Axboe /**
157b51d63c6SRandy Dunlap  * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
1580845718dSJens Axboe  * @pipe:	the pipe that the buffer belongs to
1590845718dSJens Axboe  * @buf:	the buffer to attempt to steal
1600845718dSJens Axboe  *
1610845718dSJens Axboe  * Description:
162b51d63c6SRandy Dunlap  *	This function attempts to steal the &struct page attached to
1630845718dSJens Axboe  *	@buf. If successful, this function returns 0 and returns with
1640845718dSJens Axboe  *	the page locked. The caller may then reuse the page for whatever
165b51d63c6SRandy Dunlap  *	he wishes; the typical use is insertion into a different file
1660845718dSJens Axboe  *	page cache.
1670845718dSJens Axboe  */
168330ab716SJens Axboe int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
1695abc97aaSJens Axboe 			   struct pipe_buffer *buf)
1705abc97aaSJens Axboe {
17146e678c9SJens Axboe 	struct page *page = buf->page;
17246e678c9SJens Axboe 
1730845718dSJens Axboe 	/*
1740845718dSJens Axboe 	 * A reference of one is golden, that means that the owner of this
1750845718dSJens Axboe 	 * page is the only one holding a reference to it. lock the page
1760845718dSJens Axboe 	 * and return OK.
1770845718dSJens Axboe 	 */
17846e678c9SJens Axboe 	if (page_count(page) == 1) {
17946e678c9SJens Axboe 		lock_page(page);
1805abc97aaSJens Axboe 		return 0;
1815abc97aaSJens Axboe 	}
1825abc97aaSJens Axboe 
18346e678c9SJens Axboe 	return 1;
18446e678c9SJens Axboe }
18551921cb7SMiklos Szeredi EXPORT_SYMBOL(generic_pipe_buf_steal);
18646e678c9SJens Axboe 
1870845718dSJens Axboe /**
188b51d63c6SRandy Dunlap  * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
1890845718dSJens Axboe  * @pipe:	the pipe that the buffer belongs to
1900845718dSJens Axboe  * @buf:	the buffer to get a reference to
1910845718dSJens Axboe  *
1920845718dSJens Axboe  * Description:
1930845718dSJens Axboe  *	This function grabs an extra reference to @buf. It's used in
1940845718dSJens Axboe  *	in the tee() system call, when we duplicate the buffers in one
1950845718dSJens Axboe  *	pipe into another.
1960845718dSJens Axboe  */
19715fab63eSMatthew Wilcox bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
19870524490SJens Axboe {
19915fab63eSMatthew Wilcox 	return try_get_page(buf->page);
20070524490SJens Axboe }
20151921cb7SMiklos Szeredi EXPORT_SYMBOL(generic_pipe_buf_get);
20270524490SJens Axboe 
/**
 * generic_pipe_buf_confirm - verify contents of the pipe buffer
 * @info:	the pipe that the buffer belongs to
 * @buf:	the buffer to confirm
 *
 * Description:
 *	Nothing needs checking here: pages used by the generic pipe code
 *	are always good at the time they are inserted into the pipe, so
 *	this always returns 0.
 */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
			     struct pipe_buffer *buf)
{
	/* Always valid. */
	return 0;
}
EXPORT_SYMBOL(generic_pipe_buf_confirm);
218f84d7519SJens Axboe 
2196818173bSMiklos Szeredi /**
2206818173bSMiklos Szeredi  * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
2216818173bSMiklos Szeredi  * @pipe:	the pipe that the buffer belongs to
2226818173bSMiklos Szeredi  * @buf:	the buffer to put a reference to
2236818173bSMiklos Szeredi  *
2246818173bSMiklos Szeredi  * Description:
2256818173bSMiklos Szeredi  *	This function releases a reference to @buf.
2266818173bSMiklos Szeredi  */
2276818173bSMiklos Szeredi void generic_pipe_buf_release(struct pipe_inode_info *pipe,
2286818173bSMiklos Szeredi 			      struct pipe_buffer *buf)
2296818173bSMiklos Szeredi {
23009cbfeafSKirill A. Shutemov 	put_page(buf->page);
2316818173bSMiklos Szeredi }
23251921cb7SMiklos Szeredi EXPORT_SYMBOL(generic_pipe_buf_release);
2336818173bSMiklos Szeredi 
23401e7187bSJann Horn /* New data written to a pipe may be appended to a buffer with this type. */
235d4c3cca9SEric Dumazet static const struct pipe_buf_operations anon_pipe_buf_ops = {
236cac36bb0SJens Axboe 	.confirm = generic_pipe_buf_confirm,
2371da177e4SLinus Torvalds 	.release = anon_pipe_buf_release,
238d86133bdSVladimir Davydov 	.steal = anon_pipe_buf_steal,
239f84d7519SJens Axboe 	.get = generic_pipe_buf_get,
2401da177e4SLinus Torvalds };
2411da177e4SLinus Torvalds 
242a0ce2f0aSJann Horn static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = {
2431da177e4SLinus Torvalds 	.confirm = generic_pipe_buf_confirm,
2441da177e4SLinus Torvalds 	.release = anon_pipe_buf_release,
2451da177e4SLinus Torvalds 	.steal = anon_pipe_buf_steal,
2461da177e4SLinus Torvalds 	.get = generic_pipe_buf_get,
247923f4f23SIngo Molnar };
2481da177e4SLinus Torvalds 
2499883035aSLinus Torvalds static const struct pipe_buf_operations packet_pipe_buf_ops = {
2509883035aSLinus Torvalds 	.confirm = generic_pipe_buf_confirm,
2519883035aSLinus Torvalds 	.release = anon_pipe_buf_release,
252d86133bdSVladimir Davydov 	.steal = anon_pipe_buf_steal,
2539883035aSLinus Torvalds 	.get = generic_pipe_buf_get,
2549883035aSLinus Torvalds };
2559883035aSLinus Torvalds 
25601e7187bSJann Horn /**
25701e7187bSJann Horn  * pipe_buf_mark_unmergeable - mark a &struct pipe_buffer as unmergeable
25801e7187bSJann Horn  * @buf:	the buffer to mark
25901e7187bSJann Horn  *
26001e7187bSJann Horn  * Description:
26101e7187bSJann Horn  *	This function ensures that no future writes will be merged into the
26201e7187bSJann Horn  *	given &struct pipe_buffer. This is necessary when multiple pipe buffers
26301e7187bSJann Horn  *	share the same backing page.
26401e7187bSJann Horn  */
265a0ce2f0aSJann Horn void pipe_buf_mark_unmergeable(struct pipe_buffer *buf)
266a0ce2f0aSJann Horn {
267a0ce2f0aSJann Horn 	if (buf->ops == &anon_pipe_buf_ops)
268a0ce2f0aSJann Horn 		buf->ops = &anon_pipe_buf_nomerge_ops;
269a0ce2f0aSJann Horn }
270a0ce2f0aSJann Horn 
27101e7187bSJann Horn static bool pipe_buf_can_merge(struct pipe_buffer *buf)
27201e7187bSJann Horn {
27301e7187bSJann Horn 	return buf->ops == &anon_pipe_buf_ops;
27401e7187bSJann Horn }
27501e7187bSJann Horn 
27685190d15SLinus Torvalds /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
27785190d15SLinus Torvalds static inline bool pipe_readable(const struct pipe_inode_info *pipe)
27885190d15SLinus Torvalds {
27985190d15SLinus Torvalds 	unsigned int head = READ_ONCE(pipe->head);
28085190d15SLinus Torvalds 	unsigned int tail = READ_ONCE(pipe->tail);
28185190d15SLinus Torvalds 	unsigned int writers = READ_ONCE(pipe->writers);
28285190d15SLinus Torvalds 
28385190d15SLinus Torvalds 	return !pipe_empty(head, tail) || !writers;
28485190d15SLinus Torvalds }
28585190d15SLinus Torvalds 
2861da177e4SLinus Torvalds static ssize_t
287fb9096a3SAl Viro pipe_read(struct kiocb *iocb, struct iov_iter *to)
2881da177e4SLinus Torvalds {
289fb9096a3SAl Viro 	size_t total_len = iov_iter_count(to);
290ee0b3e67SBadari Pulavarty 	struct file *filp = iocb->ki_filp;
291de32ec4cSAl Viro 	struct pipe_inode_info *pipe = filp->private_data;
2920ddad21dSLinus Torvalds 	bool was_full, wake_next_reader = false;
2931da177e4SLinus Torvalds 	ssize_t ret;
2941da177e4SLinus Torvalds 
2951da177e4SLinus Torvalds 	/* Null read succeeds. */
2961da177e4SLinus Torvalds 	if (unlikely(total_len == 0))
2971da177e4SLinus Torvalds 		return 0;
2981da177e4SLinus Torvalds 
2991da177e4SLinus Torvalds 	ret = 0;
300ebec73f4SAl Viro 	__pipe_lock(pipe);
301f467a6a6SLinus Torvalds 
302f467a6a6SLinus Torvalds 	/*
303f467a6a6SLinus Torvalds 	 * We only wake up writers if the pipe was full when we started
304f467a6a6SLinus Torvalds 	 * reading in order to avoid unnecessary wakeups.
305f467a6a6SLinus Torvalds 	 *
306f467a6a6SLinus Torvalds 	 * But when we do wake up writers, we do so using a sync wakeup
307f467a6a6SLinus Torvalds 	 * (WF_SYNC), because we want them to get going and generate more
308f467a6a6SLinus Torvalds 	 * data for us.
309f467a6a6SLinus Torvalds 	 */
310f467a6a6SLinus Torvalds 	was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
3111da177e4SLinus Torvalds 	for (;;) {
3128cefc107SDavid Howells 		unsigned int head = pipe->head;
3138cefc107SDavid Howells 		unsigned int tail = pipe->tail;
3148cefc107SDavid Howells 		unsigned int mask = pipe->ring_size - 1;
3158cefc107SDavid Howells 
3168cefc107SDavid Howells 		if (!pipe_empty(head, tail)) {
3178cefc107SDavid Howells 			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
3181da177e4SLinus Torvalds 			size_t chars = buf->len;
319637b58c2SAl Viro 			size_t written;
320637b58c2SAl Viro 			int error;
3211da177e4SLinus Torvalds 
3221da177e4SLinus Torvalds 			if (chars > total_len)
3231da177e4SLinus Torvalds 				chars = total_len;
3241da177e4SLinus Torvalds 
325fba597dbSMiklos Szeredi 			error = pipe_buf_confirm(pipe, buf);
326f84d7519SJens Axboe 			if (error) {
3275274f052SJens Axboe 				if (!ret)
328e5953cbdSNicolas Kaiser 					ret = error;
3295274f052SJens Axboe 				break;
3305274f052SJens Axboe 			}
331f84d7519SJens Axboe 
332fb9096a3SAl Viro 			written = copy_page_to_iter(buf->page, buf->offset, chars, to);
333637b58c2SAl Viro 			if (unlikely(written < chars)) {
334341b446bSIngo Molnar 				if (!ret)
335637b58c2SAl Viro 					ret = -EFAULT;
3361da177e4SLinus Torvalds 				break;
3371da177e4SLinus Torvalds 			}
3381da177e4SLinus Torvalds 			ret += chars;
3391da177e4SLinus Torvalds 			buf->offset += chars;
3401da177e4SLinus Torvalds 			buf->len -= chars;
3419883035aSLinus Torvalds 
3429883035aSLinus Torvalds 			/* Was it a packet buffer? Clean up and exit */
3439883035aSLinus Torvalds 			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
3449883035aSLinus Torvalds 				total_len = chars;
3459883035aSLinus Torvalds 				buf->len = 0;
3469883035aSLinus Torvalds 			}
3479883035aSLinus Torvalds 
3481da177e4SLinus Torvalds 			if (!buf->len) {
349a779638cSMiklos Szeredi 				pipe_buf_release(pipe, buf);
3500ddad21dSLinus Torvalds 				spin_lock_irq(&pipe->rd_wait.lock);
3518cefc107SDavid Howells 				tail++;
3528cefc107SDavid Howells 				pipe->tail = tail;
3530ddad21dSLinus Torvalds 				spin_unlock_irq(&pipe->rd_wait.lock);
3541da177e4SLinus Torvalds 			}
3551da177e4SLinus Torvalds 			total_len -= chars;
3561da177e4SLinus Torvalds 			if (!total_len)
3571da177e4SLinus Torvalds 				break;	/* common path: read succeeded */
3588cefc107SDavid Howells 			if (!pipe_empty(head, tail))	/* More to do? */
3591da177e4SLinus Torvalds 				continue;
3608cefc107SDavid Howells 		}
3618cefc107SDavid Howells 
362923f4f23SIngo Molnar 		if (!pipe->writers)
3631da177e4SLinus Torvalds 			break;
3641da177e4SLinus Torvalds 		if (ret)
3651da177e4SLinus Torvalds 			break;
3661da177e4SLinus Torvalds 		if (filp->f_flags & O_NONBLOCK) {
3671da177e4SLinus Torvalds 			ret = -EAGAIN;
3681da177e4SLinus Torvalds 			break;
3691da177e4SLinus Torvalds 		}
37085190d15SLinus Torvalds 		__pipe_unlock(pipe);
371d1c6a2aaSLinus Torvalds 
372d1c6a2aaSLinus Torvalds 		/*
373d1c6a2aaSLinus Torvalds 		 * We only get here if we didn't actually read anything.
374d1c6a2aaSLinus Torvalds 		 *
375d1c6a2aaSLinus Torvalds 		 * However, we could have seen (and removed) a zero-sized
376d1c6a2aaSLinus Torvalds 		 * pipe buffer, and might have made space in the buffers
377d1c6a2aaSLinus Torvalds 		 * that way.
378d1c6a2aaSLinus Torvalds 		 *
379d1c6a2aaSLinus Torvalds 		 * You can't make zero-sized pipe buffers by doing an empty
380d1c6a2aaSLinus Torvalds 		 * write (not even in packet mode), but they can happen if
381d1c6a2aaSLinus Torvalds 		 * the writer gets an EFAULT when trying to fill a buffer
382d1c6a2aaSLinus Torvalds 		 * that already got allocated and inserted in the buffer
383d1c6a2aaSLinus Torvalds 		 * array.
384d1c6a2aaSLinus Torvalds 		 *
385d1c6a2aaSLinus Torvalds 		 * So we still need to wake up any pending writers in the
386d1c6a2aaSLinus Torvalds 		 * _very_ unlikely case that the pipe was full, but we got
387d1c6a2aaSLinus Torvalds 		 * no data.
388d1c6a2aaSLinus Torvalds 		 */
389d1c6a2aaSLinus Torvalds 		if (unlikely(was_full)) {
3900ddad21dSLinus Torvalds 			wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
391f467a6a6SLinus Torvalds 			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
392f467a6a6SLinus Torvalds 		}
393d1c6a2aaSLinus Torvalds 
394d1c6a2aaSLinus Torvalds 		/*
395d1c6a2aaSLinus Torvalds 		 * But because we didn't read anything, at this point we can
396d1c6a2aaSLinus Torvalds 		 * just return directly with -ERESTARTSYS if we're interrupted,
397d1c6a2aaSLinus Torvalds 		 * since we've done any required wakeups and there's no need
398d1c6a2aaSLinus Torvalds 		 * to mark anything accessed. And we've dropped the lock.
399d1c6a2aaSLinus Torvalds 		 */
4000ddad21dSLinus Torvalds 		if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
401d1c6a2aaSLinus Torvalds 			return -ERESTARTSYS;
402d1c6a2aaSLinus Torvalds 
40385190d15SLinus Torvalds 		__pipe_lock(pipe);
404f467a6a6SLinus Torvalds 		was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
4050ddad21dSLinus Torvalds 		wake_next_reader = true;
4061da177e4SLinus Torvalds 	}
4070ddad21dSLinus Torvalds 	if (pipe_empty(pipe->head, pipe->tail))
4080ddad21dSLinus Torvalds 		wake_next_reader = false;
409ebec73f4SAl Viro 	__pipe_unlock(pipe);
410341b446bSIngo Molnar 
411f467a6a6SLinus Torvalds 	if (was_full) {
4120ddad21dSLinus Torvalds 		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
413923f4f23SIngo Molnar 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
4141da177e4SLinus Torvalds 	}
4150ddad21dSLinus Torvalds 	if (wake_next_reader)
4160ddad21dSLinus Torvalds 		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
4171da177e4SLinus Torvalds 	if (ret > 0)
4181da177e4SLinus Torvalds 		file_accessed(filp);
4191da177e4SLinus Torvalds 	return ret;
4201da177e4SLinus Torvalds }
4211da177e4SLinus Torvalds 
4229883035aSLinus Torvalds static inline int is_packetized(struct file *file)
4239883035aSLinus Torvalds {
4249883035aSLinus Torvalds 	return (file->f_flags & O_DIRECT) != 0;
4259883035aSLinus Torvalds }
4269883035aSLinus Torvalds 
42785190d15SLinus Torvalds /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
42885190d15SLinus Torvalds static inline bool pipe_writable(const struct pipe_inode_info *pipe)
42985190d15SLinus Torvalds {
43085190d15SLinus Torvalds 	unsigned int head = READ_ONCE(pipe->head);
43185190d15SLinus Torvalds 	unsigned int tail = READ_ONCE(pipe->tail);
43285190d15SLinus Torvalds 	unsigned int max_usage = READ_ONCE(pipe->max_usage);
43385190d15SLinus Torvalds 
43485190d15SLinus Torvalds 	return !pipe_full(head, tail, max_usage) ||
43585190d15SLinus Torvalds 		!READ_ONCE(pipe->readers);
43685190d15SLinus Torvalds }
43785190d15SLinus Torvalds 
4381da177e4SLinus Torvalds static ssize_t
439f0d1bec9SAl Viro pipe_write(struct kiocb *iocb, struct iov_iter *from)
4401da177e4SLinus Torvalds {
441ee0b3e67SBadari Pulavarty 	struct file *filp = iocb->ki_filp;
442de32ec4cSAl Viro 	struct pipe_inode_info *pipe = filp->private_data;
4438f868d68SDavid Howells 	unsigned int head;
444f0d1bec9SAl Viro 	ssize_t ret = 0;
445f0d1bec9SAl Viro 	size_t total_len = iov_iter_count(from);
4461da177e4SLinus Torvalds 	ssize_t chars;
4471b6b26aeSLinus Torvalds 	bool was_empty = false;
4480ddad21dSLinus Torvalds 	bool wake_next_writer = false;
4491da177e4SLinus Torvalds 
4501da177e4SLinus Torvalds 	/* Null write succeeds. */
4511da177e4SLinus Torvalds 	if (unlikely(total_len == 0))
4521da177e4SLinus Torvalds 		return 0;
4531da177e4SLinus Torvalds 
454ebec73f4SAl Viro 	__pipe_lock(pipe);
4551da177e4SLinus Torvalds 
456923f4f23SIngo Molnar 	if (!pipe->readers) {
4571da177e4SLinus Torvalds 		send_sig(SIGPIPE, current, 0);
4581da177e4SLinus Torvalds 		ret = -EPIPE;
4591da177e4SLinus Torvalds 		goto out;
4601da177e4SLinus Torvalds 	}
4611da177e4SLinus Torvalds 
4621b6b26aeSLinus Torvalds 	/*
4631b6b26aeSLinus Torvalds 	 * Only wake up if the pipe started out empty, since
4641b6b26aeSLinus Torvalds 	 * otherwise there should be no readers waiting.
4651b6b26aeSLinus Torvalds 	 *
4661b6b26aeSLinus Torvalds 	 * If it wasn't empty we try to merge new data into
4671b6b26aeSLinus Torvalds 	 * the last buffer.
4681b6b26aeSLinus Torvalds 	 *
4691b6b26aeSLinus Torvalds 	 * That naturally merges small writes, but it also
4701b6b26aeSLinus Torvalds 	 * page-aligs the rest of the writes for large writes
4711b6b26aeSLinus Torvalds 	 * spanning multiple pages.
4721b6b26aeSLinus Torvalds 	 */
4738cefc107SDavid Howells 	head = pipe->head;
4741b6b26aeSLinus Torvalds 	was_empty = pipe_empty(head, pipe->tail);
4751b6b26aeSLinus Torvalds 	chars = total_len & (PAGE_SIZE-1);
4761b6b26aeSLinus Torvalds 	if (chars && !was_empty) {
4778f868d68SDavid Howells 		unsigned int mask = pipe->ring_size - 1;
4788cefc107SDavid Howells 		struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
4791da177e4SLinus Torvalds 		int offset = buf->offset + buf->len;
480341b446bSIngo Molnar 
48101e7187bSJann Horn 		if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) {
482fba597dbSMiklos Szeredi 			ret = pipe_buf_confirm(pipe, buf);
4836ae08069SEric Biggers 			if (ret)
4845274f052SJens Axboe 				goto out;
485f84d7519SJens Axboe 
486f0d1bec9SAl Viro 			ret = copy_page_from_iter(buf->page, offset, chars, from);
487f0d1bec9SAl Viro 			if (unlikely(ret < chars)) {
4886ae08069SEric Biggers 				ret = -EFAULT;
4891da177e4SLinus Torvalds 				goto out;
490f6762b7aSJens Axboe 			}
4911b6b26aeSLinus Torvalds 
4926ae08069SEric Biggers 			buf->len += ret;
493f0d1bec9SAl Viro 			if (!iov_iter_count(from))
4941da177e4SLinus Torvalds 				goto out;
4951da177e4SLinus Torvalds 		}
4961da177e4SLinus Torvalds 	}
4971da177e4SLinus Torvalds 
4981da177e4SLinus Torvalds 	for (;;) {
499923f4f23SIngo Molnar 		if (!pipe->readers) {
5001da177e4SLinus Torvalds 			send_sig(SIGPIPE, current, 0);
501341b446bSIngo Molnar 			if (!ret)
502341b446bSIngo Molnar 				ret = -EPIPE;
5031da177e4SLinus Torvalds 			break;
5041da177e4SLinus Torvalds 		}
5058cefc107SDavid Howells 
506a194dfe6SDavid Howells 		head = pipe->head;
5078f868d68SDavid Howells 		if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
5088f868d68SDavid Howells 			unsigned int mask = pipe->ring_size - 1;
5098cefc107SDavid Howells 			struct pipe_buffer *buf = &pipe->bufs[head & mask];
510923f4f23SIngo Molnar 			struct page *page = pipe->tmp_page;
511f0d1bec9SAl Viro 			int copied;
5121da177e4SLinus Torvalds 
5131da177e4SLinus Torvalds 			if (!page) {
514d86133bdSVladimir Davydov 				page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
5151da177e4SLinus Torvalds 				if (unlikely(!page)) {
5161da177e4SLinus Torvalds 					ret = ret ? : -ENOMEM;
5171da177e4SLinus Torvalds 					break;
5181da177e4SLinus Torvalds 				}
519923f4f23SIngo Molnar 				pipe->tmp_page = page;
5201da177e4SLinus Torvalds 			}
521a194dfe6SDavid Howells 
522a194dfe6SDavid Howells 			/* Allocate a slot in the ring in advance and attach an
523a194dfe6SDavid Howells 			 * empty buffer.  If we fault or otherwise fail to use
524a194dfe6SDavid Howells 			 * it, either the reader will consume it or it'll still
525a194dfe6SDavid Howells 			 * be there for the next write.
526a194dfe6SDavid Howells 			 */
5270ddad21dSLinus Torvalds 			spin_lock_irq(&pipe->rd_wait.lock);
528a194dfe6SDavid Howells 
529a194dfe6SDavid Howells 			head = pipe->head;
5308f868d68SDavid Howells 			if (pipe_full(head, pipe->tail, pipe->max_usage)) {
5310ddad21dSLinus Torvalds 				spin_unlock_irq(&pipe->rd_wait.lock);
5328df44129SDavid Howells 				continue;
5338df44129SDavid Howells 			}
5348df44129SDavid Howells 
535a194dfe6SDavid Howells 			pipe->head = head + 1;
5360ddad21dSLinus Torvalds 			spin_unlock_irq(&pipe->rd_wait.lock);
537a194dfe6SDavid Howells 
538a194dfe6SDavid Howells 			/* Insert it into the buffer array */
539a194dfe6SDavid Howells 			buf = &pipe->bufs[head & mask];
540a194dfe6SDavid Howells 			buf->page = page;
541a194dfe6SDavid Howells 			buf->ops = &anon_pipe_buf_ops;
542a194dfe6SDavid Howells 			buf->offset = 0;
543a194dfe6SDavid Howells 			buf->len = 0;
544a194dfe6SDavid Howells 			buf->flags = 0;
545a194dfe6SDavid Howells 			if (is_packetized(filp)) {
546a194dfe6SDavid Howells 				buf->ops = &packet_pipe_buf_ops;
547a194dfe6SDavid Howells 				buf->flags = PIPE_BUF_FLAG_PACKET;
548a194dfe6SDavid Howells 			}
549a194dfe6SDavid Howells 			pipe->tmp_page = NULL;
550a194dfe6SDavid Howells 
551f0d1bec9SAl Viro 			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
552f0d1bec9SAl Viro 			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
553341b446bSIngo Molnar 				if (!ret)
554f0d1bec9SAl Viro 					ret = -EFAULT;
5551da177e4SLinus Torvalds 				break;
5561da177e4SLinus Torvalds 			}
557f0d1bec9SAl Viro 			ret += copied;
5581da177e4SLinus Torvalds 			buf->offset = 0;
559f0d1bec9SAl Viro 			buf->len = copied;
5601da177e4SLinus Torvalds 
561f0d1bec9SAl Viro 			if (!iov_iter_count(from))
5621da177e4SLinus Torvalds 				break;
5631da177e4SLinus Torvalds 		}
5648cefc107SDavid Howells 
5658f868d68SDavid Howells 		if (!pipe_full(head, pipe->tail, pipe->max_usage))
5661da177e4SLinus Torvalds 			continue;
5678cefc107SDavid Howells 
5688cefc107SDavid Howells 		/* Wait for buffer space to become available. */
5691da177e4SLinus Torvalds 		if (filp->f_flags & O_NONBLOCK) {
570341b446bSIngo Molnar 			if (!ret)
571341b446bSIngo Molnar 				ret = -EAGAIN;
5721da177e4SLinus Torvalds 			break;
5731da177e4SLinus Torvalds 		}
5741da177e4SLinus Torvalds 		if (signal_pending(current)) {
575341b446bSIngo Molnar 			if (!ret)
576341b446bSIngo Molnar 				ret = -ERESTARTSYS;
5771da177e4SLinus Torvalds 			break;
5781da177e4SLinus Torvalds 		}
5791b6b26aeSLinus Torvalds 
5801b6b26aeSLinus Torvalds 		/*
5811b6b26aeSLinus Torvalds 		 * We're going to release the pipe lock and wait for more
5821b6b26aeSLinus Torvalds 		 * space. We wake up any readers if necessary, and then
5831b6b26aeSLinus Torvalds 		 * after waiting we need to re-check whether the pipe
5841b6b26aeSLinus Torvalds 		 * become empty while we dropped the lock.
5851b6b26aeSLinus Torvalds 		 */
58685190d15SLinus Torvalds 		__pipe_unlock(pipe);
5871b6b26aeSLinus Torvalds 		if (was_empty) {
5880ddad21dSLinus Torvalds 			wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
5891b6b26aeSLinus Torvalds 			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
5901b6b26aeSLinus Torvalds 		}
5910ddad21dSLinus Torvalds 		wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
59285190d15SLinus Torvalds 		__pipe_lock(pipe);
5930dd1e377SJan Stancek 		was_empty = pipe_empty(pipe->head, pipe->tail);
5940ddad21dSLinus Torvalds 		wake_next_writer = true;
5951da177e4SLinus Torvalds 	}
5961da177e4SLinus Torvalds out:
5970ddad21dSLinus Torvalds 	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
5980ddad21dSLinus Torvalds 		wake_next_writer = false;
599ebec73f4SAl Viro 	__pipe_unlock(pipe);
6001b6b26aeSLinus Torvalds 
6011b6b26aeSLinus Torvalds 	/*
6021b6b26aeSLinus Torvalds 	 * If we do do a wakeup event, we do a 'sync' wakeup, because we
6031b6b26aeSLinus Torvalds 	 * want the reader to start processing things asap, rather than
6041b6b26aeSLinus Torvalds 	 * leave the data pending.
6051b6b26aeSLinus Torvalds 	 *
6061b6b26aeSLinus Torvalds 	 * This is particularly important for small writes, because of
6071b6b26aeSLinus Torvalds 	 * how (for example) the GNU make jobserver uses small writes to
6081b6b26aeSLinus Torvalds 	 * wake up pending jobs
6091b6b26aeSLinus Torvalds 	 */
6101b6b26aeSLinus Torvalds 	if (was_empty) {
6110ddad21dSLinus Torvalds 		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
612923f4f23SIngo Molnar 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
6131da177e4SLinus Torvalds 	}
6140ddad21dSLinus Torvalds 	if (wake_next_writer)
6150ddad21dSLinus Torvalds 		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
6167e775f46SDmitry Monakhov 	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
617c3b2da31SJosef Bacik 		int err = file_update_time(filp);
618c3b2da31SJosef Bacik 		if (err)
619c3b2da31SJosef Bacik 			ret = err;
6207e775f46SDmitry Monakhov 		sb_end_write(file_inode(filp)->i_sb);
621c3b2da31SJosef Bacik 	}
6221da177e4SLinus Torvalds 	return ret;
6231da177e4SLinus Torvalds }
6241da177e4SLinus Torvalds 
625d59d0b1bSAndi Kleen static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
6261da177e4SLinus Torvalds {
627de32ec4cSAl Viro 	struct pipe_inode_info *pipe = filp->private_data;
6288cefc107SDavid Howells 	int count, head, tail, mask;
6291da177e4SLinus Torvalds 
6301da177e4SLinus Torvalds 	switch (cmd) {
6311da177e4SLinus Torvalds 		case FIONREAD:
632ebec73f4SAl Viro 			__pipe_lock(pipe);
6331da177e4SLinus Torvalds 			count = 0;
6348cefc107SDavid Howells 			head = pipe->head;
6358cefc107SDavid Howells 			tail = pipe->tail;
6368cefc107SDavid Howells 			mask = pipe->ring_size - 1;
6378cefc107SDavid Howells 
6388cefc107SDavid Howells 			while (tail != head) {
6398cefc107SDavid Howells 				count += pipe->bufs[tail & mask].len;
6408cefc107SDavid Howells 				tail++;
6411da177e4SLinus Torvalds 			}
642ebec73f4SAl Viro 			__pipe_unlock(pipe);
643923f4f23SIngo Molnar 
6441da177e4SLinus Torvalds 			return put_user(count, (int __user *)arg);
6451da177e4SLinus Torvalds 		default:
64646ce341bSWill Deacon 			return -ENOIOCTLCMD;
6471da177e4SLinus Torvalds 	}
6481da177e4SLinus Torvalds }
6491da177e4SLinus Torvalds 
650dd67081bSChristoph Hellwig /* No kernel lock held - fine */
651a11e1d43SLinus Torvalds static __poll_t
652a11e1d43SLinus Torvalds pipe_poll(struct file *filp, poll_table *wait)
653dd67081bSChristoph Hellwig {
654a11e1d43SLinus Torvalds 	__poll_t mask;
655dd67081bSChristoph Hellwig 	struct pipe_inode_info *pipe = filp->private_data;
656ad910e36SLinus Torvalds 	unsigned int head, tail;
657a11e1d43SLinus Torvalds 
658ad910e36SLinus Torvalds 	/*
6590ddad21dSLinus Torvalds 	 * Reading pipe state only -- no need for acquiring the semaphore.
660ad910e36SLinus Torvalds 	 *
661ad910e36SLinus Torvalds 	 * But because this is racy, the code has to add the
662ad910e36SLinus Torvalds 	 * entry to the poll table _first_ ..
663ad910e36SLinus Torvalds 	 */
6640ddad21dSLinus Torvalds 	if (filp->f_mode & FMODE_READ)
6650ddad21dSLinus Torvalds 		poll_wait(filp, &pipe->rd_wait, wait);
6660ddad21dSLinus Torvalds 	if (filp->f_mode & FMODE_WRITE)
6670ddad21dSLinus Torvalds 		poll_wait(filp, &pipe->wr_wait, wait);
6681da177e4SLinus Torvalds 
669ad910e36SLinus Torvalds 	/*
670ad910e36SLinus Torvalds 	 * .. and only then can you do the racy tests. That way,
671ad910e36SLinus Torvalds 	 * if something changes and you got it wrong, the poll
672ad910e36SLinus Torvalds 	 * table entry will wake you up and fix it.
673ad910e36SLinus Torvalds 	 */
674ad910e36SLinus Torvalds 	head = READ_ONCE(pipe->head);
675ad910e36SLinus Torvalds 	tail = READ_ONCE(pipe->tail);
676ad910e36SLinus Torvalds 
677a11e1d43SLinus Torvalds 	mask = 0;
6781da177e4SLinus Torvalds 	if (filp->f_mode & FMODE_READ) {
6798cefc107SDavid Howells 		if (!pipe_empty(head, tail))
6808cefc107SDavid Howells 			mask |= EPOLLIN | EPOLLRDNORM;
681923f4f23SIngo Molnar 		if (!pipe->writers && filp->f_version != pipe->w_counter)
682a9a08845SLinus Torvalds 			mask |= EPOLLHUP;
6831da177e4SLinus Torvalds 	}
6841da177e4SLinus Torvalds 
6851da177e4SLinus Torvalds 	if (filp->f_mode & FMODE_WRITE) {
6866718b6f8SDavid Howells 		if (!pipe_full(head, tail, pipe->max_usage))
6878cefc107SDavid Howells 			mask |= EPOLLOUT | EPOLLWRNORM;
6885e5d7a22SPekka Enberg 		/*
689a9a08845SLinus Torvalds 		 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
6905e5d7a22SPekka Enberg 		 * behave exactly like pipes for poll().
6915e5d7a22SPekka Enberg 		 */
692923f4f23SIngo Molnar 		if (!pipe->readers)
693a9a08845SLinus Torvalds 			mask |= EPOLLERR;
6941da177e4SLinus Torvalds 	}
6951da177e4SLinus Torvalds 
6961da177e4SLinus Torvalds 	return mask;
6971da177e4SLinus Torvalds }
6981da177e4SLinus Torvalds 
699b0d8d229SLinus Torvalds static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
700b0d8d229SLinus Torvalds {
701b0d8d229SLinus Torvalds 	int kill = 0;
702b0d8d229SLinus Torvalds 
703b0d8d229SLinus Torvalds 	spin_lock(&inode->i_lock);
704b0d8d229SLinus Torvalds 	if (!--pipe->files) {
705b0d8d229SLinus Torvalds 		inode->i_pipe = NULL;
706b0d8d229SLinus Torvalds 		kill = 1;
707b0d8d229SLinus Torvalds 	}
708b0d8d229SLinus Torvalds 	spin_unlock(&inode->i_lock);
709b0d8d229SLinus Torvalds 
710b0d8d229SLinus Torvalds 	if (kill)
711b0d8d229SLinus Torvalds 		free_pipe_info(pipe);
712b0d8d229SLinus Torvalds }
713b0d8d229SLinus Torvalds 
7141da177e4SLinus Torvalds static int
715599a0ac1SAl Viro pipe_release(struct inode *inode, struct file *file)
7161da177e4SLinus Torvalds {
717b0d8d229SLinus Torvalds 	struct pipe_inode_info *pipe = file->private_data;
718923f4f23SIngo Molnar 
719ebec73f4SAl Viro 	__pipe_lock(pipe);
720599a0ac1SAl Viro 	if (file->f_mode & FMODE_READ)
721599a0ac1SAl Viro 		pipe->readers--;
722599a0ac1SAl Viro 	if (file->f_mode & FMODE_WRITE)
723599a0ac1SAl Viro 		pipe->writers--;
724341b446bSIngo Molnar 
7256551d5c5SLinus Torvalds 	/* Was that the last reader or writer, but not the other side? */
7266551d5c5SLinus Torvalds 	if (!pipe->readers != !pipe->writers) {
7276551d5c5SLinus Torvalds 		wake_up_interruptible_all(&pipe->rd_wait);
7286551d5c5SLinus Torvalds 		wake_up_interruptible_all(&pipe->wr_wait);
729923f4f23SIngo Molnar 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
730923f4f23SIngo Molnar 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
7311da177e4SLinus Torvalds 	}
732ebec73f4SAl Viro 	__pipe_unlock(pipe);
733ba5bb147SAl Viro 
734b0d8d229SLinus Torvalds 	put_pipe_info(inode, pipe);
7351da177e4SLinus Torvalds 	return 0;
7361da177e4SLinus Torvalds }
7371da177e4SLinus Torvalds 
7381da177e4SLinus Torvalds static int
739599a0ac1SAl Viro pipe_fasync(int fd, struct file *filp, int on)
7401da177e4SLinus Torvalds {
741de32ec4cSAl Viro 	struct pipe_inode_info *pipe = filp->private_data;
742599a0ac1SAl Viro 	int retval = 0;
7431da177e4SLinus Torvalds 
744ebec73f4SAl Viro 	__pipe_lock(pipe);
745599a0ac1SAl Viro 	if (filp->f_mode & FMODE_READ)
746341b446bSIngo Molnar 		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
747599a0ac1SAl Viro 	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
748341b446bSIngo Molnar 		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
749599a0ac1SAl Viro 		if (retval < 0 && (filp->f_mode & FMODE_READ))
750599a0ac1SAl Viro 			/* this can happen only if on == T */
751e5bc49baSOleg Nesterov 			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
752e5bc49baSOleg Nesterov 	}
753ebec73f4SAl Viro 	__pipe_unlock(pipe);
7541da177e4SLinus Torvalds 	return retval;
7551da177e4SLinus Torvalds }
7561da177e4SLinus Torvalds 
7579c87bcf0SMichael Kerrisk (man-pages) static unsigned long account_pipe_buffers(struct user_struct *user,
758759c0114SWilly Tarreau                                  unsigned long old, unsigned long new)
759759c0114SWilly Tarreau {
7609c87bcf0SMichael Kerrisk (man-pages) 	return atomic_long_add_return(new - old, &user->pipe_bufs);
761759c0114SWilly Tarreau }
762759c0114SWilly Tarreau 
7639c87bcf0SMichael Kerrisk (man-pages) static bool too_many_pipe_buffers_soft(unsigned long user_bufs)
764759c0114SWilly Tarreau {
765f7340761SEric Biggers 	unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);
766f7340761SEric Biggers 
767f7340761SEric Biggers 	return soft_limit && user_bufs > soft_limit;
768759c0114SWilly Tarreau }
769759c0114SWilly Tarreau 
7709c87bcf0SMichael Kerrisk (man-pages) static bool too_many_pipe_buffers_hard(unsigned long user_bufs)
771759c0114SWilly Tarreau {
772f7340761SEric Biggers 	unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);
773f7340761SEric Biggers 
774f7340761SEric Biggers 	return hard_limit && user_bufs > hard_limit;
775759c0114SWilly Tarreau }
776759c0114SWilly Tarreau 
77785c2dd54SEric Biggers static bool is_unprivileged_user(void)
77885c2dd54SEric Biggers {
77985c2dd54SEric Biggers 	return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
78085c2dd54SEric Biggers }
78185c2dd54SEric Biggers 
7827bee130eSAl Viro struct pipe_inode_info *alloc_pipe_info(void)
7833a326a2cSIngo Molnar {
784923f4f23SIngo Molnar 	struct pipe_inode_info *pipe;
785759c0114SWilly Tarreau 	unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
786759c0114SWilly Tarreau 	struct user_struct *user = get_current_user();
7879c87bcf0SMichael Kerrisk (man-pages) 	unsigned long user_bufs;
788f7340761SEric Biggers 	unsigned int max_size = READ_ONCE(pipe_max_size);
789759c0114SWilly Tarreau 
79009b4d199SMichael Kerrisk (man-pages) 	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
79109b4d199SMichael Kerrisk (man-pages) 	if (pipe == NULL)
79209b4d199SMichael Kerrisk (man-pages) 		goto out_free_uid;
79309b4d199SMichael Kerrisk (man-pages) 
794f7340761SEric Biggers 	if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
795f7340761SEric Biggers 		pipe_bufs = max_size >> PAGE_SHIFT;
796086e774aSMichael Kerrisk (man-pages) 
7979c87bcf0SMichael Kerrisk (man-pages) 	user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
798a005ca0eSMichael Kerrisk (man-pages) 
79985c2dd54SEric Biggers 	if (too_many_pipe_buffers_soft(user_bufs) && is_unprivileged_user()) {
8009c87bcf0SMichael Kerrisk (man-pages) 		user_bufs = account_pipe_buffers(user, pipe_bufs, 1);
801759c0114SWilly Tarreau 		pipe_bufs = 1;
802759c0114SWilly Tarreau 	}
803759c0114SWilly Tarreau 
80485c2dd54SEric Biggers 	if (too_many_pipe_buffers_hard(user_bufs) && is_unprivileged_user())
805a005ca0eSMichael Kerrisk (man-pages) 		goto out_revert_acct;
806a005ca0eSMichael Kerrisk (man-pages) 
807a005ca0eSMichael Kerrisk (man-pages) 	pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
808a005ca0eSMichael Kerrisk (man-pages) 			     GFP_KERNEL_ACCOUNT);
809a005ca0eSMichael Kerrisk (man-pages) 
81035f3d14dSJens Axboe 	if (pipe->bufs) {
8110ddad21dSLinus Torvalds 		init_waitqueue_head(&pipe->rd_wait);
8120ddad21dSLinus Torvalds 		init_waitqueue_head(&pipe->wr_wait);
813923f4f23SIngo Molnar 		pipe->r_counter = pipe->w_counter = 1;
8146718b6f8SDavid Howells 		pipe->max_usage = pipe_bufs;
8158cefc107SDavid Howells 		pipe->ring_size = pipe_bufs;
816759c0114SWilly Tarreau 		pipe->user = user;
81772b0d9aaSAl Viro 		mutex_init(&pipe->mutex);
81835f3d14dSJens Axboe 		return pipe;
81935f3d14dSJens Axboe 	}
8203a326a2cSIngo Molnar 
821a005ca0eSMichael Kerrisk (man-pages) out_revert_acct:
8229c87bcf0SMichael Kerrisk (man-pages) 	(void) account_pipe_buffers(user, pipe_bufs, 0);
82309b4d199SMichael Kerrisk (man-pages) 	kfree(pipe);
82409b4d199SMichael Kerrisk (man-pages) out_free_uid:
82509b4d199SMichael Kerrisk (man-pages) 	free_uid(user);
82635f3d14dSJens Axboe 	return NULL;
8273a326a2cSIngo Molnar }
8283a326a2cSIngo Molnar 
8294b8a8f1eSAl Viro void free_pipe_info(struct pipe_inode_info *pipe)
8301da177e4SLinus Torvalds {
8311da177e4SLinus Torvalds 	int i;
8321da177e4SLinus Torvalds 
8338cefc107SDavid Howells 	(void) account_pipe_buffers(pipe->user, pipe->ring_size, 0);
834759c0114SWilly Tarreau 	free_uid(pipe->user);
8358cefc107SDavid Howells 	for (i = 0; i < pipe->ring_size; i++) {
836923f4f23SIngo Molnar 		struct pipe_buffer *buf = pipe->bufs + i;
8371da177e4SLinus Torvalds 		if (buf->ops)
838a779638cSMiklos Szeredi 			pipe_buf_release(pipe, buf);
8391da177e4SLinus Torvalds 	}
840923f4f23SIngo Molnar 	if (pipe->tmp_page)
841923f4f23SIngo Molnar 		__free_page(pipe->tmp_page);
84235f3d14dSJens Axboe 	kfree(pipe->bufs);
843923f4f23SIngo Molnar 	kfree(pipe);
8441da177e4SLinus Torvalds }
8451da177e4SLinus Torvalds 
/*
 * Mount used for anonymous pipes: get_pipe_inode() allocates inodes from
 * pipe_mnt->mnt_sb and create_pipe_files() passes this mount to
 * alloc_file_pseudo().
 */
846fa3536ccSEric Dumazet static struct vfsmount *pipe_mnt __read_mostly;
847341b446bSIngo Molnar 
848c23fbb6bSEric Dumazet /*
849c23fbb6bSEric Dumazet  * pipefs_dname() is called from d_path().
850c23fbb6bSEric Dumazet  */
851c23fbb6bSEric Dumazet static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
852c23fbb6bSEric Dumazet {
853c23fbb6bSEric Dumazet 	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
85475c3cfa8SDavid Howells 				d_inode(dentry)->i_ino);
855c23fbb6bSEric Dumazet }
856c23fbb6bSEric Dumazet 
/*
 * Dentry operations table; only ->d_dname is set, pointing at
 * pipefs_dname() which generates the "pipe:[ino]" name.
 */
8573ba13d17SAl Viro static const struct dentry_operations pipefs_dentry_operations = {
858c23fbb6bSEric Dumazet 	.d_dname	= pipefs_dname,
8591da177e4SLinus Torvalds };
8601da177e4SLinus Torvalds 
8611da177e4SLinus Torvalds static struct inode * get_pipe_inode(void)
8621da177e4SLinus Torvalds {
863a209dfc7SEric Dumazet 	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
864923f4f23SIngo Molnar 	struct pipe_inode_info *pipe;
8651da177e4SLinus Torvalds 
8661da177e4SLinus Torvalds 	if (!inode)
8671da177e4SLinus Torvalds 		goto fail_inode;
8681da177e4SLinus Torvalds 
86985fe4025SChristoph Hellwig 	inode->i_ino = get_next_ino();
87085fe4025SChristoph Hellwig 
8717bee130eSAl Viro 	pipe = alloc_pipe_info();
872923f4f23SIngo Molnar 	if (!pipe)
8731da177e4SLinus Torvalds 		goto fail_iput;
8743a326a2cSIngo Molnar 
875ba5bb147SAl Viro 	inode->i_pipe = pipe;
876ba5bb147SAl Viro 	pipe->files = 2;
877923f4f23SIngo Molnar 	pipe->readers = pipe->writers = 1;
878599a0ac1SAl Viro 	inode->i_fop = &pipefifo_fops;
8791da177e4SLinus Torvalds 
8801da177e4SLinus Torvalds 	/*
8811da177e4SLinus Torvalds 	 * Mark the inode dirty from the very beginning,
8821da177e4SLinus Torvalds 	 * that way it will never be moved to the dirty
8831da177e4SLinus Torvalds 	 * list because "mark_inode_dirty()" will think
8841da177e4SLinus Torvalds 	 * that it already _is_ on the dirty list.
8851da177e4SLinus Torvalds 	 */
8861da177e4SLinus Torvalds 	inode->i_state = I_DIRTY;
8871da177e4SLinus Torvalds 	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
888da9592edSDavid Howells 	inode->i_uid = current_fsuid();
889da9592edSDavid Howells 	inode->i_gid = current_fsgid();
890078cd827SDeepa Dinamani 	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
891923f4f23SIngo Molnar 
8921da177e4SLinus Torvalds 	return inode;
8931da177e4SLinus Torvalds 
8941da177e4SLinus Torvalds fail_iput:
8951da177e4SLinus Torvalds 	iput(inode);
896341b446bSIngo Molnar 
8971da177e4SLinus Torvalds fail_inode:
8981da177e4SLinus Torvalds 	return NULL;
8991da177e4SLinus Torvalds }
9001da177e4SLinus Torvalds 
901e4fad8e5SAl Viro int create_pipe_files(struct file **res, int flags)
9021da177e4SLinus Torvalds {
903e4fad8e5SAl Viro 	struct inode *inode = get_pipe_inode();
904d6cbd281SAndi Kleen 	struct file *f;
9051da177e4SLinus Torvalds 
9061da177e4SLinus Torvalds 	if (!inode)
907e4fad8e5SAl Viro 		return -ENFILE;
9081da177e4SLinus Torvalds 
909152b6372SAl Viro 	f = alloc_file_pseudo(inode, pipe_mnt, "",
910152b6372SAl Viro 				O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
911c9c554f2SAl Viro 				&pipefifo_fops);
912e9bb1f9bSEric Biggers 	if (IS_ERR(f)) {
9134b8a8f1eSAl Viro 		free_pipe_info(inode->i_pipe);
914d6cbd281SAndi Kleen 		iput(inode);
915152b6372SAl Viro 		return PTR_ERR(f);
9161da177e4SLinus Torvalds 	}
9171da177e4SLinus Torvalds 
9181da177e4SLinus Torvalds 	f->private_data = inode->i_pipe;
9191da177e4SLinus Torvalds 
920183266f2SAl Viro 	res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
921c9c554f2SAl Viro 				  &pipefifo_fops);
9221da177e4SLinus Torvalds 	if (IS_ERR(res[0])) {
923b10a4a9fSAl Viro 		put_pipe_info(inode, inode->i_pipe);
924b10a4a9fSAl Viro 		fput(f);
925b10a4a9fSAl Viro 		return PTR_ERR(res[0]);
9261da177e4SLinus Torvalds 	}
9271da177e4SLinus Torvalds 	res[0]->private_data = inode->i_pipe;
9281da177e4SLinus Torvalds 	res[1] = f;
929d8e464ecSLinus Torvalds 	stream_open(inode, res[0]);
930d8e464ecSLinus Torvalds 	stream_open(inode, res[1]);
9311da177e4SLinus Torvalds 	return 0;
932d6cbd281SAndi Kleen }
933d6cbd281SAndi Kleen 
9345b249b1bSAl Viro static int __do_pipe_flags(int *fd, struct file **files, int flags)
935d6cbd281SAndi Kleen {
936d6cbd281SAndi Kleen 	int error;
937d6cbd281SAndi Kleen 	int fdw, fdr;
938d6cbd281SAndi Kleen 
9399883035aSLinus Torvalds 	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
940ed8cae8bSUlrich Drepper 		return -EINVAL;
941ed8cae8bSUlrich Drepper 
942e4fad8e5SAl Viro 	error = create_pipe_files(files, flags);
943e4fad8e5SAl Viro 	if (error)
944e4fad8e5SAl Viro 		return error;
945d6cbd281SAndi Kleen 
946ed8cae8bSUlrich Drepper 	error = get_unused_fd_flags(flags);
947d6cbd281SAndi Kleen 	if (error < 0)
948d6cbd281SAndi Kleen 		goto err_read_pipe;
949d6cbd281SAndi Kleen 	fdr = error;
950d6cbd281SAndi Kleen 
951ed8cae8bSUlrich Drepper 	error = get_unused_fd_flags(flags);
952d6cbd281SAndi Kleen 	if (error < 0)
953d6cbd281SAndi Kleen 		goto err_fdr;
954d6cbd281SAndi Kleen 	fdw = error;
955d6cbd281SAndi Kleen 
956157cf649SAl Viro 	audit_fd_pair(fdr, fdw);
957d6cbd281SAndi Kleen 	fd[0] = fdr;
958d6cbd281SAndi Kleen 	fd[1] = fdw;
9591da177e4SLinus Torvalds 	return 0;
9601da177e4SLinus Torvalds 
961d6cbd281SAndi Kleen  err_fdr:
962d6cbd281SAndi Kleen 	put_unused_fd(fdr);
963d6cbd281SAndi Kleen  err_read_pipe:
964e4fad8e5SAl Viro 	fput(files[0]);
965e4fad8e5SAl Viro 	fput(files[1]);
9661da177e4SLinus Torvalds 	return error;
9671da177e4SLinus Torvalds }
9681da177e4SLinus Torvalds 
/*
 * do_pipe_flags - create a pipe and install both descriptors.
 * @fd:    kernel array receiving the read (fd[0]) and write (fd[1]) descriptors
 * @flags: passed through to __do_pipe_flags()
 *
 * Returns 0 on success or the negative errno from __do_pipe_flags().
 */
int do_pipe_flags(int *fd, int flags)
{
	struct file *files[2];
	int error;

	error = __do_pipe_flags(fd, files, flags);
	if (error)
		return error;

	fd_install(fd[0], files[0]);
	fd_install(fd[1], files[1]);
	return 0;
}
9795b249b1bSAl Viro 
9801da177e4SLinus Torvalds /*
981d35c7b0eSUlrich Drepper  * sys_pipe() is the normal C calling standard for creating
982d35c7b0eSUlrich Drepper  * a pipe. It's not the way Unix traditionally does this, though.
983d35c7b0eSUlrich Drepper  */
9840a216dd1SDominik Brodowski static int do_pipe2(int __user *fildes, int flags)
985d35c7b0eSUlrich Drepper {
9865b249b1bSAl Viro 	struct file *files[2];
987d35c7b0eSUlrich Drepper 	int fd[2];
988d35c7b0eSUlrich Drepper 	int error;
989d35c7b0eSUlrich Drepper 
9905b249b1bSAl Viro 	error = __do_pipe_flags(fd, files, flags);
991d35c7b0eSUlrich Drepper 	if (!error) {
9925b249b1bSAl Viro 		if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
9935b249b1bSAl Viro 			fput(files[0]);
9945b249b1bSAl Viro 			fput(files[1]);
9955b249b1bSAl Viro 			put_unused_fd(fd[0]);
9965b249b1bSAl Viro 			put_unused_fd(fd[1]);
997d35c7b0eSUlrich Drepper 			error = -EFAULT;
9985b249b1bSAl Viro 		} else {
9995b249b1bSAl Viro 			fd_install(fd[0], files[0]);
10005b249b1bSAl Viro 			fd_install(fd[1], files[1]);
1001d35c7b0eSUlrich Drepper 		}
1002ba719baeSUlrich Drepper 	}
1003d35c7b0eSUlrich Drepper 	return error;
1004d35c7b0eSUlrich Drepper }
1005d35c7b0eSUlrich Drepper 
10060a216dd1SDominik Brodowski SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
10070a216dd1SDominik Brodowski {
10080a216dd1SDominik Brodowski 	return do_pipe2(fildes, flags);
10090a216dd1SDominik Brodowski }
10100a216dd1SDominik Brodowski 
10112b664219SHeiko Carstens SYSCALL_DEFINE1(pipe, int __user *, fildes)
1012ed8cae8bSUlrich Drepper {
10130a216dd1SDominik Brodowski 	return do_pipe2(fildes, 0);
1014ed8cae8bSUlrich Drepper }
1015ed8cae8bSUlrich Drepper 
1016fc7478a2SAl Viro static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
1017f776c738SAl Viro {
1018f776c738SAl Viro 	int cur = *cnt;
1019f776c738SAl Viro 
1020f776c738SAl Viro 	while (cur == *cnt) {
1021fc7478a2SAl Viro 		pipe_wait(pipe);
1022f776c738SAl Viro 		if (signal_pending(current))
1023f776c738SAl Viro 			break;
1024f776c738SAl Viro 	}
1025f776c738SAl Viro 	return cur == *cnt ? -ERESTARTSYS : 0;
1026f776c738SAl Viro }
1027f776c738SAl Viro 
1028fc7478a2SAl Viro static void wake_up_partner(struct pipe_inode_info *pipe)
1029f776c738SAl Viro {
10306551d5c5SLinus Torvalds 	wake_up_interruptible_all(&pipe->rd_wait);
10316551d5c5SLinus Torvalds 	wake_up_interruptible_all(&pipe->wr_wait);
1032f776c738SAl Viro }
1033f776c738SAl Viro 
/*
 * fifo_open - ->open() handler installed in pipefifo_fops.
 *
 * Finds the pipe attached to @inode, or allocates one on first open, takes a
 * ->files reference on it and stores it in filp->private_data. It then
 * updates the reader/writer counts for the open mode and, for inodes that
 * are not on pipefs (!is_pipe), applies the blocking rules described in the
 * per-case comments below.
 *
 * Returns 0 on success; -ENOMEM if no pipe could be allocated; -ENXIO for a
 * non-blocking write-only open of a FIFO with no readers; -ERESTARTSYS if
 * waiting for the partner ended without the partner's counter changing; or
 * -EINVAL if neither FMODE_READ nor FMODE_WRITE is set. Every failure after
 * the pipe has been pinned drops the ->files reference via put_pipe_info().
 */
1034f776c738SAl Viro static int fifo_open(struct inode *inode, struct file *filp)
1035f776c738SAl Viro {
1036f776c738SAl Viro 	struct pipe_inode_info *pipe;
1037599a0ac1SAl Viro 	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
1038f776c738SAl Viro 	int ret;
1039f776c738SAl Viro 
1040ba5bb147SAl Viro 	filp->f_version = 0;
1041ba5bb147SAl Viro 
	/*
	 * Pin the inode's pipe. If there is none yet, allocate one with
	 * i_lock dropped and then re-check under the lock: if a pipe has
	 * been installed in the meantime, use that one and free ours.
	 */
1042ba5bb147SAl Viro 	spin_lock(&inode->i_lock);
1043ba5bb147SAl Viro 	if (inode->i_pipe) {
1044f776c738SAl Viro 		pipe = inode->i_pipe;
1045ba5bb147SAl Viro 		pipe->files++;
1046ba5bb147SAl Viro 		spin_unlock(&inode->i_lock);
1047ba5bb147SAl Viro 	} else {
1048ba5bb147SAl Viro 		spin_unlock(&inode->i_lock);
10497bee130eSAl Viro 		pipe = alloc_pipe_info();
1050f776c738SAl Viro 		if (!pipe)
1051ba5bb147SAl Viro 			return -ENOMEM;
1052ba5bb147SAl Viro 		pipe->files = 1;
1053ba5bb147SAl Viro 		spin_lock(&inode->i_lock);
1054ba5bb147SAl Viro 		if (unlikely(inode->i_pipe)) {
1055ba5bb147SAl Viro 			inode->i_pipe->files++;
1056ba5bb147SAl Viro 			spin_unlock(&inode->i_lock);
10574b8a8f1eSAl Viro 			free_pipe_info(pipe);
1058ba5bb147SAl Viro 			pipe = inode->i_pipe;
1059ba5bb147SAl Viro 		} else {
1060f776c738SAl Viro 			inode->i_pipe = pipe;
1061ba5bb147SAl Viro 			spin_unlock(&inode->i_lock);
1062f776c738SAl Viro 		}
1063ba5bb147SAl Viro 	}
1064de32ec4cSAl Viro 	filp->private_data = pipe;
1065ba5bb147SAl Viro 	/* OK, we have a pipe and it's pinned down */
1066ba5bb147SAl Viro 
1067ebec73f4SAl Viro 	__pipe_lock(pipe);
1068f776c738SAl Viro 
1069f776c738SAl Viro 	/* We can only do regular read/write on fifos */
1070d8e464ecSLinus Torvalds 	stream_open(inode, filp);
1071f776c738SAl Viro 
1072d8e464ecSLinus Torvalds 	switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
1073f776c738SAl Viro 	case FMODE_READ:
1074f776c738SAl Viro 	/*
1075f776c738SAl Viro 	 *  O_RDONLY
1076f776c738SAl Viro 	 *  POSIX.1 says that O_NONBLOCK means return with the FIFO
1077f776c738SAl Viro 	 *  opened, even when there is no process writing the FIFO.
1078f776c738SAl Viro 	 */
1079f776c738SAl Viro 		pipe->r_counter++;
		/* First reader: wake anything sleeping on the pipe's wait queues. */
1080f776c738SAl Viro 		if (pipe->readers++ == 0)
1081fc7478a2SAl Viro 			wake_up_partner(pipe);
1082f776c738SAl Viro 
1083599a0ac1SAl Viro 		if (!is_pipe && !pipe->writers) {
1084f776c738SAl Viro 			if ((filp->f_flags & O_NONBLOCK)) {
1085a9a08845SLinus Torvalds 				/* suppress EPOLLHUP until we have
1086f776c738SAl Viro 				 * seen a writer */
1087f776c738SAl Viro 				filp->f_version = pipe->w_counter;
1088f776c738SAl Viro 			} else {
1089fc7478a2SAl Viro 				if (wait_for_partner(pipe, &pipe->w_counter))
1090f776c738SAl Viro 					goto err_rd;
1091f776c738SAl Viro 			}
1092f776c738SAl Viro 		}
1093f776c738SAl Viro 		break;
1094f776c738SAl Viro 
1095f776c738SAl Viro 	case FMODE_WRITE:
1096f776c738SAl Viro 	/*
1097f776c738SAl Viro 	 *  O_WRONLY
1098f776c738SAl Viro 	 *  POSIX.1 says that O_NONBLOCK means return -1 with
1099f776c738SAl Viro 	 *  errno=ENXIO when there is no process reading the FIFO.
1100f776c738SAl Viro 	 */
1101f776c738SAl Viro 		ret = -ENXIO;
1102599a0ac1SAl Viro 		if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
1103f776c738SAl Viro 			goto err;
1104f776c738SAl Viro 
1105f776c738SAl Viro 		pipe->w_counter++;
		/* First writer: wake anything sleeping on the pipe's wait queues. */
1106f776c738SAl Viro 		if (!pipe->writers++)
1107fc7478a2SAl Viro 			wake_up_partner(pipe);
1108f776c738SAl Viro 
1109599a0ac1SAl Viro 		if (!is_pipe && !pipe->readers) {
1110fc7478a2SAl Viro 			if (wait_for_partner(pipe, &pipe->r_counter))
1111f776c738SAl Viro 				goto err_wr;
1112f776c738SAl Viro 		}
1113f776c738SAl Viro 		break;
1114f776c738SAl Viro 
1115f776c738SAl Viro 	case FMODE_READ | FMODE_WRITE:
1116f776c738SAl Viro 	/*
1117f776c738SAl Viro 	 *  O_RDWR
1118f776c738SAl Viro 	 *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
1119f776c738SAl Viro 	 *  This implementation will NEVER block on a O_RDWR open, since
1120f776c738SAl Viro 	 *  the process can at least talk to itself.
1121f776c738SAl Viro 	 */
1122f776c738SAl Viro 
1123f776c738SAl Viro 		pipe->readers++;
1124f776c738SAl Viro 		pipe->writers++;
1125f776c738SAl Viro 		pipe->r_counter++;
1126f776c738SAl Viro 		pipe->w_counter++;
1127f776c738SAl Viro 		if (pipe->readers == 1 || pipe->writers == 1)
1128fc7478a2SAl Viro 			wake_up_partner(pipe);
1129f776c738SAl Viro 		break;
1130f776c738SAl Viro 
1131f776c738SAl Viro 	default:
1132f776c738SAl Viro 		ret = -EINVAL;
1133f776c738SAl Viro 		goto err;
1134f776c738SAl Viro 	}
1135f776c738SAl Viro 
1136f776c738SAl Viro 	/* Ok! */
1137ebec73f4SAl Viro 	__pipe_unlock(pipe);
1138f776c738SAl Viro 	return 0;
1139f776c738SAl Viro 
	/* Undo the reader count taken above; wake a writer if no readers remain. */
1140f776c738SAl Viro err_rd:
1141f776c738SAl Viro 	if (!--pipe->readers)
11420ddad21dSLinus Torvalds 		wake_up_interruptible(&pipe->wr_wait);
1143f776c738SAl Viro 	ret = -ERESTARTSYS;
1144f776c738SAl Viro 	goto err;
1145f776c738SAl Viro 
	/* Undo the writer count taken above; wake all readers if no writers remain. */
1146f776c738SAl Viro err_wr:
1147f776c738SAl Viro 	if (!--pipe->writers)
11486551d5c5SLinus Torvalds 		wake_up_interruptible_all(&pipe->rd_wait);
1149f776c738SAl Viro 	ret = -ERESTARTSYS;
1150f776c738SAl Viro 	goto err;
1151f776c738SAl Viro 
	/* Common failure exit: unlock, then drop the ->files reference pinned above. */
1152f776c738SAl Viro err:
1153ebec73f4SAl Viro 	__pipe_unlock(pipe);
1154b0d8d229SLinus Torvalds 
1155b0d8d229SLinus Torvalds 	put_pipe_info(inode, pipe);
1156f776c738SAl Viro 	return ret;
1157f776c738SAl Viro }
1158f776c738SAl Viro 
/*
 * File operations for pipes and FIFOs. get_pipe_inode() installs this table
 * as ->i_fop, create_pipe_files() passes it when allocating both pipe ends,
 * and get_pipe_info() identifies pipe files by comparing ->f_op against it.
 */
1159599a0ac1SAl Viro const struct file_operations pipefifo_fops = {
1160599a0ac1SAl Viro 	.open		= fifo_open,
1161599a0ac1SAl Viro 	.llseek		= no_llseek,
1162fb9096a3SAl Viro 	.read_iter	= pipe_read,
1163f0d1bec9SAl Viro 	.write_iter	= pipe_write,
1164a11e1d43SLinus Torvalds 	.poll		= pipe_poll,
1165599a0ac1SAl Viro 	.unlocked_ioctl	= pipe_ioctl,
1166599a0ac1SAl Viro 	.release	= pipe_release,
1167599a0ac1SAl Viro 	.fasync		= pipe_fasync,
1168f776c738SAl Viro };
1169f776c738SAl Viro 
1170d35c7b0eSUlrich Drepper /*
1171f491bd71SMichael Kerrisk (man-pages)  * Currently we rely on the pipe array holding a power-of-2 number
1172d3f14c48SJoe Lawrence  * of pages. Returns 0 on error.
1173f491bd71SMichael Kerrisk (man-pages)  */
117496e99be4SEric Biggers unsigned int round_pipe_size(unsigned long size)
1175f491bd71SMichael Kerrisk (man-pages) {
1176c4fed5a9SEric Biggers 	if (size > (1U << 31))
117796e99be4SEric Biggers 		return 0;
117896e99be4SEric Biggers 
11794c2e4befSEric Biggers 	/* Minimum pipe size, as required by POSIX */
11804c2e4befSEric Biggers 	if (size < PAGE_SIZE)
1181c4fed5a9SEric Biggers 		return PAGE_SIZE;
1182d3f14c48SJoe Lawrence 
1183c4fed5a9SEric Biggers 	return roundup_pow_of_two(size);
1184f491bd71SMichael Kerrisk (man-pages) }
1185f491bd71SMichael Kerrisk (man-pages) 
1186f491bd71SMichael Kerrisk (man-pages) /*
118735f3d14dSJens Axboe  * Allocate a new array of pipe buffers and copy the info over. Returns the
118835f3d14dSJens Axboe  * pipe size if successful, or return -ERROR on error.
118935f3d14dSJens Axboe  */
1190d37d4166SMichael Kerrisk (man-pages) static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
119135f3d14dSJens Axboe {
119235f3d14dSJens Axboe 	struct pipe_buffer *bufs;
11938cefc107SDavid Howells 	unsigned int size, nr_slots, head, tail, mask, n;
11949c87bcf0SMichael Kerrisk (man-pages) 	unsigned long user_bufs;
1195b0b91d18SMichael Kerrisk (man-pages) 	long ret = 0;
1196d37d4166SMichael Kerrisk (man-pages) 
1197d37d4166SMichael Kerrisk (man-pages) 	size = round_pipe_size(arg);
11988cefc107SDavid Howells 	nr_slots = size >> PAGE_SHIFT;
1199d37d4166SMichael Kerrisk (man-pages) 
12008cefc107SDavid Howells 	if (!nr_slots)
1201d37d4166SMichael Kerrisk (man-pages) 		return -EINVAL;
1202d37d4166SMichael Kerrisk (man-pages) 
1203b0b91d18SMichael Kerrisk (man-pages) 	/*
1204b0b91d18SMichael Kerrisk (man-pages) 	 * If trying to increase the pipe capacity, check that an
1205b0b91d18SMichael Kerrisk (man-pages) 	 * unprivileged user is not trying to exceed various limits
1206b0b91d18SMichael Kerrisk (man-pages) 	 * (soft limit check here, hard limit check just below).
1207b0b91d18SMichael Kerrisk (man-pages) 	 * Decreasing the pipe capacity is always permitted, even
1208b0b91d18SMichael Kerrisk (man-pages) 	 * if the user is currently over a limit.
1209b0b91d18SMichael Kerrisk (man-pages) 	 */
12108cefc107SDavid Howells 	if (nr_slots > pipe->ring_size &&
1211b0b91d18SMichael Kerrisk (man-pages) 			size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
1212d37d4166SMichael Kerrisk (man-pages) 		return -EPERM;
1213d37d4166SMichael Kerrisk (man-pages) 
12148cefc107SDavid Howells 	user_bufs = account_pipe_buffers(pipe->user, pipe->ring_size, nr_slots);
1215b0b91d18SMichael Kerrisk (man-pages) 
12168cefc107SDavid Howells 	if (nr_slots > pipe->ring_size &&
12179c87bcf0SMichael Kerrisk (man-pages) 			(too_many_pipe_buffers_hard(user_bufs) ||
12189c87bcf0SMichael Kerrisk (man-pages) 			 too_many_pipe_buffers_soft(user_bufs)) &&
121985c2dd54SEric Biggers 			is_unprivileged_user()) {
1220b0b91d18SMichael Kerrisk (man-pages) 		ret = -EPERM;
1221b0b91d18SMichael Kerrisk (man-pages) 		goto out_revert_acct;
1222b0b91d18SMichael Kerrisk (man-pages) 	}
122335f3d14dSJens Axboe 
122435f3d14dSJens Axboe 	/*
12258cefc107SDavid Howells 	 * We can shrink the pipe, if arg is greater than the ring occupancy.
12268cefc107SDavid Howells 	 * Since we don't expect a lot of shrink+grow operations, just free and
12278cefc107SDavid Howells 	 * allocate again like we would do for growing.  If the pipe currently
122835f3d14dSJens Axboe 	 * contains more buffers than arg, then return busy.
122935f3d14dSJens Axboe 	 */
12308cefc107SDavid Howells 	mask = pipe->ring_size - 1;
12318cefc107SDavid Howells 	head = pipe->head;
12328cefc107SDavid Howells 	tail = pipe->tail;
12338cefc107SDavid Howells 	n = pipe_occupancy(pipe->head, pipe->tail);
12348cefc107SDavid Howells 	if (nr_slots < n) {
1235b0b91d18SMichael Kerrisk (man-pages) 		ret = -EBUSY;
1236b0b91d18SMichael Kerrisk (man-pages) 		goto out_revert_acct;
1237b0b91d18SMichael Kerrisk (man-pages) 	}
123835f3d14dSJens Axboe 
12398cefc107SDavid Howells 	bufs = kcalloc(nr_slots, sizeof(*bufs),
1240d86133bdSVladimir Davydov 		       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
1241b0b91d18SMichael Kerrisk (man-pages) 	if (unlikely(!bufs)) {
1242b0b91d18SMichael Kerrisk (man-pages) 		ret = -ENOMEM;
1243b0b91d18SMichael Kerrisk (man-pages) 		goto out_revert_acct;
1244b0b91d18SMichael Kerrisk (man-pages) 	}
124535f3d14dSJens Axboe 
124635f3d14dSJens Axboe 	/*
124735f3d14dSJens Axboe 	 * The pipe array wraps around, so just start the new one at zero
12488cefc107SDavid Howells 	 * and adjust the indices.
124935f3d14dSJens Axboe 	 */
12508cefc107SDavid Howells 	if (n > 0) {
12518cefc107SDavid Howells 		unsigned int h = head & mask;
12528cefc107SDavid Howells 		unsigned int t = tail & mask;
12538cefc107SDavid Howells 		if (h > t) {
12548cefc107SDavid Howells 			memcpy(bufs, pipe->bufs + t,
12558cefc107SDavid Howells 			       n * sizeof(struct pipe_buffer));
12568cefc107SDavid Howells 		} else {
12578cefc107SDavid Howells 			unsigned int tsize = pipe->ring_size - t;
12588cefc107SDavid Howells 			if (h > 0)
12598cefc107SDavid Howells 				memcpy(bufs + tsize, pipe->bufs,
12608cefc107SDavid Howells 				       h * sizeof(struct pipe_buffer));
12618cefc107SDavid Howells 			memcpy(bufs, pipe->bufs + t,
12628cefc107SDavid Howells 			       tsize * sizeof(struct pipe_buffer));
12638cefc107SDavid Howells 		}
126435f3d14dSJens Axboe 	}
126535f3d14dSJens Axboe 
12668cefc107SDavid Howells 	head = n;
12678cefc107SDavid Howells 	tail = 0;
12688cefc107SDavid Howells 
126935f3d14dSJens Axboe 	kfree(pipe->bufs);
127035f3d14dSJens Axboe 	pipe->bufs = bufs;
12718cefc107SDavid Howells 	pipe->ring_size = nr_slots;
12726718b6f8SDavid Howells 	pipe->max_usage = nr_slots;
12738cefc107SDavid Howells 	pipe->tail = tail;
12748cefc107SDavid Howells 	pipe->head = head;
12756551d5c5SLinus Torvalds 
12766551d5c5SLinus Torvalds 	/* This might have made more room for writers */
12776551d5c5SLinus Torvalds 	wake_up_interruptible(&pipe->wr_wait);
12786718b6f8SDavid Howells 	return pipe->max_usage * PAGE_SIZE;
1279b0b91d18SMichael Kerrisk (man-pages) 
1280b0b91d18SMichael Kerrisk (man-pages) out_revert_acct:
12818cefc107SDavid Howells 	(void) account_pipe_buffers(pipe->user, nr_slots, pipe->ring_size);
1282b0b91d18SMichael Kerrisk (man-pages) 	return ret;
128335f3d14dSJens Axboe }
128435f3d14dSJens Axboe 
1285ff9da691SJens Axboe /*
128672083646SLinus Torvalds  * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
128772083646SLinus Torvalds  * location, so checking ->i_pipe is not enough to verify that this is a
128872083646SLinus Torvalds  * pipe.
128972083646SLinus Torvalds  */
129072083646SLinus Torvalds struct pipe_inode_info *get_pipe_info(struct file *file)
129172083646SLinus Torvalds {
1292de32ec4cSAl Viro 	return file->f_op == &pipefifo_fops ? file->private_data : NULL;
129372083646SLinus Torvalds }
129472083646SLinus Torvalds 
129535f3d14dSJens Axboe long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
129635f3d14dSJens Axboe {
129735f3d14dSJens Axboe 	struct pipe_inode_info *pipe;
129835f3d14dSJens Axboe 	long ret;
129935f3d14dSJens Axboe 
1300c66fb347SLinus Torvalds 	pipe = get_pipe_info(file);
130135f3d14dSJens Axboe 	if (!pipe)
130235f3d14dSJens Axboe 		return -EBADF;
130335f3d14dSJens Axboe 
1304ebec73f4SAl Viro 	__pipe_lock(pipe);
130535f3d14dSJens Axboe 
130635f3d14dSJens Axboe 	switch (cmd) {
1307d37d4166SMichael Kerrisk (man-pages) 	case F_SETPIPE_SZ:
1308d37d4166SMichael Kerrisk (man-pages) 		ret = pipe_set_size(pipe, arg);
130935f3d14dSJens Axboe 		break;
131035f3d14dSJens Axboe 	case F_GETPIPE_SZ:
13116718b6f8SDavid Howells 		ret = pipe->max_usage * PAGE_SIZE;
131235f3d14dSJens Axboe 		break;
131335f3d14dSJens Axboe 	default:
131435f3d14dSJens Axboe 		ret = -EINVAL;
131535f3d14dSJens Axboe 		break;
131635f3d14dSJens Axboe 	}
131735f3d14dSJens Axboe 
1318ebec73f4SAl Viro 	__pipe_unlock(pipe);
131935f3d14dSJens Axboe 	return ret;
132035f3d14dSJens Axboe }
132135f3d14dSJens Axboe 
1322ff0c7d15SNick Piggin static const struct super_operations pipefs_ops = {
1323ff0c7d15SNick Piggin 	.destroy_inode = free_inode_nonrcu,
1324d70ef97bSPavel Emelyanov 	.statfs = simple_statfs,
1325ff0c7d15SNick Piggin };
1326ff0c7d15SNick Piggin 
132735f3d14dSJens Axboe /*
13281da177e4SLinus Torvalds  * pipefs should _never_ be mounted by userland - too much of security hassle,
13291da177e4SLinus Torvalds  * no real gain from having the whole whorehouse mounted. So we don't need
13301da177e4SLinus Torvalds  * any operations on the root directory. However, we need a non-trivial
13311da177e4SLinus Torvalds  * d_name - pipe: will go nicely and kill the special-casing in procfs.
13321da177e4SLinus Torvalds  */
13334fa7ec5dSDavid Howells 
13344fa7ec5dSDavid Howells static int pipefs_init_fs_context(struct fs_context *fc)
13351da177e4SLinus Torvalds {
13364fa7ec5dSDavid Howells 	struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC);
13374fa7ec5dSDavid Howells 	if (!ctx)
13384fa7ec5dSDavid Howells 		return -ENOMEM;
13394fa7ec5dSDavid Howells 	ctx->ops = &pipefs_ops;
13404fa7ec5dSDavid Howells 	ctx->dops = &pipefs_dentry_operations;
13414fa7ec5dSDavid Howells 	return 0;
13421da177e4SLinus Torvalds }
13431da177e4SLinus Torvalds 
13441da177e4SLinus Torvalds static struct file_system_type pipe_fs_type = {
13451da177e4SLinus Torvalds 	.name		= "pipefs",
13464fa7ec5dSDavid Howells 	.init_fs_context = pipefs_init_fs_context,
13471da177e4SLinus Torvalds 	.kill_sb	= kill_anon_super,
13481da177e4SLinus Torvalds };
13491da177e4SLinus Torvalds 
13501da177e4SLinus Torvalds static int __init init_pipe_fs(void)
13511da177e4SLinus Torvalds {
13521da177e4SLinus Torvalds 	int err = register_filesystem(&pipe_fs_type);
1353341b446bSIngo Molnar 
13541da177e4SLinus Torvalds 	if (!err) {
13551da177e4SLinus Torvalds 		pipe_mnt = kern_mount(&pipe_fs_type);
13561da177e4SLinus Torvalds 		if (IS_ERR(pipe_mnt)) {
13571da177e4SLinus Torvalds 			err = PTR_ERR(pipe_mnt);
13581da177e4SLinus Torvalds 			unregister_filesystem(&pipe_fs_type);
13591da177e4SLinus Torvalds 		}
13601da177e4SLinus Torvalds 	}
13611da177e4SLinus Torvalds 	return err;
13621da177e4SLinus Torvalds }
13631da177e4SLinus Torvalds 
13641da177e4SLinus Torvalds fs_initcall(init_pipe_fs);
1365