1b2441318SGreg Kroah-Hartman // SPDX-License-Identifier: GPL-2.0 21da177e4SLinus Torvalds /* 31da177e4SLinus Torvalds * linux/fs/pipe.c 41da177e4SLinus Torvalds * 51da177e4SLinus Torvalds * Copyright (C) 1991, 1992, 1999 Linus Torvalds 61da177e4SLinus Torvalds */ 71da177e4SLinus Torvalds 81da177e4SLinus Torvalds #include <linux/mm.h> 91da177e4SLinus Torvalds #include <linux/file.h> 101da177e4SLinus Torvalds #include <linux/poll.h> 111da177e4SLinus Torvalds #include <linux/slab.h> 121da177e4SLinus Torvalds #include <linux/module.h> 131da177e4SLinus Torvalds #include <linux/init.h> 141da177e4SLinus Torvalds #include <linux/fs.h> 1535f3d14dSJens Axboe #include <linux/log2.h> 161da177e4SLinus Torvalds #include <linux/mount.h> 174fa7ec5dSDavid Howells #include <linux/pseudo_fs.h> 18b502bd11SMuthu Kumar #include <linux/magic.h> 191da177e4SLinus Torvalds #include <linux/pipe_fs_i.h> 201da177e4SLinus Torvalds #include <linux/uio.h> 211da177e4SLinus Torvalds #include <linux/highmem.h> 225274f052SJens Axboe #include <linux/pagemap.h> 23db349509SAl Viro #include <linux/audit.h> 24ba719baeSUlrich Drepper #include <linux/syscalls.h> 25b492e95bSJens Axboe #include <linux/fcntl.h> 26d86133bdSVladimir Davydov #include <linux/memcontrol.h> 271da177e4SLinus Torvalds 287c0f6ba6SLinus Torvalds #include <linux/uaccess.h> 291da177e4SLinus Torvalds #include <asm/ioctls.h> 301da177e4SLinus Torvalds 31599a0ac1SAl Viro #include "internal.h" 32599a0ac1SAl Viro 331da177e4SLinus Torvalds /* 34b492e95bSJens Axboe * The max size that a non-root user is allowed to grow the pipe. Can 35ff9da691SJens Axboe * be set by root in /proc/sys/fs/pipe-max-size 36b492e95bSJens Axboe */ 37ff9da691SJens Axboe unsigned int pipe_max_size = 1048576; 38ff9da691SJens Axboe 39759c0114SWilly Tarreau /* Maximum allocatable pages per user. Hard limit is unset by default, soft 40759c0114SWilly Tarreau * matches default values. 41759c0114SWilly Tarreau */ 42759c0114SWilly Tarreau unsigned long pipe_user_pages_hard; 43759c0114SWilly Tarreau unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; 44759c0114SWilly Tarreau 45b492e95bSJens Axboe /* 468cefc107SDavid Howells * We use head and tail indices that aren't masked off, except at the point of 478cefc107SDavid Howells * dereference, but rather they're allowed to wrap naturally. This means there 488cefc107SDavid Howells * isn't a dead spot in the buffer, but the ring has to be a power of two and 498cefc107SDavid Howells * <= 2^31. 508cefc107SDavid Howells * -- David Howells 2019-09-23. 511da177e4SLinus Torvalds * 521da177e4SLinus Torvalds * Reads with count = 0 should always return 0. 531da177e4SLinus Torvalds * -- Julian Bradfield 1999-06-07. 541da177e4SLinus Torvalds * 551da177e4SLinus Torvalds * FIFOs and Pipes now generate SIGIO for both readers and writers. 561da177e4SLinus Torvalds * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16 571da177e4SLinus Torvalds * 581da177e4SLinus Torvalds * pipe_read & write cleanup 591da177e4SLinus Torvalds * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 601da177e4SLinus Torvalds */ 611da177e4SLinus Torvalds 6261e0d47cSMiklos Szeredi static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) 6361e0d47cSMiklos Szeredi { 646447a3cfSAl Viro if (pipe->files) 6572b0d9aaSAl Viro mutex_lock_nested(&pipe->mutex, subclass); 6661e0d47cSMiklos Szeredi } 6761e0d47cSMiklos Szeredi 6861e0d47cSMiklos Szeredi void pipe_lock(struct pipe_inode_info *pipe) 6961e0d47cSMiklos Szeredi { 7061e0d47cSMiklos Szeredi /* 7161e0d47cSMiklos Szeredi * pipe_lock() nests non-pipe inode locks (for writing to a file) 7261e0d47cSMiklos Szeredi */ 7361e0d47cSMiklos Szeredi pipe_lock_nested(pipe, I_MUTEX_PARENT); 7461e0d47cSMiklos Szeredi } 7561e0d47cSMiklos Szeredi EXPORT_SYMBOL(pipe_lock); 7661e0d47cSMiklos Szeredi 7761e0d47cSMiklos Szeredi void pipe_unlock(struct pipe_inode_info *pipe) 7861e0d47cSMiklos Szeredi { 796447a3cfSAl Viro if (pipe->files) 8072b0d9aaSAl Viro mutex_unlock(&pipe->mutex); 8161e0d47cSMiklos Szeredi } 8261e0d47cSMiklos Szeredi EXPORT_SYMBOL(pipe_unlock); 8361e0d47cSMiklos Szeredi 84ebec73f4SAl Viro static inline void __pipe_lock(struct pipe_inode_info *pipe) 85ebec73f4SAl Viro { 86ebec73f4SAl Viro mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT); 87ebec73f4SAl Viro } 88ebec73f4SAl Viro 89ebec73f4SAl Viro static inline void __pipe_unlock(struct pipe_inode_info *pipe) 90ebec73f4SAl Viro { 91ebec73f4SAl Viro mutex_unlock(&pipe->mutex); 92ebec73f4SAl Viro } 93ebec73f4SAl Viro 9461e0d47cSMiklos Szeredi void pipe_double_lock(struct pipe_inode_info *pipe1, 9561e0d47cSMiklos Szeredi struct pipe_inode_info *pipe2) 9661e0d47cSMiklos Szeredi { 9761e0d47cSMiklos Szeredi BUG_ON(pipe1 == pipe2); 9861e0d47cSMiklos Szeredi 9961e0d47cSMiklos Szeredi if (pipe1 < pipe2) { 10061e0d47cSMiklos Szeredi pipe_lock_nested(pipe1, I_MUTEX_PARENT); 10161e0d47cSMiklos Szeredi pipe_lock_nested(pipe2, I_MUTEX_CHILD); 10261e0d47cSMiklos Szeredi } else { 103023d43c7SPeter Zijlstra pipe_lock_nested(pipe2, I_MUTEX_PARENT); 104023d43c7SPeter Zijlstra pipe_lock_nested(pipe1, I_MUTEX_CHILD); 10561e0d47cSMiklos Szeredi } 10661e0d47cSMiklos Szeredi } 10761e0d47cSMiklos Szeredi 1081da177e4SLinus Torvalds /* Drop the inode semaphore and wait for a pipe event, atomically */ 1093a326a2cSIngo Molnar void pipe_wait(struct pipe_inode_info *pipe) 1101da177e4SLinus Torvalds { 1110ddad21dSLinus Torvalds DEFINE_WAIT(rdwait); 1120ddad21dSLinus Torvalds DEFINE_WAIT(wrwait); 1131da177e4SLinus Torvalds 114d79fc0fcSIngo Molnar /* 115d79fc0fcSIngo Molnar * Pipes are system-local resources, so sleeping on them 116d79fc0fcSIngo Molnar * is considered a noninteractive wait: 117d79fc0fcSIngo Molnar */ 1180ddad21dSLinus Torvalds prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE); 1190ddad21dSLinus Torvalds prepare_to_wait(&pipe->wr_wait, &wrwait, TASK_INTERRUPTIBLE); 12061e0d47cSMiklos Szeredi pipe_unlock(pipe); 1211da177e4SLinus Torvalds schedule(); 1220ddad21dSLinus Torvalds finish_wait(&pipe->rd_wait, &rdwait); 1230ddad21dSLinus Torvalds finish_wait(&pipe->wr_wait, &wrwait); 12461e0d47cSMiklos Szeredi pipe_lock(pipe); 1251da177e4SLinus Torvalds } 1261da177e4SLinus Torvalds 127341b446bSIngo Molnar static void anon_pipe_buf_release(struct pipe_inode_info *pipe, 128341b446bSIngo Molnar struct pipe_buffer *buf) 1291da177e4SLinus Torvalds { 1301da177e4SLinus Torvalds struct page *page = buf->page; 1311da177e4SLinus Torvalds 1325274f052SJens Axboe /* 1335274f052SJens Axboe * If nobody else uses this page, and we don't already have a 1345274f052SJens Axboe * temporary page, let's keep track of it as a one-deep 135341b446bSIngo Molnar * allocation cache. (Otherwise just release our reference to it) 1365274f052SJens Axboe */ 137341b446bSIngo Molnar if (page_count(page) == 1 && !pipe->tmp_page) 138923f4f23SIngo Molnar pipe->tmp_page = page; 139341b446bSIngo Molnar else 14009cbfeafSKirill A. Shutemov put_page(page); 1411da177e4SLinus Torvalds } 1421da177e4SLinus Torvalds 143d86133bdSVladimir Davydov static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, 144d86133bdSVladimir Davydov struct pipe_buffer *buf) 145d86133bdSVladimir Davydov { 146d86133bdSVladimir Davydov struct page *page = buf->page; 147d86133bdSVladimir Davydov 148d86133bdSVladimir Davydov if (page_count(page) == 1) { 149*f4b00eabSRoman Gushchin memcg_kmem_uncharge_page(page, 0); 150d86133bdSVladimir Davydov __SetPageLocked(page); 151d86133bdSVladimir Davydov return 0; 152d86133bdSVladimir Davydov } 153d86133bdSVladimir Davydov return 1; 154d86133bdSVladimir Davydov } 155d86133bdSVladimir Davydov 1560845718dSJens Axboe /** 157b51d63c6SRandy Dunlap * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer 1580845718dSJens Axboe * @pipe: the pipe that the buffer belongs to 1590845718dSJens Axboe * @buf: the buffer to attempt to steal 1600845718dSJens Axboe * 1610845718dSJens Axboe * Description: 162b51d63c6SRandy Dunlap * This function attempts to steal the &struct page attached to 1630845718dSJens Axboe * @buf. If successful, this function returns 0 and returns with 1640845718dSJens Axboe * the page locked. The caller may then reuse the page for whatever 165b51d63c6SRandy Dunlap * he wishes; the typical use is insertion into a different file 1660845718dSJens Axboe * page cache. 1670845718dSJens Axboe */ 168330ab716SJens Axboe int generic_pipe_buf_steal(struct pipe_inode_info *pipe, 1695abc97aaSJens Axboe struct pipe_buffer *buf) 1705abc97aaSJens Axboe { 17146e678c9SJens Axboe struct page *page = buf->page; 17246e678c9SJens Axboe 1730845718dSJens Axboe /* 1740845718dSJens Axboe * A reference of one is golden, that means that the owner of this 1750845718dSJens Axboe * page is the only one holding a reference to it. lock the page 1760845718dSJens Axboe * and return OK. 1770845718dSJens Axboe */ 17846e678c9SJens Axboe if (page_count(page) == 1) { 17946e678c9SJens Axboe lock_page(page); 1805abc97aaSJens Axboe return 0; 1815abc97aaSJens Axboe } 1825abc97aaSJens Axboe 18346e678c9SJens Axboe return 1; 18446e678c9SJens Axboe } 18551921cb7SMiklos Szeredi EXPORT_SYMBOL(generic_pipe_buf_steal); 18646e678c9SJens Axboe 1870845718dSJens Axboe /** 188b51d63c6SRandy Dunlap * generic_pipe_buf_get - get a reference to a &struct pipe_buffer 1890845718dSJens Axboe * @pipe: the pipe that the buffer belongs to 1900845718dSJens Axboe * @buf: the buffer to get a reference to 1910845718dSJens Axboe * 1920845718dSJens Axboe * Description: 1930845718dSJens Axboe * This function grabs an extra reference to @buf. It's used in 1940845718dSJens Axboe * in the tee() system call, when we duplicate the buffers in one 1950845718dSJens Axboe * pipe into another. 1960845718dSJens Axboe */ 19715fab63eSMatthew Wilcox bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) 19870524490SJens Axboe { 19915fab63eSMatthew Wilcox return try_get_page(buf->page); 20070524490SJens Axboe } 20151921cb7SMiklos Szeredi EXPORT_SYMBOL(generic_pipe_buf_get); 20270524490SJens Axboe 2030845718dSJens Axboe /** 2040845718dSJens Axboe * generic_pipe_buf_confirm - verify contents of the pipe buffer 20579685b8dSRandy Dunlap * @info: the pipe that the buffer belongs to 2060845718dSJens Axboe * @buf: the buffer to confirm 2070845718dSJens Axboe * 2080845718dSJens Axboe * Description: 2090845718dSJens Axboe * This function does nothing, because the generic pipe code uses 2100845718dSJens Axboe * pages that are always good when inserted into the pipe. 2110845718dSJens Axboe */ 212cac36bb0SJens Axboe int generic_pipe_buf_confirm(struct pipe_inode_info *info, 213cac36bb0SJens Axboe struct pipe_buffer *buf) 214f84d7519SJens Axboe { 215f84d7519SJens Axboe return 0; 216f84d7519SJens Axboe } 21751921cb7SMiklos Szeredi EXPORT_SYMBOL(generic_pipe_buf_confirm); 218f84d7519SJens Axboe 2196818173bSMiklos Szeredi /** 2206818173bSMiklos Szeredi * generic_pipe_buf_release - put a reference to a &struct pipe_buffer 2216818173bSMiklos Szeredi * @pipe: the pipe that the buffer belongs to 2226818173bSMiklos Szeredi * @buf: the buffer to put a reference to 2236818173bSMiklos Szeredi * 2246818173bSMiklos Szeredi * Description: 2256818173bSMiklos Szeredi * This function releases a reference to @buf. 2266818173bSMiklos Szeredi */ 2276818173bSMiklos Szeredi void generic_pipe_buf_release(struct pipe_inode_info *pipe, 2286818173bSMiklos Szeredi struct pipe_buffer *buf) 2296818173bSMiklos Szeredi { 23009cbfeafSKirill A. Shutemov put_page(buf->page); 2316818173bSMiklos Szeredi } 23251921cb7SMiklos Szeredi EXPORT_SYMBOL(generic_pipe_buf_release); 2336818173bSMiklos Szeredi 23401e7187bSJann Horn /* New data written to a pipe may be appended to a buffer with this type. */ 235d4c3cca9SEric Dumazet static const struct pipe_buf_operations anon_pipe_buf_ops = { 236cac36bb0SJens Axboe .confirm = generic_pipe_buf_confirm, 2371da177e4SLinus Torvalds .release = anon_pipe_buf_release, 238d86133bdSVladimir Davydov .steal = anon_pipe_buf_steal, 239f84d7519SJens Axboe .get = generic_pipe_buf_get, 2401da177e4SLinus Torvalds }; 2411da177e4SLinus Torvalds 242a0ce2f0aSJann Horn static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = { 2431da177e4SLinus Torvalds .confirm = generic_pipe_buf_confirm, 2441da177e4SLinus Torvalds .release = anon_pipe_buf_release, 2451da177e4SLinus Torvalds .steal = anon_pipe_buf_steal, 2461da177e4SLinus Torvalds .get = generic_pipe_buf_get, 247923f4f23SIngo Molnar }; 2481da177e4SLinus Torvalds 2499883035aSLinus Torvalds static const struct pipe_buf_operations packet_pipe_buf_ops = { 2509883035aSLinus Torvalds .confirm = generic_pipe_buf_confirm, 2519883035aSLinus Torvalds .release = anon_pipe_buf_release, 252d86133bdSVladimir Davydov .steal = anon_pipe_buf_steal, 2539883035aSLinus Torvalds .get = generic_pipe_buf_get, 2549883035aSLinus Torvalds }; 2559883035aSLinus Torvalds 25601e7187bSJann Horn /** 25701e7187bSJann Horn * pipe_buf_mark_unmergeable - mark a &struct pipe_buffer as unmergeable 25801e7187bSJann Horn * @buf: the buffer to mark 25901e7187bSJann Horn * 26001e7187bSJann Horn * Description: 26101e7187bSJann Horn * This function ensures that no future writes will be merged into the 26201e7187bSJann Horn * given &struct pipe_buffer. This is necessary when multiple pipe buffers 26301e7187bSJann Horn * share the same backing page. 26401e7187bSJann Horn */ 265a0ce2f0aSJann Horn void pipe_buf_mark_unmergeable(struct pipe_buffer *buf) 266a0ce2f0aSJann Horn { 267a0ce2f0aSJann Horn if (buf->ops == &anon_pipe_buf_ops) 268a0ce2f0aSJann Horn buf->ops = &anon_pipe_buf_nomerge_ops; 269a0ce2f0aSJann Horn } 270a0ce2f0aSJann Horn 27101e7187bSJann Horn static bool pipe_buf_can_merge(struct pipe_buffer *buf) 27201e7187bSJann Horn { 27301e7187bSJann Horn return buf->ops == &anon_pipe_buf_ops; 27401e7187bSJann Horn } 27501e7187bSJann Horn 27685190d15SLinus Torvalds /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */ 27785190d15SLinus Torvalds static inline bool pipe_readable(const struct pipe_inode_info *pipe) 27885190d15SLinus Torvalds { 27985190d15SLinus Torvalds unsigned int head = READ_ONCE(pipe->head); 28085190d15SLinus Torvalds unsigned int tail = READ_ONCE(pipe->tail); 28185190d15SLinus Torvalds unsigned int writers = READ_ONCE(pipe->writers); 28285190d15SLinus Torvalds 28385190d15SLinus Torvalds return !pipe_empty(head, tail) || !writers; 28485190d15SLinus Torvalds } 28585190d15SLinus Torvalds 2861da177e4SLinus Torvalds static ssize_t 287fb9096a3SAl Viro pipe_read(struct kiocb *iocb, struct iov_iter *to) 2881da177e4SLinus Torvalds { 289fb9096a3SAl Viro size_t total_len = iov_iter_count(to); 290ee0b3e67SBadari Pulavarty struct file *filp = iocb->ki_filp; 291de32ec4cSAl Viro struct pipe_inode_info *pipe = filp->private_data; 2920ddad21dSLinus Torvalds bool was_full, wake_next_reader = false; 2931da177e4SLinus Torvalds ssize_t ret; 2941da177e4SLinus Torvalds 2951da177e4SLinus Torvalds /* Null read succeeds. */ 2961da177e4SLinus Torvalds if (unlikely(total_len == 0)) 2971da177e4SLinus Torvalds return 0; 2981da177e4SLinus Torvalds 2991da177e4SLinus Torvalds ret = 0; 300ebec73f4SAl Viro __pipe_lock(pipe); 301f467a6a6SLinus Torvalds 302f467a6a6SLinus Torvalds /* 303f467a6a6SLinus Torvalds * We only wake up writers if the pipe was full when we started 304f467a6a6SLinus Torvalds * reading in order to avoid unnecessary wakeups. 305f467a6a6SLinus Torvalds * 306f467a6a6SLinus Torvalds * But when we do wake up writers, we do so using a sync wakeup 307f467a6a6SLinus Torvalds * (WF_SYNC), because we want them to get going and generate more 308f467a6a6SLinus Torvalds * data for us. 309f467a6a6SLinus Torvalds */ 310f467a6a6SLinus Torvalds was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); 3111da177e4SLinus Torvalds for (;;) { 3128cefc107SDavid Howells unsigned int head = pipe->head; 3138cefc107SDavid Howells unsigned int tail = pipe->tail; 3148cefc107SDavid Howells unsigned int mask = pipe->ring_size - 1; 3158cefc107SDavid Howells 3168cefc107SDavid Howells if (!pipe_empty(head, tail)) { 3178cefc107SDavid Howells struct pipe_buffer *buf = &pipe->bufs[tail & mask]; 3181da177e4SLinus Torvalds size_t chars = buf->len; 319637b58c2SAl Viro size_t written; 320637b58c2SAl Viro int error; 3211da177e4SLinus Torvalds 3221da177e4SLinus Torvalds if (chars > total_len) 3231da177e4SLinus Torvalds chars = total_len; 3241da177e4SLinus Torvalds 325fba597dbSMiklos Szeredi error = pipe_buf_confirm(pipe, buf); 326f84d7519SJens Axboe if (error) { 3275274f052SJens Axboe if (!ret) 328e5953cbdSNicolas Kaiser ret = error; 3295274f052SJens Axboe break; 3305274f052SJens Axboe } 331f84d7519SJens Axboe 332fb9096a3SAl Viro written = copy_page_to_iter(buf->page, buf->offset, chars, to); 333637b58c2SAl Viro if (unlikely(written < chars)) { 334341b446bSIngo Molnar if (!ret) 335637b58c2SAl Viro ret = -EFAULT; 3361da177e4SLinus Torvalds break; 3371da177e4SLinus Torvalds } 3381da177e4SLinus Torvalds ret += chars; 3391da177e4SLinus Torvalds buf->offset += chars; 3401da177e4SLinus Torvalds buf->len -= chars; 3419883035aSLinus Torvalds 3429883035aSLinus Torvalds /* Was it a packet buffer? Clean up and exit */ 3439883035aSLinus Torvalds if (buf->flags & PIPE_BUF_FLAG_PACKET) { 3449883035aSLinus Torvalds total_len = chars; 3459883035aSLinus Torvalds buf->len = 0; 3469883035aSLinus Torvalds } 3479883035aSLinus Torvalds 3481da177e4SLinus Torvalds if (!buf->len) { 349a779638cSMiklos Szeredi pipe_buf_release(pipe, buf); 3500ddad21dSLinus Torvalds spin_lock_irq(&pipe->rd_wait.lock); 3518cefc107SDavid Howells tail++; 3528cefc107SDavid Howells pipe->tail = tail; 3530ddad21dSLinus Torvalds spin_unlock_irq(&pipe->rd_wait.lock); 3541da177e4SLinus Torvalds } 3551da177e4SLinus Torvalds total_len -= chars; 3561da177e4SLinus Torvalds if (!total_len) 3571da177e4SLinus Torvalds break; /* common path: read succeeded */ 3588cefc107SDavid Howells if (!pipe_empty(head, tail)) /* More to do? */ 3591da177e4SLinus Torvalds continue; 3608cefc107SDavid Howells } 3618cefc107SDavid Howells 362923f4f23SIngo Molnar if (!pipe->writers) 3631da177e4SLinus Torvalds break; 3641da177e4SLinus Torvalds if (ret) 3651da177e4SLinus Torvalds break; 3661da177e4SLinus Torvalds if (filp->f_flags & O_NONBLOCK) { 3671da177e4SLinus Torvalds ret = -EAGAIN; 3681da177e4SLinus Torvalds break; 3691da177e4SLinus Torvalds } 37085190d15SLinus Torvalds __pipe_unlock(pipe); 371d1c6a2aaSLinus Torvalds 372d1c6a2aaSLinus Torvalds /* 373d1c6a2aaSLinus Torvalds * We only get here if we didn't actually read anything. 374d1c6a2aaSLinus Torvalds * 375d1c6a2aaSLinus Torvalds * However, we could have seen (and removed) a zero-sized 376d1c6a2aaSLinus Torvalds * pipe buffer, and might have made space in the buffers 377d1c6a2aaSLinus Torvalds * that way. 378d1c6a2aaSLinus Torvalds * 379d1c6a2aaSLinus Torvalds * You can't make zero-sized pipe buffers by doing an empty 380d1c6a2aaSLinus Torvalds * write (not even in packet mode), but they can happen if 381d1c6a2aaSLinus Torvalds * the writer gets an EFAULT when trying to fill a buffer 382d1c6a2aaSLinus Torvalds * that already got allocated and inserted in the buffer 383d1c6a2aaSLinus Torvalds * array. 384d1c6a2aaSLinus Torvalds * 385d1c6a2aaSLinus Torvalds * So we still need to wake up any pending writers in the 386d1c6a2aaSLinus Torvalds * _very_ unlikely case that the pipe was full, but we got 387d1c6a2aaSLinus Torvalds * no data. 388d1c6a2aaSLinus Torvalds */ 389d1c6a2aaSLinus Torvalds if (unlikely(was_full)) { 3900ddad21dSLinus Torvalds wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); 391f467a6a6SLinus Torvalds kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 392f467a6a6SLinus Torvalds } 393d1c6a2aaSLinus Torvalds 394d1c6a2aaSLinus Torvalds /* 395d1c6a2aaSLinus Torvalds * But because we didn't read anything, at this point we can 396d1c6a2aaSLinus Torvalds * just return directly with -ERESTARTSYS if we're interrupted, 397d1c6a2aaSLinus Torvalds * since we've done any required wakeups and there's no need 398d1c6a2aaSLinus Torvalds * to mark anything accessed. And we've dropped the lock. 399d1c6a2aaSLinus Torvalds */ 4000ddad21dSLinus Torvalds if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) 401d1c6a2aaSLinus Torvalds return -ERESTARTSYS; 402d1c6a2aaSLinus Torvalds 40385190d15SLinus Torvalds __pipe_lock(pipe); 404f467a6a6SLinus Torvalds was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); 4050ddad21dSLinus Torvalds wake_next_reader = true; 4061da177e4SLinus Torvalds } 4070ddad21dSLinus Torvalds if (pipe_empty(pipe->head, pipe->tail)) 4080ddad21dSLinus Torvalds wake_next_reader = false; 409ebec73f4SAl Viro __pipe_unlock(pipe); 410341b446bSIngo Molnar 411f467a6a6SLinus Torvalds if (was_full) { 4120ddad21dSLinus Torvalds wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); 413923f4f23SIngo Molnar kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 4141da177e4SLinus Torvalds } 4150ddad21dSLinus Torvalds if (wake_next_reader) 4160ddad21dSLinus Torvalds wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); 4171da177e4SLinus Torvalds if (ret > 0) 4181da177e4SLinus Torvalds file_accessed(filp); 4191da177e4SLinus Torvalds return ret; 4201da177e4SLinus Torvalds } 4211da177e4SLinus Torvalds 4229883035aSLinus Torvalds static inline int is_packetized(struct file *file) 4239883035aSLinus Torvalds { 4249883035aSLinus Torvalds return (file->f_flags & O_DIRECT) != 0; 4259883035aSLinus Torvalds } 4269883035aSLinus Torvalds 42785190d15SLinus Torvalds /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */ 42885190d15SLinus Torvalds static inline bool pipe_writable(const struct pipe_inode_info *pipe) 42985190d15SLinus Torvalds { 43085190d15SLinus Torvalds unsigned int head = READ_ONCE(pipe->head); 43185190d15SLinus Torvalds unsigned int tail = READ_ONCE(pipe->tail); 43285190d15SLinus Torvalds unsigned int max_usage = READ_ONCE(pipe->max_usage); 43385190d15SLinus Torvalds 43485190d15SLinus Torvalds return !pipe_full(head, tail, max_usage) || 43585190d15SLinus Torvalds !READ_ONCE(pipe->readers); 43685190d15SLinus Torvalds } 43785190d15SLinus Torvalds 4381da177e4SLinus Torvalds static ssize_t 439f0d1bec9SAl Viro pipe_write(struct kiocb *iocb, struct iov_iter *from) 4401da177e4SLinus Torvalds { 441ee0b3e67SBadari Pulavarty struct file *filp = iocb->ki_filp; 442de32ec4cSAl Viro struct pipe_inode_info *pipe = filp->private_data; 4438f868d68SDavid Howells unsigned int head; 444f0d1bec9SAl Viro ssize_t ret = 0; 445f0d1bec9SAl Viro size_t total_len = iov_iter_count(from); 4461da177e4SLinus Torvalds ssize_t chars; 4471b6b26aeSLinus Torvalds bool was_empty = false; 4480ddad21dSLinus Torvalds bool wake_next_writer = false; 4491da177e4SLinus Torvalds 4501da177e4SLinus Torvalds /* Null write succeeds. */ 4511da177e4SLinus Torvalds if (unlikely(total_len == 0)) 4521da177e4SLinus Torvalds return 0; 4531da177e4SLinus Torvalds 454ebec73f4SAl Viro __pipe_lock(pipe); 4551da177e4SLinus Torvalds 456923f4f23SIngo Molnar if (!pipe->readers) { 4571da177e4SLinus Torvalds send_sig(SIGPIPE, current, 0); 4581da177e4SLinus Torvalds ret = -EPIPE; 4591da177e4SLinus Torvalds goto out; 4601da177e4SLinus Torvalds } 4611da177e4SLinus Torvalds 4621b6b26aeSLinus Torvalds /* 4631b6b26aeSLinus Torvalds * Only wake up if the pipe started out empty, since 4641b6b26aeSLinus Torvalds * otherwise there should be no readers waiting. 4651b6b26aeSLinus Torvalds * 4661b6b26aeSLinus Torvalds * If it wasn't empty we try to merge new data into 4671b6b26aeSLinus Torvalds * the last buffer. 4681b6b26aeSLinus Torvalds * 4691b6b26aeSLinus Torvalds * That naturally merges small writes, but it also 4701b6b26aeSLinus Torvalds * page-aligs the rest of the writes for large writes 4711b6b26aeSLinus Torvalds * spanning multiple pages. 4721b6b26aeSLinus Torvalds */ 4738cefc107SDavid Howells head = pipe->head; 4741b6b26aeSLinus Torvalds was_empty = pipe_empty(head, pipe->tail); 4751b6b26aeSLinus Torvalds chars = total_len & (PAGE_SIZE-1); 4761b6b26aeSLinus Torvalds if (chars && !was_empty) { 4778f868d68SDavid Howells unsigned int mask = pipe->ring_size - 1; 4788cefc107SDavid Howells struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask]; 4791da177e4SLinus Torvalds int offset = buf->offset + buf->len; 480341b446bSIngo Molnar 48101e7187bSJann Horn if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) { 482fba597dbSMiklos Szeredi ret = pipe_buf_confirm(pipe, buf); 4836ae08069SEric Biggers if (ret) 4845274f052SJens Axboe goto out; 485f84d7519SJens Axboe 486f0d1bec9SAl Viro ret = copy_page_from_iter(buf->page, offset, chars, from); 487f0d1bec9SAl Viro if (unlikely(ret < chars)) { 4886ae08069SEric Biggers ret = -EFAULT; 4891da177e4SLinus Torvalds goto out; 490f6762b7aSJens Axboe } 4911b6b26aeSLinus Torvalds 4926ae08069SEric Biggers buf->len += ret; 493f0d1bec9SAl Viro if (!iov_iter_count(from)) 4941da177e4SLinus Torvalds goto out; 4951da177e4SLinus Torvalds } 4961da177e4SLinus Torvalds } 4971da177e4SLinus Torvalds 4981da177e4SLinus Torvalds for (;;) { 499923f4f23SIngo Molnar if (!pipe->readers) { 5001da177e4SLinus Torvalds send_sig(SIGPIPE, current, 0); 501341b446bSIngo Molnar if (!ret) 502341b446bSIngo Molnar ret = -EPIPE; 5031da177e4SLinus Torvalds break; 5041da177e4SLinus Torvalds } 5058cefc107SDavid Howells 506a194dfe6SDavid Howells head = pipe->head; 5078f868d68SDavid Howells if (!pipe_full(head, pipe->tail, pipe->max_usage)) { 5088f868d68SDavid Howells unsigned int mask = pipe->ring_size - 1; 5098cefc107SDavid Howells struct pipe_buffer *buf = &pipe->bufs[head & mask]; 510923f4f23SIngo Molnar struct page *page = pipe->tmp_page; 511f0d1bec9SAl Viro int copied; 5121da177e4SLinus Torvalds 5131da177e4SLinus Torvalds if (!page) { 514d86133bdSVladimir Davydov page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); 5151da177e4SLinus Torvalds if (unlikely(!page)) { 5161da177e4SLinus Torvalds ret = ret ? : -ENOMEM; 5171da177e4SLinus Torvalds break; 5181da177e4SLinus Torvalds } 519923f4f23SIngo Molnar pipe->tmp_page = page; 5201da177e4SLinus Torvalds } 521a194dfe6SDavid Howells 522a194dfe6SDavid Howells /* Allocate a slot in the ring in advance and attach an 523a194dfe6SDavid Howells * empty buffer. If we fault or otherwise fail to use 524a194dfe6SDavid Howells * it, either the reader will consume it or it'll still 525a194dfe6SDavid Howells * be there for the next write. 526a194dfe6SDavid Howells */ 5270ddad21dSLinus Torvalds spin_lock_irq(&pipe->rd_wait.lock); 528a194dfe6SDavid Howells 529a194dfe6SDavid Howells head = pipe->head; 5308f868d68SDavid Howells if (pipe_full(head, pipe->tail, pipe->max_usage)) { 5310ddad21dSLinus Torvalds spin_unlock_irq(&pipe->rd_wait.lock); 5328df44129SDavid Howells continue; 5338df44129SDavid Howells } 5348df44129SDavid Howells 535a194dfe6SDavid Howells pipe->head = head + 1; 5360ddad21dSLinus Torvalds spin_unlock_irq(&pipe->rd_wait.lock); 537a194dfe6SDavid Howells 538a194dfe6SDavid Howells /* Insert it into the buffer array */ 539a194dfe6SDavid Howells buf = &pipe->bufs[head & mask]; 540a194dfe6SDavid Howells buf->page = page; 541a194dfe6SDavid Howells buf->ops = &anon_pipe_buf_ops; 542a194dfe6SDavid Howells buf->offset = 0; 543a194dfe6SDavid Howells buf->len = 0; 544a194dfe6SDavid Howells buf->flags = 0; 545a194dfe6SDavid Howells if (is_packetized(filp)) { 546a194dfe6SDavid Howells buf->ops = &packet_pipe_buf_ops; 547a194dfe6SDavid Howells buf->flags = PIPE_BUF_FLAG_PACKET; 548a194dfe6SDavid Howells } 549a194dfe6SDavid Howells pipe->tmp_page = NULL; 550a194dfe6SDavid Howells 551f0d1bec9SAl Viro copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); 552f0d1bec9SAl Viro if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { 553341b446bSIngo Molnar if (!ret) 554f0d1bec9SAl Viro ret = -EFAULT; 5551da177e4SLinus Torvalds break; 5561da177e4SLinus Torvalds } 557f0d1bec9SAl Viro ret += copied; 5581da177e4SLinus Torvalds buf->offset = 0; 559f0d1bec9SAl Viro buf->len = copied; 5601da177e4SLinus Torvalds 561f0d1bec9SAl Viro if (!iov_iter_count(from)) 5621da177e4SLinus Torvalds break; 5631da177e4SLinus Torvalds } 5648cefc107SDavid Howells 5658f868d68SDavid Howells if (!pipe_full(head, pipe->tail, pipe->max_usage)) 5661da177e4SLinus Torvalds continue; 5678cefc107SDavid Howells 5688cefc107SDavid Howells /* Wait for buffer space to become available. */ 5691da177e4SLinus Torvalds if (filp->f_flags & O_NONBLOCK) { 570341b446bSIngo Molnar if (!ret) 571341b446bSIngo Molnar ret = -EAGAIN; 5721da177e4SLinus Torvalds break; 5731da177e4SLinus Torvalds } 5741da177e4SLinus Torvalds if (signal_pending(current)) { 575341b446bSIngo Molnar if (!ret) 576341b446bSIngo Molnar ret = -ERESTARTSYS; 5771da177e4SLinus Torvalds break; 5781da177e4SLinus Torvalds } 5791b6b26aeSLinus Torvalds 5801b6b26aeSLinus Torvalds /* 5811b6b26aeSLinus Torvalds * We're going to release the pipe lock and wait for more 5821b6b26aeSLinus Torvalds * space. We wake up any readers if necessary, and then 5831b6b26aeSLinus Torvalds * after waiting we need to re-check whether the pipe 5841b6b26aeSLinus Torvalds * become empty while we dropped the lock. 5851b6b26aeSLinus Torvalds */ 58685190d15SLinus Torvalds __pipe_unlock(pipe); 5871b6b26aeSLinus Torvalds if (was_empty) { 5880ddad21dSLinus Torvalds wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); 5891b6b26aeSLinus Torvalds kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 5901b6b26aeSLinus Torvalds } 5910ddad21dSLinus Torvalds wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe)); 59285190d15SLinus Torvalds __pipe_lock(pipe); 5930dd1e377SJan Stancek was_empty = pipe_empty(pipe->head, pipe->tail); 5940ddad21dSLinus Torvalds wake_next_writer = true; 5951da177e4SLinus Torvalds } 5961da177e4SLinus Torvalds out: 5970ddad21dSLinus Torvalds if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) 5980ddad21dSLinus Torvalds wake_next_writer = false; 599ebec73f4SAl Viro __pipe_unlock(pipe); 6001b6b26aeSLinus Torvalds 6011b6b26aeSLinus Torvalds /* 6021b6b26aeSLinus Torvalds * If we do do a wakeup event, we do a 'sync' wakeup, because we 6031b6b26aeSLinus Torvalds * want the reader to start processing things asap, rather than 6041b6b26aeSLinus Torvalds * leave the data pending. 6051b6b26aeSLinus Torvalds * 6061b6b26aeSLinus Torvalds * This is particularly important for small writes, because of 6071b6b26aeSLinus Torvalds * how (for example) the GNU make jobserver uses small writes to 6081b6b26aeSLinus Torvalds * wake up pending jobs 6091b6b26aeSLinus Torvalds */ 6101b6b26aeSLinus Torvalds if (was_empty) { 6110ddad21dSLinus Torvalds wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); 612923f4f23SIngo Molnar kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 6131da177e4SLinus Torvalds } 6140ddad21dSLinus Torvalds if (wake_next_writer) 6150ddad21dSLinus Torvalds wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); 6167e775f46SDmitry Monakhov if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { 617c3b2da31SJosef Bacik int err = file_update_time(filp); 618c3b2da31SJosef Bacik if (err) 619c3b2da31SJosef Bacik ret = err; 6207e775f46SDmitry Monakhov sb_end_write(file_inode(filp)->i_sb); 621c3b2da31SJosef Bacik } 6221da177e4SLinus Torvalds return ret; 6231da177e4SLinus Torvalds } 6241da177e4SLinus Torvalds 625d59d0b1bSAndi Kleen static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 6261da177e4SLinus Torvalds { 627de32ec4cSAl Viro struct pipe_inode_info *pipe = filp->private_data; 6288cefc107SDavid Howells int count, head, tail, mask; 6291da177e4SLinus Torvalds 6301da177e4SLinus Torvalds switch (cmd) { 6311da177e4SLinus Torvalds case FIONREAD: 632ebec73f4SAl Viro __pipe_lock(pipe); 6331da177e4SLinus Torvalds count = 0; 6348cefc107SDavid Howells head = pipe->head; 6358cefc107SDavid Howells tail = pipe->tail; 6368cefc107SDavid Howells mask = pipe->ring_size - 1; 6378cefc107SDavid Howells 6388cefc107SDavid Howells while (tail != head) { 6398cefc107SDavid Howells count += pipe->bufs[tail & mask].len; 6408cefc107SDavid Howells tail++; 6411da177e4SLinus Torvalds } 642ebec73f4SAl Viro __pipe_unlock(pipe); 643923f4f23SIngo Molnar 6441da177e4SLinus Torvalds return put_user(count, (int __user *)arg); 6451da177e4SLinus Torvalds default: 64646ce341bSWill Deacon return -ENOIOCTLCMD; 6471da177e4SLinus Torvalds } 6481da177e4SLinus Torvalds } 6491da177e4SLinus Torvalds 650dd67081bSChristoph Hellwig /* No kernel lock held - fine */ 651a11e1d43SLinus Torvalds static __poll_t 652a11e1d43SLinus Torvalds pipe_poll(struct file *filp, poll_table *wait) 653dd67081bSChristoph Hellwig { 654a11e1d43SLinus Torvalds __poll_t mask; 655dd67081bSChristoph Hellwig struct pipe_inode_info *pipe = filp->private_data; 656ad910e36SLinus Torvalds unsigned int head, tail; 657a11e1d43SLinus Torvalds 658ad910e36SLinus Torvalds /* 6590ddad21dSLinus Torvalds * Reading pipe state only -- no need for acquiring the semaphore. 660ad910e36SLinus Torvalds * 661ad910e36SLinus Torvalds * But because this is racy, the code has to add the 662ad910e36SLinus Torvalds * entry to the poll table _first_ .. 663ad910e36SLinus Torvalds */ 6640ddad21dSLinus Torvalds if (filp->f_mode & FMODE_READ) 6650ddad21dSLinus Torvalds poll_wait(filp, &pipe->rd_wait, wait); 6660ddad21dSLinus Torvalds if (filp->f_mode & FMODE_WRITE) 6670ddad21dSLinus Torvalds poll_wait(filp, &pipe->wr_wait, wait); 6681da177e4SLinus Torvalds 669ad910e36SLinus Torvalds /* 670ad910e36SLinus Torvalds * .. and only then can you do the racy tests. That way, 671ad910e36SLinus Torvalds * if something changes and you got it wrong, the poll 672ad910e36SLinus Torvalds * table entry will wake you up and fix it. 673ad910e36SLinus Torvalds */ 674ad910e36SLinus Torvalds head = READ_ONCE(pipe->head); 675ad910e36SLinus Torvalds tail = READ_ONCE(pipe->tail); 676ad910e36SLinus Torvalds 677a11e1d43SLinus Torvalds mask = 0; 6781da177e4SLinus Torvalds if (filp->f_mode & FMODE_READ) { 6798cefc107SDavid Howells if (!pipe_empty(head, tail)) 6808cefc107SDavid Howells mask |= EPOLLIN | EPOLLRDNORM; 681923f4f23SIngo Molnar if (!pipe->writers && filp->f_version != pipe->w_counter) 682a9a08845SLinus Torvalds mask |= EPOLLHUP; 6831da177e4SLinus Torvalds } 6841da177e4SLinus Torvalds 6851da177e4SLinus Torvalds if (filp->f_mode & FMODE_WRITE) { 6866718b6f8SDavid Howells if (!pipe_full(head, tail, pipe->max_usage)) 6878cefc107SDavid Howells mask |= EPOLLOUT | EPOLLWRNORM; 6885e5d7a22SPekka Enberg /* 689a9a08845SLinus Torvalds * Most Unices do not set EPOLLERR for FIFOs but on Linux they 6905e5d7a22SPekka Enberg * behave exactly like pipes for poll(). 6915e5d7a22SPekka Enberg */ 692923f4f23SIngo Molnar if (!pipe->readers) 693a9a08845SLinus Torvalds mask |= EPOLLERR; 6941da177e4SLinus Torvalds } 6951da177e4SLinus Torvalds 6961da177e4SLinus Torvalds return mask; 6971da177e4SLinus Torvalds } 6981da177e4SLinus Torvalds 699b0d8d229SLinus Torvalds static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe) 700b0d8d229SLinus Torvalds { 701b0d8d229SLinus Torvalds int kill = 0; 702b0d8d229SLinus Torvalds 703b0d8d229SLinus Torvalds spin_lock(&inode->i_lock); 704b0d8d229SLinus Torvalds if (!--pipe->files) { 705b0d8d229SLinus Torvalds inode->i_pipe = NULL; 706b0d8d229SLinus Torvalds kill = 1; 707b0d8d229SLinus Torvalds } 708b0d8d229SLinus Torvalds spin_unlock(&inode->i_lock); 709b0d8d229SLinus Torvalds 710b0d8d229SLinus Torvalds if (kill) 711b0d8d229SLinus Torvalds free_pipe_info(pipe); 712b0d8d229SLinus Torvalds } 713b0d8d229SLinus Torvalds 7141da177e4SLinus Torvalds static int 715599a0ac1SAl Viro pipe_release(struct inode *inode, struct file *file) 7161da177e4SLinus Torvalds { 717b0d8d229SLinus Torvalds struct pipe_inode_info *pipe = file->private_data; 718923f4f23SIngo Molnar 719ebec73f4SAl Viro __pipe_lock(pipe); 720599a0ac1SAl Viro if (file->f_mode & FMODE_READ) 721599a0ac1SAl Viro pipe->readers--; 722599a0ac1SAl Viro if (file->f_mode & FMODE_WRITE) 723599a0ac1SAl Viro pipe->writers--; 724341b446bSIngo Molnar 7256551d5c5SLinus Torvalds /* Was that the last reader or writer, but not the other side? */ 7266551d5c5SLinus Torvalds if (!pipe->readers != !pipe->writers) { 7276551d5c5SLinus Torvalds wake_up_interruptible_all(&pipe->rd_wait); 7286551d5c5SLinus Torvalds wake_up_interruptible_all(&pipe->wr_wait); 729923f4f23SIngo Molnar kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 730923f4f23SIngo Molnar kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 7311da177e4SLinus Torvalds } 732ebec73f4SAl Viro __pipe_unlock(pipe); 733ba5bb147SAl Viro 734b0d8d229SLinus Torvalds put_pipe_info(inode, pipe); 7351da177e4SLinus Torvalds return 0; 7361da177e4SLinus Torvalds } 7371da177e4SLinus Torvalds 7381da177e4SLinus Torvalds static int 739599a0ac1SAl Viro pipe_fasync(int fd, struct file *filp, int on) 7401da177e4SLinus Torvalds { 741de32ec4cSAl Viro struct pipe_inode_info *pipe = filp->private_data; 742599a0ac1SAl Viro int retval = 0; 7431da177e4SLinus Torvalds 744ebec73f4SAl Viro __pipe_lock(pipe); 745599a0ac1SAl Viro if (filp->f_mode & FMODE_READ) 746341b446bSIngo Molnar retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); 747599a0ac1SAl Viro if ((filp->f_mode & FMODE_WRITE) && retval >= 0) { 748341b446bSIngo Molnar retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); 749599a0ac1SAl Viro if (retval < 0 && (filp->f_mode & FMODE_READ)) 750599a0ac1SAl Viro /* this can happen only if on == T */ 751e5bc49baSOleg Nesterov fasync_helper(-1, filp, 0, &pipe->fasync_readers); 752e5bc49baSOleg Nesterov } 753ebec73f4SAl Viro __pipe_unlock(pipe); 7541da177e4SLinus Torvalds return retval; 7551da177e4SLinus Torvalds } 7561da177e4SLinus Torvalds 7579c87bcf0SMichael Kerrisk (man-pages) static unsigned long account_pipe_buffers(struct user_struct *user, 758759c0114SWilly Tarreau unsigned long old, unsigned long new) 759759c0114SWilly Tarreau { 7609c87bcf0SMichael Kerrisk (man-pages) return atomic_long_add_return(new - old, &user->pipe_bufs); 761759c0114SWilly Tarreau } 762759c0114SWilly Tarreau 7639c87bcf0SMichael Kerrisk (man-pages) static bool too_many_pipe_buffers_soft(unsigned long user_bufs) 764759c0114SWilly Tarreau { 765f7340761SEric Biggers unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft); 766f7340761SEric Biggers 767f7340761SEric Biggers return soft_limit && user_bufs > soft_limit; 768759c0114SWilly Tarreau } 769759c0114SWilly Tarreau 7709c87bcf0SMichael Kerrisk (man-pages) static bool too_many_pipe_buffers_hard(unsigned long user_bufs) 771759c0114SWilly Tarreau { 772f7340761SEric Biggers unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard); 773f7340761SEric Biggers 774f7340761SEric Biggers return hard_limit && user_bufs > hard_limit; 775759c0114SWilly Tarreau } 776759c0114SWilly Tarreau 77785c2dd54SEric Biggers static bool is_unprivileged_user(void) 77885c2dd54SEric Biggers { 77985c2dd54SEric Biggers return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); 78085c2dd54SEric Biggers } 78185c2dd54SEric Biggers 7827bee130eSAl Viro struct pipe_inode_info *alloc_pipe_info(void) 7833a326a2cSIngo Molnar { 784923f4f23SIngo Molnar struct pipe_inode_info *pipe; 785759c0114SWilly Tarreau unsigned long pipe_bufs = PIPE_DEF_BUFFERS; 786759c0114SWilly Tarreau struct user_struct *user = get_current_user(); 7879c87bcf0SMichael Kerrisk (man-pages) unsigned long user_bufs; 788f7340761SEric Biggers unsigned int max_size = READ_ONCE(pipe_max_size); 789759c0114SWilly Tarreau 79009b4d199SMichael Kerrisk (man-pages) pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT); 79109b4d199SMichael Kerrisk (man-pages) if (pipe == NULL) 79209b4d199SMichael Kerrisk (man-pages) goto out_free_uid; 79309b4d199SMichael Kerrisk (man-pages) 794f7340761SEric Biggers if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE)) 795f7340761SEric Biggers pipe_bufs = max_size >> PAGE_SHIFT; 796086e774aSMichael Kerrisk (man-pages) 7979c87bcf0SMichael Kerrisk (man-pages) user_bufs = account_pipe_buffers(user, 0, pipe_bufs); 798a005ca0eSMichael Kerrisk (man-pages) 79985c2dd54SEric Biggers if (too_many_pipe_buffers_soft(user_bufs) && is_unprivileged_user()) { 8009c87bcf0SMichael Kerrisk (man-pages) user_bufs = account_pipe_buffers(user, pipe_bufs, 1); 801759c0114SWilly Tarreau pipe_bufs = 1; 802759c0114SWilly Tarreau } 803759c0114SWilly Tarreau 80485c2dd54SEric Biggers if (too_many_pipe_buffers_hard(user_bufs) && is_unprivileged_user()) 805a005ca0eSMichael Kerrisk (man-pages) goto out_revert_acct; 806a005ca0eSMichael Kerrisk (man-pages) 807a005ca0eSMichael Kerrisk (man-pages) pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer), 808a005ca0eSMichael Kerrisk (man-pages) GFP_KERNEL_ACCOUNT); 809a005ca0eSMichael Kerrisk (man-pages) 81035f3d14dSJens Axboe if (pipe->bufs) { 8110ddad21dSLinus Torvalds init_waitqueue_head(&pipe->rd_wait); 8120ddad21dSLinus Torvalds init_waitqueue_head(&pipe->wr_wait); 813923f4f23SIngo Molnar pipe->r_counter = pipe->w_counter = 1; 8146718b6f8SDavid Howells pipe->max_usage = pipe_bufs; 8158cefc107SDavid Howells pipe->ring_size = pipe_bufs; 816759c0114SWilly Tarreau pipe->user = user; 81772b0d9aaSAl Viro mutex_init(&pipe->mutex); 81835f3d14dSJens Axboe return pipe; 81935f3d14dSJens Axboe } 8203a326a2cSIngo Molnar 821a005ca0eSMichael Kerrisk (man-pages) out_revert_acct: 8229c87bcf0SMichael Kerrisk (man-pages) (void) account_pipe_buffers(user, pipe_bufs, 0); 82309b4d199SMichael Kerrisk (man-pages) kfree(pipe); 82409b4d199SMichael Kerrisk (man-pages) out_free_uid: 82509b4d199SMichael Kerrisk (man-pages) free_uid(user); 82635f3d14dSJens Axboe return NULL; 8273a326a2cSIngo Molnar } 8283a326a2cSIngo Molnar 8294b8a8f1eSAl Viro void free_pipe_info(struct pipe_inode_info *pipe) 8301da177e4SLinus Torvalds { 8311da177e4SLinus Torvalds int i; 8321da177e4SLinus Torvalds 8338cefc107SDavid Howells (void) account_pipe_buffers(pipe->user, pipe->ring_size, 0); 834759c0114SWilly Tarreau free_uid(pipe->user); 8358cefc107SDavid Howells for (i = 0; i < pipe->ring_size; i++) { 836923f4f23SIngo Molnar struct pipe_buffer *buf = pipe->bufs + i; 8371da177e4SLinus Torvalds if (buf->ops) 838a779638cSMiklos Szeredi pipe_buf_release(pipe, buf); 8391da177e4SLinus Torvalds } 840923f4f23SIngo Molnar if (pipe->tmp_page) 841923f4f23SIngo Molnar __free_page(pipe->tmp_page); 84235f3d14dSJens Axboe kfree(pipe->bufs); 843923f4f23SIngo Molnar kfree(pipe); 8441da177e4SLinus Torvalds } 8451da177e4SLinus Torvalds 846fa3536ccSEric Dumazet static struct vfsmount *pipe_mnt __read_mostly; 847341b446bSIngo Molnar 848c23fbb6bSEric Dumazet /* 849c23fbb6bSEric Dumazet * pipefs_dname() is called from d_path(). 850c23fbb6bSEric Dumazet */ 851c23fbb6bSEric Dumazet static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) 852c23fbb6bSEric Dumazet { 853c23fbb6bSEric Dumazet return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", 85475c3cfa8SDavid Howells d_inode(dentry)->i_ino); 855c23fbb6bSEric Dumazet } 856c23fbb6bSEric Dumazet 8573ba13d17SAl Viro static const struct dentry_operations pipefs_dentry_operations = { 858c23fbb6bSEric Dumazet .d_dname = pipefs_dname, 8591da177e4SLinus Torvalds }; 8601da177e4SLinus Torvalds 8611da177e4SLinus Torvalds static struct inode * get_pipe_inode(void) 8621da177e4SLinus Torvalds { 863a209dfc7SEric Dumazet struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb); 864923f4f23SIngo Molnar struct pipe_inode_info *pipe; 8651da177e4SLinus Torvalds 8661da177e4SLinus Torvalds if (!inode) 8671da177e4SLinus Torvalds goto fail_inode; 8681da177e4SLinus Torvalds 86985fe4025SChristoph Hellwig inode->i_ino = get_next_ino(); 87085fe4025SChristoph Hellwig 8717bee130eSAl Viro pipe = alloc_pipe_info(); 872923f4f23SIngo Molnar if (!pipe) 8731da177e4SLinus Torvalds goto fail_iput; 8743a326a2cSIngo Molnar 875ba5bb147SAl Viro inode->i_pipe = pipe; 876ba5bb147SAl Viro pipe->files = 2; 877923f4f23SIngo Molnar pipe->readers = pipe->writers = 1; 878599a0ac1SAl Viro inode->i_fop = &pipefifo_fops; 8791da177e4SLinus Torvalds 8801da177e4SLinus Torvalds /* 8811da177e4SLinus Torvalds * Mark the inode dirty from the very beginning, 8821da177e4SLinus Torvalds * that way it will never be moved to the dirty 8831da177e4SLinus Torvalds * list because "mark_inode_dirty()" will think 8841da177e4SLinus Torvalds * that it already _is_ on the dirty list. 8851da177e4SLinus Torvalds */ 8861da177e4SLinus Torvalds inode->i_state = I_DIRTY; 8871da177e4SLinus Torvalds inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; 888da9592edSDavid Howells inode->i_uid = current_fsuid(); 889da9592edSDavid Howells inode->i_gid = current_fsgid(); 890078cd827SDeepa Dinamani inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); 891923f4f23SIngo Molnar 8921da177e4SLinus Torvalds return inode; 8931da177e4SLinus Torvalds 8941da177e4SLinus Torvalds fail_iput: 8951da177e4SLinus Torvalds iput(inode); 896341b446bSIngo Molnar 8971da177e4SLinus Torvalds fail_inode: 8981da177e4SLinus Torvalds return NULL; 8991da177e4SLinus Torvalds } 9001da177e4SLinus Torvalds 901e4fad8e5SAl Viro int create_pipe_files(struct file **res, int flags) 9021da177e4SLinus Torvalds { 903e4fad8e5SAl Viro struct inode *inode = get_pipe_inode(); 904d6cbd281SAndi Kleen struct file *f; 9051da177e4SLinus Torvalds 9061da177e4SLinus Torvalds if (!inode) 907e4fad8e5SAl Viro return -ENFILE; 9081da177e4SLinus Torvalds 909152b6372SAl Viro f = alloc_file_pseudo(inode, pipe_mnt, "", 910152b6372SAl Viro O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)), 911c9c554f2SAl Viro &pipefifo_fops); 912e9bb1f9bSEric Biggers if (IS_ERR(f)) { 9134b8a8f1eSAl Viro free_pipe_info(inode->i_pipe); 914d6cbd281SAndi Kleen iput(inode); 915152b6372SAl Viro return PTR_ERR(f); 9161da177e4SLinus Torvalds } 9171da177e4SLinus Torvalds 9181da177e4SLinus Torvalds f->private_data = inode->i_pipe; 9191da177e4SLinus Torvalds 920183266f2SAl Viro res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK), 921c9c554f2SAl Viro &pipefifo_fops); 9221da177e4SLinus Torvalds if (IS_ERR(res[0])) { 923b10a4a9fSAl Viro put_pipe_info(inode, inode->i_pipe); 924b10a4a9fSAl Viro fput(f); 925b10a4a9fSAl Viro return PTR_ERR(res[0]); 9261da177e4SLinus Torvalds } 9271da177e4SLinus Torvalds res[0]->private_data = inode->i_pipe; 9281da177e4SLinus Torvalds res[1] = f; 929d8e464ecSLinus Torvalds stream_open(inode, res[0]); 930d8e464ecSLinus Torvalds stream_open(inode, res[1]); 9311da177e4SLinus Torvalds return 0; 932d6cbd281SAndi Kleen } 933d6cbd281SAndi Kleen 9345b249b1bSAl Viro static int __do_pipe_flags(int *fd, struct file **files, int flags) 935d6cbd281SAndi Kleen { 936d6cbd281SAndi Kleen int error; 937d6cbd281SAndi Kleen int fdw, fdr; 938d6cbd281SAndi Kleen 9399883035aSLinus Torvalds if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT)) 940ed8cae8bSUlrich Drepper return -EINVAL; 941ed8cae8bSUlrich Drepper 942e4fad8e5SAl Viro error = create_pipe_files(files, flags); 943e4fad8e5SAl Viro if (error) 944e4fad8e5SAl Viro return error; 945d6cbd281SAndi Kleen 946ed8cae8bSUlrich Drepper error = get_unused_fd_flags(flags); 947d6cbd281SAndi Kleen if (error < 0) 948d6cbd281SAndi Kleen goto err_read_pipe; 949d6cbd281SAndi Kleen fdr = error; 950d6cbd281SAndi Kleen 951ed8cae8bSUlrich Drepper error = get_unused_fd_flags(flags); 952d6cbd281SAndi Kleen if (error < 0) 953d6cbd281SAndi Kleen goto err_fdr; 954d6cbd281SAndi Kleen fdw = error; 955d6cbd281SAndi Kleen 956157cf649SAl Viro audit_fd_pair(fdr, fdw); 957d6cbd281SAndi Kleen fd[0] = fdr; 958d6cbd281SAndi Kleen fd[1] = fdw; 9591da177e4SLinus Torvalds return 0; 9601da177e4SLinus Torvalds 961d6cbd281SAndi Kleen err_fdr: 962d6cbd281SAndi Kleen put_unused_fd(fdr); 963d6cbd281SAndi Kleen err_read_pipe: 964e4fad8e5SAl Viro fput(files[0]); 965e4fad8e5SAl Viro fput(files[1]); 9661da177e4SLinus Torvalds return error; 9671da177e4SLinus Torvalds } 9681da177e4SLinus Torvalds 9695b249b1bSAl Viro int do_pipe_flags(int *fd, int flags) 9705b249b1bSAl Viro { 9715b249b1bSAl Viro struct file *files[2]; 9725b249b1bSAl Viro int error = __do_pipe_flags(fd, files, flags); 9735b249b1bSAl Viro if (!error) { 9745b249b1bSAl Viro fd_install(fd[0], files[0]); 9755b249b1bSAl Viro fd_install(fd[1], files[1]); 9765b249b1bSAl Viro } 9775b249b1bSAl Viro return error; 9785b249b1bSAl Viro } 9795b249b1bSAl Viro 9801da177e4SLinus Torvalds /* 981d35c7b0eSUlrich Drepper * sys_pipe() is the normal C calling standard for creating 982d35c7b0eSUlrich Drepper * a pipe. It's not the way Unix traditionally does this, though. 983d35c7b0eSUlrich Drepper */ 9840a216dd1SDominik Brodowski static int do_pipe2(int __user *fildes, int flags) 985d35c7b0eSUlrich Drepper { 9865b249b1bSAl Viro struct file *files[2]; 987d35c7b0eSUlrich Drepper int fd[2]; 988d35c7b0eSUlrich Drepper int error; 989d35c7b0eSUlrich Drepper 9905b249b1bSAl Viro error = __do_pipe_flags(fd, files, flags); 991d35c7b0eSUlrich Drepper if (!error) { 9925b249b1bSAl Viro if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) { 9935b249b1bSAl Viro fput(files[0]); 9945b249b1bSAl Viro fput(files[1]); 9955b249b1bSAl Viro put_unused_fd(fd[0]); 9965b249b1bSAl Viro put_unused_fd(fd[1]); 997d35c7b0eSUlrich Drepper error = -EFAULT; 9985b249b1bSAl Viro } else { 9995b249b1bSAl Viro fd_install(fd[0], files[0]); 10005b249b1bSAl Viro fd_install(fd[1], files[1]); 1001d35c7b0eSUlrich Drepper } 1002ba719baeSUlrich Drepper } 1003d35c7b0eSUlrich Drepper return error; 1004d35c7b0eSUlrich Drepper } 1005d35c7b0eSUlrich Drepper 10060a216dd1SDominik Brodowski SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags) 10070a216dd1SDominik Brodowski { 10080a216dd1SDominik Brodowski return do_pipe2(fildes, flags); 10090a216dd1SDominik Brodowski } 10100a216dd1SDominik Brodowski 10112b664219SHeiko Carstens SYSCALL_DEFINE1(pipe, int __user *, fildes) 1012ed8cae8bSUlrich Drepper { 10130a216dd1SDominik Brodowski return do_pipe2(fildes, 0); 1014ed8cae8bSUlrich Drepper } 1015ed8cae8bSUlrich Drepper 1016fc7478a2SAl Viro static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) 1017f776c738SAl Viro { 1018f776c738SAl Viro int cur = *cnt; 1019f776c738SAl Viro 1020f776c738SAl Viro while (cur == *cnt) { 1021fc7478a2SAl Viro pipe_wait(pipe); 1022f776c738SAl Viro if (signal_pending(current)) 1023f776c738SAl Viro break; 1024f776c738SAl Viro } 1025f776c738SAl Viro return cur == *cnt ? -ERESTARTSYS : 0; 1026f776c738SAl Viro } 1027f776c738SAl Viro 1028fc7478a2SAl Viro static void wake_up_partner(struct pipe_inode_info *pipe) 1029f776c738SAl Viro { 10306551d5c5SLinus Torvalds wake_up_interruptible_all(&pipe->rd_wait); 10316551d5c5SLinus Torvalds wake_up_interruptible_all(&pipe->wr_wait); 1032f776c738SAl Viro } 1033f776c738SAl Viro 1034f776c738SAl Viro static int fifo_open(struct inode *inode, struct file *filp) 1035f776c738SAl Viro { 1036f776c738SAl Viro struct pipe_inode_info *pipe; 1037599a0ac1SAl Viro bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; 1038f776c738SAl Viro int ret; 1039f776c738SAl Viro 1040ba5bb147SAl Viro filp->f_version = 0; 1041ba5bb147SAl Viro 1042ba5bb147SAl Viro spin_lock(&inode->i_lock); 1043ba5bb147SAl Viro if (inode->i_pipe) { 1044f776c738SAl Viro pipe = inode->i_pipe; 1045ba5bb147SAl Viro pipe->files++; 1046ba5bb147SAl Viro spin_unlock(&inode->i_lock); 1047ba5bb147SAl Viro } else { 1048ba5bb147SAl Viro spin_unlock(&inode->i_lock); 10497bee130eSAl Viro pipe = alloc_pipe_info(); 1050f776c738SAl Viro if (!pipe) 1051ba5bb147SAl Viro return -ENOMEM; 1052ba5bb147SAl Viro pipe->files = 1; 1053ba5bb147SAl Viro spin_lock(&inode->i_lock); 1054ba5bb147SAl Viro if (unlikely(inode->i_pipe)) { 1055ba5bb147SAl Viro inode->i_pipe->files++; 1056ba5bb147SAl Viro spin_unlock(&inode->i_lock); 10574b8a8f1eSAl Viro free_pipe_info(pipe); 1058ba5bb147SAl Viro pipe = inode->i_pipe; 1059ba5bb147SAl Viro } else { 1060f776c738SAl Viro inode->i_pipe = pipe; 1061ba5bb147SAl Viro spin_unlock(&inode->i_lock); 1062f776c738SAl Viro } 1063ba5bb147SAl Viro } 1064de32ec4cSAl Viro filp->private_data = pipe; 1065ba5bb147SAl Viro /* OK, we have a pipe and it's pinned down */ 1066ba5bb147SAl Viro 1067ebec73f4SAl Viro __pipe_lock(pipe); 1068f776c738SAl Viro 1069f776c738SAl Viro /* We can only do regular read/write on fifos */ 1070d8e464ecSLinus Torvalds stream_open(inode, filp); 1071f776c738SAl Viro 1072d8e464ecSLinus Torvalds switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) { 1073f776c738SAl Viro case FMODE_READ: 1074f776c738SAl Viro /* 1075f776c738SAl Viro * O_RDONLY 1076f776c738SAl Viro * POSIX.1 says that O_NONBLOCK means return with the FIFO 1077f776c738SAl Viro * opened, even when there is no process writing the FIFO. 1078f776c738SAl Viro */ 1079f776c738SAl Viro pipe->r_counter++; 1080f776c738SAl Viro if (pipe->readers++ == 0) 1081fc7478a2SAl Viro wake_up_partner(pipe); 1082f776c738SAl Viro 1083599a0ac1SAl Viro if (!is_pipe && !pipe->writers) { 1084f776c738SAl Viro if ((filp->f_flags & O_NONBLOCK)) { 1085a9a08845SLinus Torvalds /* suppress EPOLLHUP until we have 1086f776c738SAl Viro * seen a writer */ 1087f776c738SAl Viro filp->f_version = pipe->w_counter; 1088f776c738SAl Viro } else { 1089fc7478a2SAl Viro if (wait_for_partner(pipe, &pipe->w_counter)) 1090f776c738SAl Viro goto err_rd; 1091f776c738SAl Viro } 1092f776c738SAl Viro } 1093f776c738SAl Viro break; 1094f776c738SAl Viro 1095f776c738SAl Viro case FMODE_WRITE: 1096f776c738SAl Viro /* 1097f776c738SAl Viro * O_WRONLY 1098f776c738SAl Viro * POSIX.1 says that O_NONBLOCK means return -1 with 1099f776c738SAl Viro * errno=ENXIO when there is no process reading the FIFO. 1100f776c738SAl Viro */ 1101f776c738SAl Viro ret = -ENXIO; 1102599a0ac1SAl Viro if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers) 1103f776c738SAl Viro goto err; 1104f776c738SAl Viro 1105f776c738SAl Viro pipe->w_counter++; 1106f776c738SAl Viro if (!pipe->writers++) 1107fc7478a2SAl Viro wake_up_partner(pipe); 1108f776c738SAl Viro 1109599a0ac1SAl Viro if (!is_pipe && !pipe->readers) { 1110fc7478a2SAl Viro if (wait_for_partner(pipe, &pipe->r_counter)) 1111f776c738SAl Viro goto err_wr; 1112f776c738SAl Viro } 1113f776c738SAl Viro break; 1114f776c738SAl Viro 1115f776c738SAl Viro case FMODE_READ | FMODE_WRITE: 1116f776c738SAl Viro /* 1117f776c738SAl Viro * O_RDWR 1118f776c738SAl Viro * POSIX.1 leaves this case "undefined" when O_NONBLOCK is set. 1119f776c738SAl Viro * This implementation will NEVER block on a O_RDWR open, since 1120f776c738SAl Viro * the process can at least talk to itself. 1121f776c738SAl Viro */ 1122f776c738SAl Viro 1123f776c738SAl Viro pipe->readers++; 1124f776c738SAl Viro pipe->writers++; 1125f776c738SAl Viro pipe->r_counter++; 1126f776c738SAl Viro pipe->w_counter++; 1127f776c738SAl Viro if (pipe->readers == 1 || pipe->writers == 1) 1128fc7478a2SAl Viro wake_up_partner(pipe); 1129f776c738SAl Viro break; 1130f776c738SAl Viro 1131f776c738SAl Viro default: 1132f776c738SAl Viro ret = -EINVAL; 1133f776c738SAl Viro goto err; 1134f776c738SAl Viro } 1135f776c738SAl Viro 1136f776c738SAl Viro /* Ok! */ 1137ebec73f4SAl Viro __pipe_unlock(pipe); 1138f776c738SAl Viro return 0; 1139f776c738SAl Viro 1140f776c738SAl Viro err_rd: 1141f776c738SAl Viro if (!--pipe->readers) 11420ddad21dSLinus Torvalds wake_up_interruptible(&pipe->wr_wait); 1143f776c738SAl Viro ret = -ERESTARTSYS; 1144f776c738SAl Viro goto err; 1145f776c738SAl Viro 1146f776c738SAl Viro err_wr: 1147f776c738SAl Viro if (!--pipe->writers) 11486551d5c5SLinus Torvalds wake_up_interruptible_all(&pipe->rd_wait); 1149f776c738SAl Viro ret = -ERESTARTSYS; 1150f776c738SAl Viro goto err; 1151f776c738SAl Viro 1152f776c738SAl Viro err: 1153ebec73f4SAl Viro __pipe_unlock(pipe); 1154b0d8d229SLinus Torvalds 1155b0d8d229SLinus Torvalds put_pipe_info(inode, pipe); 1156f776c738SAl Viro return ret; 1157f776c738SAl Viro } 1158f776c738SAl Viro 1159599a0ac1SAl Viro const struct file_operations pipefifo_fops = { 1160599a0ac1SAl Viro .open = fifo_open, 1161599a0ac1SAl Viro .llseek = no_llseek, 1162fb9096a3SAl Viro .read_iter = pipe_read, 1163f0d1bec9SAl Viro .write_iter = pipe_write, 1164a11e1d43SLinus Torvalds .poll = pipe_poll, 1165599a0ac1SAl Viro .unlocked_ioctl = pipe_ioctl, 1166599a0ac1SAl Viro .release = pipe_release, 1167599a0ac1SAl Viro .fasync = pipe_fasync, 1168f776c738SAl Viro }; 1169f776c738SAl Viro 1170d35c7b0eSUlrich Drepper /* 1171f491bd71SMichael Kerrisk (man-pages) * Currently we rely on the pipe array holding a power-of-2 number 1172d3f14c48SJoe Lawrence * of pages. Returns 0 on error. 1173f491bd71SMichael Kerrisk (man-pages) */ 117496e99be4SEric Biggers unsigned int round_pipe_size(unsigned long size) 1175f491bd71SMichael Kerrisk (man-pages) { 1176c4fed5a9SEric Biggers if (size > (1U << 31)) 117796e99be4SEric Biggers return 0; 117896e99be4SEric Biggers 11794c2e4befSEric Biggers /* Minimum pipe size, as required by POSIX */ 11804c2e4befSEric Biggers if (size < PAGE_SIZE) 1181c4fed5a9SEric Biggers return PAGE_SIZE; 1182d3f14c48SJoe Lawrence 1183c4fed5a9SEric Biggers return roundup_pow_of_two(size); 1184f491bd71SMichael Kerrisk (man-pages) } 1185f491bd71SMichael Kerrisk (man-pages) 1186f491bd71SMichael Kerrisk (man-pages) /* 118735f3d14dSJens Axboe * Allocate a new array of pipe buffers and copy the info over. Returns the 118835f3d14dSJens Axboe * pipe size if successful, or return -ERROR on error. 118935f3d14dSJens Axboe */ 1190d37d4166SMichael Kerrisk (man-pages) static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) 119135f3d14dSJens Axboe { 119235f3d14dSJens Axboe struct pipe_buffer *bufs; 11938cefc107SDavid Howells unsigned int size, nr_slots, head, tail, mask, n; 11949c87bcf0SMichael Kerrisk (man-pages) unsigned long user_bufs; 1195b0b91d18SMichael Kerrisk (man-pages) long ret = 0; 1196d37d4166SMichael Kerrisk (man-pages) 1197d37d4166SMichael Kerrisk (man-pages) size = round_pipe_size(arg); 11988cefc107SDavid Howells nr_slots = size >> PAGE_SHIFT; 1199d37d4166SMichael Kerrisk (man-pages) 12008cefc107SDavid Howells if (!nr_slots) 1201d37d4166SMichael Kerrisk (man-pages) return -EINVAL; 1202d37d4166SMichael Kerrisk (man-pages) 1203b0b91d18SMichael Kerrisk (man-pages) /* 1204b0b91d18SMichael Kerrisk (man-pages) * If trying to increase the pipe capacity, check that an 1205b0b91d18SMichael Kerrisk (man-pages) * unprivileged user is not trying to exceed various limits 1206b0b91d18SMichael Kerrisk (man-pages) * (soft limit check here, hard limit check just below). 1207b0b91d18SMichael Kerrisk (man-pages) * Decreasing the pipe capacity is always permitted, even 1208b0b91d18SMichael Kerrisk (man-pages) * if the user is currently over a limit. 1209b0b91d18SMichael Kerrisk (man-pages) */ 12108cefc107SDavid Howells if (nr_slots > pipe->ring_size && 1211b0b91d18SMichael Kerrisk (man-pages) size > pipe_max_size && !capable(CAP_SYS_RESOURCE)) 1212d37d4166SMichael Kerrisk (man-pages) return -EPERM; 1213d37d4166SMichael Kerrisk (man-pages) 12148cefc107SDavid Howells user_bufs = account_pipe_buffers(pipe->user, pipe->ring_size, nr_slots); 1215b0b91d18SMichael Kerrisk (man-pages) 12168cefc107SDavid Howells if (nr_slots > pipe->ring_size && 12179c87bcf0SMichael Kerrisk (man-pages) (too_many_pipe_buffers_hard(user_bufs) || 12189c87bcf0SMichael Kerrisk (man-pages) too_many_pipe_buffers_soft(user_bufs)) && 121985c2dd54SEric Biggers is_unprivileged_user()) { 1220b0b91d18SMichael Kerrisk (man-pages) ret = -EPERM; 1221b0b91d18SMichael Kerrisk (man-pages) goto out_revert_acct; 1222b0b91d18SMichael Kerrisk (man-pages) } 122335f3d14dSJens Axboe 122435f3d14dSJens Axboe /* 12258cefc107SDavid Howells * We can shrink the pipe, if arg is greater than the ring occupancy. 12268cefc107SDavid Howells * Since we don't expect a lot of shrink+grow operations, just free and 12278cefc107SDavid Howells * allocate again like we would do for growing. If the pipe currently 122835f3d14dSJens Axboe * contains more buffers than arg, then return busy. 122935f3d14dSJens Axboe */ 12308cefc107SDavid Howells mask = pipe->ring_size - 1; 12318cefc107SDavid Howells head = pipe->head; 12328cefc107SDavid Howells tail = pipe->tail; 12338cefc107SDavid Howells n = pipe_occupancy(pipe->head, pipe->tail); 12348cefc107SDavid Howells if (nr_slots < n) { 1235b0b91d18SMichael Kerrisk (man-pages) ret = -EBUSY; 1236b0b91d18SMichael Kerrisk (man-pages) goto out_revert_acct; 1237b0b91d18SMichael Kerrisk (man-pages) } 123835f3d14dSJens Axboe 12398cefc107SDavid Howells bufs = kcalloc(nr_slots, sizeof(*bufs), 1240d86133bdSVladimir Davydov GFP_KERNEL_ACCOUNT | __GFP_NOWARN); 1241b0b91d18SMichael Kerrisk (man-pages) if (unlikely(!bufs)) { 1242b0b91d18SMichael Kerrisk (man-pages) ret = -ENOMEM; 1243b0b91d18SMichael Kerrisk (man-pages) goto out_revert_acct; 1244b0b91d18SMichael Kerrisk (man-pages) } 124535f3d14dSJens Axboe 124635f3d14dSJens Axboe /* 124735f3d14dSJens Axboe * The pipe array wraps around, so just start the new one at zero 12488cefc107SDavid Howells * and adjust the indices. 124935f3d14dSJens Axboe */ 12508cefc107SDavid Howells if (n > 0) { 12518cefc107SDavid Howells unsigned int h = head & mask; 12528cefc107SDavid Howells unsigned int t = tail & mask; 12538cefc107SDavid Howells if (h > t) { 12548cefc107SDavid Howells memcpy(bufs, pipe->bufs + t, 12558cefc107SDavid Howells n * sizeof(struct pipe_buffer)); 12568cefc107SDavid Howells } else { 12578cefc107SDavid Howells unsigned int tsize = pipe->ring_size - t; 12588cefc107SDavid Howells if (h > 0) 12598cefc107SDavid Howells memcpy(bufs + tsize, pipe->bufs, 12608cefc107SDavid Howells h * sizeof(struct pipe_buffer)); 12618cefc107SDavid Howells memcpy(bufs, pipe->bufs + t, 12628cefc107SDavid Howells tsize * sizeof(struct pipe_buffer)); 12638cefc107SDavid Howells } 126435f3d14dSJens Axboe } 126535f3d14dSJens Axboe 12668cefc107SDavid Howells head = n; 12678cefc107SDavid Howells tail = 0; 12688cefc107SDavid Howells 126935f3d14dSJens Axboe kfree(pipe->bufs); 127035f3d14dSJens Axboe pipe->bufs = bufs; 12718cefc107SDavid Howells pipe->ring_size = nr_slots; 12726718b6f8SDavid Howells pipe->max_usage = nr_slots; 12738cefc107SDavid Howells pipe->tail = tail; 12748cefc107SDavid Howells pipe->head = head; 12756551d5c5SLinus Torvalds 12766551d5c5SLinus Torvalds /* This might have made more room for writers */ 12776551d5c5SLinus Torvalds wake_up_interruptible(&pipe->wr_wait); 12786718b6f8SDavid Howells return pipe->max_usage * PAGE_SIZE; 1279b0b91d18SMichael Kerrisk (man-pages) 1280b0b91d18SMichael Kerrisk (man-pages) out_revert_acct: 12818cefc107SDavid Howells (void) account_pipe_buffers(pipe->user, nr_slots, pipe->ring_size); 1282b0b91d18SMichael Kerrisk (man-pages) return ret; 128335f3d14dSJens Axboe } 128435f3d14dSJens Axboe 1285ff9da691SJens Axboe /* 128672083646SLinus Torvalds * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same 128772083646SLinus Torvalds * location, so checking ->i_pipe is not enough to verify that this is a 128872083646SLinus Torvalds * pipe. 128972083646SLinus Torvalds */ 129072083646SLinus Torvalds struct pipe_inode_info *get_pipe_info(struct file *file) 129172083646SLinus Torvalds { 1292de32ec4cSAl Viro return file->f_op == &pipefifo_fops ? file->private_data : NULL; 129372083646SLinus Torvalds } 129472083646SLinus Torvalds 129535f3d14dSJens Axboe long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) 129635f3d14dSJens Axboe { 129735f3d14dSJens Axboe struct pipe_inode_info *pipe; 129835f3d14dSJens Axboe long ret; 129935f3d14dSJens Axboe 1300c66fb347SLinus Torvalds pipe = get_pipe_info(file); 130135f3d14dSJens Axboe if (!pipe) 130235f3d14dSJens Axboe return -EBADF; 130335f3d14dSJens Axboe 1304ebec73f4SAl Viro __pipe_lock(pipe); 130535f3d14dSJens Axboe 130635f3d14dSJens Axboe switch (cmd) { 1307d37d4166SMichael Kerrisk (man-pages) case F_SETPIPE_SZ: 1308d37d4166SMichael Kerrisk (man-pages) ret = pipe_set_size(pipe, arg); 130935f3d14dSJens Axboe break; 131035f3d14dSJens Axboe case F_GETPIPE_SZ: 13116718b6f8SDavid Howells ret = pipe->max_usage * PAGE_SIZE; 131235f3d14dSJens Axboe break; 131335f3d14dSJens Axboe default: 131435f3d14dSJens Axboe ret = -EINVAL; 131535f3d14dSJens Axboe break; 131635f3d14dSJens Axboe } 131735f3d14dSJens Axboe 1318ebec73f4SAl Viro __pipe_unlock(pipe); 131935f3d14dSJens Axboe return ret; 132035f3d14dSJens Axboe } 132135f3d14dSJens Axboe 1322ff0c7d15SNick Piggin static const struct super_operations pipefs_ops = { 1323ff0c7d15SNick Piggin .destroy_inode = free_inode_nonrcu, 1324d70ef97bSPavel Emelyanov .statfs = simple_statfs, 1325ff0c7d15SNick Piggin }; 1326ff0c7d15SNick Piggin 132735f3d14dSJens Axboe /* 13281da177e4SLinus Torvalds * pipefs should _never_ be mounted by userland - too much of security hassle, 13291da177e4SLinus Torvalds * no real gain from having the whole whorehouse mounted. So we don't need 13301da177e4SLinus Torvalds * any operations on the root directory. However, we need a non-trivial 13311da177e4SLinus Torvalds * d_name - pipe: will go nicely and kill the special-casing in procfs. 13321da177e4SLinus Torvalds */ 13334fa7ec5dSDavid Howells 13344fa7ec5dSDavid Howells static int pipefs_init_fs_context(struct fs_context *fc) 13351da177e4SLinus Torvalds { 13364fa7ec5dSDavid Howells struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC); 13374fa7ec5dSDavid Howells if (!ctx) 13384fa7ec5dSDavid Howells return -ENOMEM; 13394fa7ec5dSDavid Howells ctx->ops = &pipefs_ops; 13404fa7ec5dSDavid Howells ctx->dops = &pipefs_dentry_operations; 13414fa7ec5dSDavid Howells return 0; 13421da177e4SLinus Torvalds } 13431da177e4SLinus Torvalds 13441da177e4SLinus Torvalds static struct file_system_type pipe_fs_type = { 13451da177e4SLinus Torvalds .name = "pipefs", 13464fa7ec5dSDavid Howells .init_fs_context = pipefs_init_fs_context, 13471da177e4SLinus Torvalds .kill_sb = kill_anon_super, 13481da177e4SLinus Torvalds }; 13491da177e4SLinus Torvalds 13501da177e4SLinus Torvalds static int __init init_pipe_fs(void) 13511da177e4SLinus Torvalds { 13521da177e4SLinus Torvalds int err = register_filesystem(&pipe_fs_type); 1353341b446bSIngo Molnar 13541da177e4SLinus Torvalds if (!err) { 13551da177e4SLinus Torvalds pipe_mnt = kern_mount(&pipe_fs_type); 13561da177e4SLinus Torvalds if (IS_ERR(pipe_mnt)) { 13571da177e4SLinus Torvalds err = PTR_ERR(pipe_mnt); 13581da177e4SLinus Torvalds unregister_filesystem(&pipe_fs_type); 13591da177e4SLinus Torvalds } 13601da177e4SLinus Torvalds } 13611da177e4SLinus Torvalds return err; 13621da177e4SLinus Torvalds } 13631da177e4SLinus Torvalds 13641da177e4SLinus Torvalds fs_initcall(init_pipe_fs); 1365