// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <linux/memcontrol.h>
#include <linux/watch_queue.h>
#include <linux/sysctl.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>

#include "internal.h"

/*
 * New pipe buffers will be restricted to this size while the user is exceeding
 * their pipe buffer quota. The general pipe use case needs at least two
 * buffers: one for data yet to be read, and one for new data. If this is less
 * than two, then a write to a non-empty pipe may block even if the pipe is not
 * full. This can occur with GNU make jobserver or similar uses of pipes as
 * semaphores: multiple processes may be waiting to write tokens back to the
 * pipe before reading tokens: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/.
 *
 * Users can reduce their pipe buffers with F_SETPIPE_SZ below this at their
 * own risk, namely: pipe writes to non-full pipes may block until the pipe is
 * emptied.
 */
#define PIPE_MIN_DEF_BUFFERS 2

/*
 * The max size that a non-root user is allowed to grow the pipe. Can
 * be set by root in /proc/sys/fs/pipe-max-size
 */
static unsigned int pipe_max_size = 1048576;

/* Maximum allocatable pages per user. Hard limit is unset by default, soft
 * matches default values.
 */
static unsigned long pipe_user_pages_hard;
static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;

/*
 * We use head and tail indices that aren't masked off, except at the point of
 * dereference, but rather they're allowed to wrap naturally. This means there
 * isn't a dead spot in the buffer, but the ring has to be a power of two and
 * <= 2^31.
 * -- David Howells 2019-09-23.
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */
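
/*
 * Worked example of the unmasked-index scheme described above: with
 * ring_size 8 (mask 7), head == 9 and tail == 7 mean head - tail == 2 slots
 * are in use, living at bufs[7 & 7] == bufs[7] and bufs[8 & 7] == bufs[0].
 * Because both indices are unsigned and only ever compared by subtraction or
 * dereferenced through the mask, the arithmetic stays correct when they wrap
 * past UINT_MAX, provided the ring size is a power of two <= 2^31.
 */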

static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
	if (pipe->files)
		mutex_lock_nested(&pipe->mutex, subclass);
}

void pipe_lock(struct pipe_inode_info *pipe)
{
	/*
	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
	 */
	pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);

void pipe_unlock(struct pipe_inode_info *pipe)
{
	if (pipe->files)
		mutex_unlock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_unlock);

static inline void __pipe_lock(struct pipe_inode_info *pipe)
{
	mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
}

static inline void __pipe_unlock(struct pipe_inode_info *pipe)
{
	mutex_unlock(&pipe->mutex);
}

void pipe_double_lock(struct pipe_inode_info *pipe1,
		      struct pipe_inode_info *pipe2)
{
	BUG_ON(pipe1 == pipe2);

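	/*
	 * Take the two locks in a fixed (address) order so that two tasks
	 * double-locking the same pair of pipes always acquire them in the
	 * same order and cannot deadlock against each other.
	 */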
	if (pipe1 < pipe2) {
		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
	} else {
		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
	}
}

static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * If nobody else uses this page, and we don't already have a
	 * temporary page, let's keep track of it as a one-deep
	 * allocation cache. (Otherwise just release our reference to it)
	 */
	if (page_count(page) == 1 && !pipe->tmp_page)
		pipe->tmp_page = page;
	else
		put_page(page);
}

static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe,
		struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	if (page_count(page) != 1)
		return false;
	memcg_kmem_uncharge_page(page, 0);
	__SetPageLocked(page);
	return true;
}

/**
 * generic_pipe_buf_try_steal - attempt to take ownership of a &pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to attempt to steal
 *
 * Description:
 *	This function attempts to steal the &struct page attached to
 *	@buf. If successful, this function returns true and returns with
 *	the page locked. The caller may then reuse the page for whatever
 *	he wishes; the typical use is insertion into a different file
 *	page cache.
 */
bool generic_pipe_buf_try_steal(struct pipe_inode_info *pipe,
		struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * A reference of one is golden, that means that the owner of this
	 * page is the only one holding a reference to it. lock the page
	 * and return OK.
	 */
	if (page_count(page) == 1) {
		lock_page(page);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(generic_pipe_buf_try_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to get a reference to
 *
 * Description:
 *	This function grabs an extra reference to @buf. It's used in
 *	the tee() system call, when we duplicate the buffers in one
 *	pipe into another.
 */
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	return try_get_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to put a reference to
 *
 * Description:
 *	This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	put_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.release	= anon_pipe_buf_release,
	.try_steal	= anon_pipe_buf_try_steal,
	.get		= generic_pipe_buf_get,
};

/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_readable(const struct pipe_inode_info *pipe)
{
	unsigned int head = READ_ONCE(pipe->head);
	unsigned int tail = READ_ONCE(pipe->tail);
	unsigned int writers = READ_ONCE(pipe->writers);

	return !pipe_empty(head, tail) || !writers;
}

static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
	size_t total_len = iov_iter_count(to);
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	bool was_full, wake_next_reader = false;
	ssize_t ret;

	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	ret = 0;
	__pipe_lock(pipe);

	/*
	 * We only wake up writers if the pipe was full when we started
	 * reading in order to avoid unnecessary wakeups.
	 *
	 * But when we do wake up writers, we do so using a sync wakeup
	 * (WF_SYNC), because we want them to get going and generate more
	 * data for us.
	 */
	was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
	for (;;) {
		/* Read ->head with a barrier vs post_one_notification() */
		unsigned int head = smp_load_acquire(&pipe->head);
		unsigned int tail = pipe->tail;
		unsigned int mask = pipe->ring_size - 1;

#ifdef CONFIG_WATCH_QUEUE
		if (pipe->note_loss) {
			struct watch_notification n;

			if (total_len < 8) {
				if (ret == 0)
					ret = -ENOBUFS;
				break;
			}

			n.type = WATCH_TYPE_META;
			n.subtype = WATCH_META_LOSS_NOTIFICATION;
			n.info = watch_sizeof(n);
			if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
				if (ret == 0)
					ret = -EFAULT;
				break;
			}
			ret += sizeof(n);
			total_len -= sizeof(n);
			pipe->note_loss = false;
		}
#endif

		if (!pipe_empty(head, tail)) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			size_t chars = buf->len;
			size_t written;
			int error;

			if (chars > total_len) {
				if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
					if (ret == 0)
						ret = -ENOBUFS;
					break;
				}
				chars = total_len;
			}

			error = pipe_buf_confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			written = copy_page_to_iter(buf->page, buf->offset, chars, to);
			if (unlikely(written < chars)) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += chars;
			buf->offset += chars;
			buf->len -= chars;

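			/*
			 * In packet mode (pipes created with O_DIRECT), a
			 * short read still consumes the whole packet: the
			 * bytes the caller did not ask for are discarded by
			 * truncating total_len and zeroing buf->len below.
			 */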
			/* Was it a packet buffer? Clean up and exit */
			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
				total_len = chars;
				buf->len = 0;
			}

			if (!buf->len) {
				pipe_buf_release(pipe, buf);
				spin_lock_irq(&pipe->rd_wait.lock);
#ifdef CONFIG_WATCH_QUEUE
				if (buf->flags & PIPE_BUF_FLAG_LOSS)
					pipe->note_loss = true;
#endif
				tail++;
				pipe->tail = tail;
				spin_unlock_irq(&pipe->rd_wait.lock);
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
			if (!pipe_empty(head, tail))	/* More to do? */
				continue;
		}

		if (!pipe->writers)
			break;
		if (ret)
			break;
		if ((filp->f_flags & O_NONBLOCK) ||
		    (iocb->ki_flags & IOCB_NOWAIT)) {
			ret = -EAGAIN;
			break;
		}
		__pipe_unlock(pipe);

		/*
		 * We only get here if we didn't actually read anything.
		 *
		 * However, we could have seen (and removed) a zero-sized
		 * pipe buffer, and might have made space in the buffers
		 * that way.
		 *
		 * You can't make zero-sized pipe buffers by doing an empty
		 * write (not even in packet mode), but they can happen if
		 * the writer gets an EFAULT when trying to fill a buffer
		 * that already got allocated and inserted in the buffer
		 * array.
		 *
		 * So we still need to wake up any pending writers in the
		 * _very_ unlikely case that the pipe was full, but we got
		 * no data.
		 */
		if (unlikely(was_full))
			wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);

		/*
		 * But because we didn't read anything, at this point we can
		 * just return directly with -ERESTARTSYS if we're interrupted,
		 * since we've done any required wakeups and there's no need
		 * to mark anything accessed. And we've dropped the lock.
		 */
		if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
			return -ERESTARTSYS;

		__pipe_lock(pipe);
		was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
		wake_next_reader = true;
	}
	if (pipe_empty(pipe->head, pipe->tail))
		wake_next_reader = false;
	__pipe_unlock(pipe);

	if (was_full)
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
	if (wake_next_reader)
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	if (ret > 0)
		file_accessed(filp);
	return ret;
}

static inline int is_packetized(struct file *file)
{
	return (file->f_flags & O_DIRECT) != 0;
}

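/*
 * Illustrative note: a "packetized" pipe is what userspace gets from
 * pipe2(fds, O_DIRECT). Writes of up to PIPE_BUF bytes are then delivered
 * as a single packet (tagged PIPE_BUF_FLAG_PACKET in pipe_write()), and
 * each read() returns at most one packet.
 */
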
/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_writable(const struct pipe_inode_info *pipe)
{
	unsigned int head = READ_ONCE(pipe->head);
	unsigned int tail = READ_ONCE(pipe->tail);
	unsigned int max_usage = READ_ONCE(pipe->max_usage);

	return !pipe_full(head, tail, max_usage) ||
		!READ_ONCE(pipe->readers);
}

static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int head;
	ssize_t ret = 0;
	size_t total_len = iov_iter_count(from);
	ssize_t chars;
	bool was_empty = false;
	bool wake_next_writer = false;

	/*
	 * Reject writing to watch queue pipes before the point where we lock
	 * the pipe.
	 * Otherwise, lockdep would be unhappy if the caller already has another
	 * pipe locked.
	 * If we had to support locking a normal pipe and a notification pipe at
	 * the same time, we could set up lockdep annotations for that, but
	 * since we don't actually need that, it's simpler to just bail here.
	 */
	if (pipe_has_watch_queue(pipe))
		return -EXDEV;

	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	__pipe_lock(pipe);

	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	/*
	 * If it wasn't empty we try to merge new data into
	 * the last buffer.
	 *
	 * That naturally merges small writes, but it also
	 * page-aligns the rest of the writes for large writes
	 * spanning multiple pages.
	 */
	head = pipe->head;
	was_empty = pipe_empty(head, pipe->tail);
	chars = total_len & (PAGE_SIZE-1);
	if (chars && !was_empty) {
		unsigned int mask = pipe->ring_size - 1;
		struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
		int offset = buf->offset + buf->len;

		if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
		    offset + chars <= PAGE_SIZE) {
			ret = pipe_buf_confirm(pipe, buf);
			if (ret)
				goto out;

			ret = copy_page_from_iter(buf->page, offset, chars, from);
			if (unlikely(ret < chars)) {
				ret = -EFAULT;
				goto out;
			}

			buf->len += ret;
			if (!iov_iter_count(from))
				goto out;
		}
	}

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		head = pipe->head;
		if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
			unsigned int mask = pipe->ring_size - 1;
			struct pipe_buffer *buf;
			struct page *page = pipe->tmp_page;
			int copied;

			if (!page) {
				page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}

			/* Allocate a slot in the ring in advance and attach an
			 * empty buffer. If we fault or otherwise fail to use
			 * it, either the reader will consume it or it'll still
			 * be there for the next write.
			 */
			spin_lock_irq(&pipe->rd_wait.lock);

			head = pipe->head;
			if (pipe_full(head, pipe->tail, pipe->max_usage)) {
				spin_unlock_irq(&pipe->rd_wait.lock);
				continue;
			}

			pipe->head = head + 1;
			spin_unlock_irq(&pipe->rd_wait.lock);

			/* Insert it into the buffer array */
			buf = &pipe->bufs[head & mask];
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = 0;
			if (is_packetized(filp))
				buf->flags = PIPE_BUF_FLAG_PACKET;
			else
				buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
			pipe->tmp_page = NULL;

			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += copied;
			buf->len = copied;

			if (!iov_iter_count(from))
				break;
		}

		if (!pipe_full(head, pipe->tail, pipe->max_usage))
			continue;

		/* Wait for buffer space to become available. */
		if ((filp->f_flags & O_NONBLOCK) ||
		    (iocb->ki_flags & IOCB_NOWAIT)) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		/*
		 * We're going to release the pipe lock and wait for more
		 * space. We wake up any readers if necessary, and then
		 * after waiting we need to re-check whether the pipe
		 * became empty while we dropped the lock.
		 */
		__pipe_unlock(pipe);
		if (was_empty)
			wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
		__pipe_lock(pipe);
		was_empty = pipe_empty(pipe->head, pipe->tail);
		wake_next_writer = true;
	}
out:
	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
		wake_next_writer = false;
	__pipe_unlock(pipe);

	/*
	 * If we do a wakeup event, we do a 'sync' wakeup, because we
	 * want the reader to start processing things asap, rather than
	 * leave the data pending.
	 *
	 * This is particularly important for small writes, because of
	 * how (for example) the GNU make jobserver uses small writes to
	 * wake up pending jobs.
	 *
	 * Epoll nonsensically wants a wakeup whether the pipe
	 * was already empty or not.
	 */
	if (was_empty || pipe->poll_usage)
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	if (wake_next_writer)
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
		int err = file_update_time(filp);
		if (err)
			ret = err;
		sb_end_write(file_inode(filp)->i_sb);
	}
	return ret;
}

static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int count, head, tail, mask;

	switch (cmd) {
	case FIONREAD:
		__pipe_lock(pipe);
		count = 0;
		head = pipe->head;
		tail = pipe->tail;
		mask = pipe->ring_size - 1;

		while (tail != head) {
			count += pipe->bufs[tail & mask].len;
			tail++;
		}
		__pipe_unlock(pipe);

		return put_user(count, (int __user *)arg);

#ifdef CONFIG_WATCH_QUEUE
	case IOC_WATCH_QUEUE_SET_SIZE: {
		int ret;
		__pipe_lock(pipe);
		ret = watch_queue_set_size(pipe, arg);
		__pipe_unlock(pipe);
		return ret;
	}

	case IOC_WATCH_QUEUE_SET_FILTER:
		return watch_queue_set_filter(
			pipe, (struct watch_notification_filter __user *)arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
}

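/*
 * Illustrative (userspace) use of FIONREAD above: it reports how many bytes
 * are currently queued in the pipe, e.g.
 *
 *	int avail;
 *	if (ioctl(pipefd[0], FIONREAD, &avail) == 0)
 *		printf("%d bytes buffered\n", avail);
 */
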
/* No kernel lock held - fine */
static __poll_t
pipe_poll(struct file *filp, poll_table *wait)
{
	__poll_t mask;
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int head, tail;

	/* Epoll has some historical nasty semantics, this enables them */
	WRITE_ONCE(pipe->poll_usage, true);

	/*
	 * Reading pipe state only -- no need for acquiring the semaphore.
	 *
	 * But because this is racy, the code has to add the
	 * entry to the poll table _first_ ..
	 */
	if (filp->f_mode & FMODE_READ)
		poll_wait(filp, &pipe->rd_wait, wait);
	if (filp->f_mode & FMODE_WRITE)
		poll_wait(filp, &pipe->wr_wait, wait);

	/*
	 * .. and only then can you do the racy tests. That way,
	 * if something changes and you got it wrong, the poll
	 * table entry will wake you up and fix it.
	 */
	head = READ_ONCE(pipe->head);
	tail = READ_ONCE(pipe->tail);

	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		if (!pipe_empty(head, tail))
			mask |= EPOLLIN | EPOLLRDNORM;
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= EPOLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		if (!pipe_full(head, tail, pipe->max_usage))
			mask |= EPOLLOUT | EPOLLWRNORM;
		/*
		 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= EPOLLERR;
	}

	return mask;
}

static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
{
	int kill = 0;

	spin_lock(&inode->i_lock);
	if (!--pipe->files) {
		inode->i_pipe = NULL;
		kill = 1;
	}
	spin_unlock(&inode->i_lock);

	if (kill)
		free_pipe_info(pipe);
}

static int
pipe_release(struct inode *inode, struct file *file)
{
	struct pipe_inode_info *pipe = file->private_data;

	__pipe_lock(pipe);
	if (file->f_mode & FMODE_READ)
		pipe->readers--;
	if (file->f_mode & FMODE_WRITE)
		pipe->writers--;

	/* Was that the last reader or writer, but not the other side? */
	if (!pipe->readers != !pipe->writers) {
		wake_up_interruptible_all(&pipe->rd_wait);
		wake_up_interruptible_all(&pipe->wr_wait);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return 0;
}

static int
pipe_fasync(int fd, struct file *filp, int on)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int retval = 0;

	__pipe_lock(pipe);
	if (filp->f_mode & FMODE_READ)
		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
		if (retval < 0 && (filp->f_mode & FMODE_READ))
			/* this can happen only if on == T */
			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
	}
	__pipe_unlock(pipe);
	return retval;
}

unsigned long account_pipe_buffers(struct user_struct *user,
				   unsigned long old, unsigned long new)
{
	return atomic_long_add_return(new - old, &user->pipe_bufs);
}

bool too_many_pipe_buffers_soft(unsigned long user_bufs)
{
	unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);

	return soft_limit && user_bufs > soft_limit;
}

bool too_many_pipe_buffers_hard(unsigned long user_bufs)
{
	unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);

	return hard_limit && user_bufs > hard_limit;
}

bool pipe_is_unprivileged_user(void)
{
	return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
}

struct pipe_inode_info *alloc_pipe_info(void)
{
	struct pipe_inode_info *pipe;
	unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
	struct user_struct *user = get_current_user();
	unsigned long user_bufs;
	unsigned int max_size = READ_ONCE(pipe_max_size);

	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
	if (pipe == NULL)
		goto out_free_uid;

	if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
		pipe_bufs = max_size >> PAGE_SHIFT;

	user_bufs = account_pipe_buffers(user, 0, pipe_bufs);

	if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
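		/*
		 * Over the per-user soft limit: clamp this pipe to the
		 * two-buffer minimum (see PIPE_MIN_DEF_BUFFERS above) and
		 * only charge the user for that.
		 */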
		user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS);
		pipe_bufs = PIPE_MIN_DEF_BUFFERS;
	}

	if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
		goto out_revert_acct;

	pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
			     GFP_KERNEL_ACCOUNT);

	if (pipe->bufs) {
		init_waitqueue_head(&pipe->rd_wait);
		init_waitqueue_head(&pipe->wr_wait);
		pipe->r_counter = pipe->w_counter = 1;
		pipe->max_usage = pipe_bufs;
		pipe->ring_size = pipe_bufs;
		pipe->nr_accounted = pipe_bufs;
		pipe->user = user;
		mutex_init(&pipe->mutex);
		return pipe;
	}

out_revert_acct:
	(void) account_pipe_buffers(user, pipe_bufs, 0);
	kfree(pipe);
out_free_uid:
	free_uid(user);
	return NULL;
}

void free_pipe_info(struct pipe_inode_info *pipe)
{
	unsigned int i;

#ifdef CONFIG_WATCH_QUEUE
	if (pipe->watch_queue)
		watch_queue_clear(pipe->watch_queue);
#endif

	(void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
	free_uid(pipe->user);
	for (i = 0; i < pipe->ring_size; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;
		if (buf->ops)
			pipe_buf_release(pipe, buf);
	}
#ifdef CONFIG_WATCH_QUEUE
	if (pipe->watch_queue)
		put_watch_queue(pipe->watch_queue);
#endif
	if (pipe->tmp_page)
		__free_page(pipe->tmp_page);
	kfree(pipe->bufs);
	kfree(pipe);
}

static struct vfsmount *pipe_mnt __read_mostly;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(buffer, buflen, "pipe:[%lu]",
				d_inode(dentry)->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname	= pipefs_dname,
};

static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	inode->i_ino = get_next_ino();

	pipe = alloc_pipe_info();
	if (!pipe)
		goto fail_iput;

	inode->i_pipe = pipe;
	pipe->files = 2;
	pipe->readers = pipe->writers = 1;
	inode->i_fop = &pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode);

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}

int create_pipe_files(struct file **res, int flags)
{
	struct inode *inode = get_pipe_inode();
	struct file *f;
	int error;

	if (!inode)
		return -ENFILE;

	if (flags & O_NOTIFICATION_PIPE) {
		error = watch_queue_init(inode->i_pipe);
		if (error) {
			free_pipe_info(inode->i_pipe);
			iput(inode);
			return error;
		}
	}

	f = alloc_file_pseudo(inode, pipe_mnt, "",
				O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
				&pipefifo_fops);
	if (IS_ERR(f)) {
		free_pipe_info(inode->i_pipe);
		iput(inode);
		return PTR_ERR(f);
	}

	f->private_data = inode->i_pipe;

	res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
				  &pipefifo_fops);
	if (IS_ERR(res[0])) {
		put_pipe_info(inode, inode->i_pipe);
		fput(f);
		return PTR_ERR(res[0]);
	}
	res[0]->private_data = inode->i_pipe;
	res[1] = f;
	stream_open(inode, res[0]);
	stream_open(inode, res[1]);
	return 0;
}

static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
	int error;
	int fdw, fdr;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
		return -EINVAL;

	error = create_pipe_files(files, flags);
	if (error)
		return error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	fd[0] = fdr;
	fd[1] = fdw;
	/* pipe groks IOCB_NOWAIT */
	files[0]->f_mode |= FMODE_NOWAIT;
	files[1]->f_mode |= FMODE_NOWAIT;
	return 0;

err_fdr:
	put_unused_fd(fdr);
err_read_pipe:
	fput(files[0]);
	fput(files[1]);
	return error;
}

int do_pipe_flags(int *fd, int flags)
{
	struct file *files[2];
	int error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		fd_install(fd[0], files[0]);
		fd_install(fd[1], files[1]);
	}
	return error;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
static int do_pipe2(int __user *fildes, int flags)
{
	struct file *files[2];
	int fd[2];
	int error;

	error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
			fput(files[0]);
			fput(files[1]);
			put_unused_fd(fd[0]);
			put_unused_fd(fd[1]);
			error = -EFAULT;
		} else {
			fd_install(fd[0], files[0]);
			fd_install(fd[1], files[1]);
		}
	}
	return error;
}

SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	return do_pipe2(fildes, flags);
}

SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return do_pipe2(fildes, 0);
}

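/*
 * Illustrative (userspace) view of the syscalls above, e.g. the classic
 * token-passing/jobserver pattern mentioned at the top of this file:
 *
 *	int fds[2];
 *	char token;
 *
 *	if (pipe2(fds, O_CLOEXEC) == 0) {
 *		write(fds[1], "x", 1);		// post a token
 *		read(fds[0], &token, 1);	// take a token
 *	}
 */
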
/*
 * This is the stupid "wait for pipe to be readable or writable"
 * model.
 *
 * See pipe_read/write() for the proper kind of exclusive wait,
 * but that requires that we wake up any other readers/writers
 * if we then do not end up reading everything (ie the whole
 * "wake_next_reader/writer" logic in pipe_read/write()).
 */
void pipe_wait_readable(struct pipe_inode_info *pipe)
{
	pipe_unlock(pipe);
	wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe));
	pipe_lock(pipe);
}

void pipe_wait_writable(struct pipe_inode_info *pipe)
{
	pipe_unlock(pipe);
	wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe));
	pipe_lock(pipe);
}

/*
 * This depends on both the wait (here) and the wakeup (wake_up_partner)
 * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot
 * race with the count check and waitqueue prep.
 *
 * Normally in order to avoid races, you'd do the prepare_to_wait() first,
 * then check the condition you're waiting for, and only then sleep. But
 * because of the pipe lock, we can check the condition before being on
 * the wait queue.
 *
 * We use the 'rd_wait' waitqueue for pipe partner waiting.
 */
static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
	DEFINE_WAIT(rdwait);
	int cur = *cnt;

	while (cur == *cnt) {
		prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
		pipe_unlock(pipe);
		schedule();
		finish_wait(&pipe->rd_wait, &rdwait);
		pipe_lock(pipe);
		if (signal_pending(current))
			break;
	}
	return cur == *cnt ? -ERESTARTSYS : 0;
}

static void wake_up_partner(struct pipe_inode_info *pipe)
{
	wake_up_interruptible_all(&pipe->rd_wait);
}

static int fifo_open(struct inode *inode, struct file *filp)
{
	struct pipe_inode_info *pipe;
	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
	int ret;

	filp->f_version = 0;

	spin_lock(&inode->i_lock);
	if (inode->i_pipe) {
		pipe = inode->i_pipe;
		pipe->files++;
		spin_unlock(&inode->i_lock);
	} else {
		spin_unlock(&inode->i_lock);
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;
		pipe->files = 1;
		spin_lock(&inode->i_lock);
		if (unlikely(inode->i_pipe)) {
			inode->i_pipe->files++;
			spin_unlock(&inode->i_lock);
			free_pipe_info(pipe);
			pipe = inode->i_pipe;
		} else {
			inode->i_pipe = pipe;
			spin_unlock(&inode->i_lock);
		}
	}
	filp->private_data = pipe;
	/* OK, we have a pipe and it's pinned down */

	__pipe_lock(pipe);

	/* We can only do regular read/write on fifos */
	stream_open(inode, filp);

	switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
	case FMODE_READ:
	/*
	 *  O_RDONLY
	 *  POSIX.1 says that O_NONBLOCK means return with the FIFO
	 *  opened, even when there is no process writing the FIFO.
	 */
		pipe->r_counter++;
		if (pipe->readers++ == 0)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->writers) {
			if ((filp->f_flags & O_NONBLOCK)) {
				/* suppress EPOLLHUP until we have
				 * seen a writer */
				filp->f_version = pipe->w_counter;
			} else {
				if (wait_for_partner(pipe, &pipe->w_counter))
					goto err_rd;
			}
		}
		break;

	case FMODE_WRITE:
	/*
	 *  O_WRONLY
	 *  POSIX.1 says that O_NONBLOCK means return -1 with
	 *  errno=ENXIO when there is no process reading the FIFO.
	 */
		ret = -ENXIO;
		if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
			goto err;

		pipe->w_counter++;
		if (!pipe->writers++)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->readers) {
			if (wait_for_partner(pipe, &pipe->r_counter))
				goto err_wr;
		}
		break;

	case FMODE_READ | FMODE_WRITE:
	/*
	 *  O_RDWR
	 *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
	 *  This implementation will NEVER block on a O_RDWR open, since
	 *  the process can at least talk to itself.
	 */

		pipe->readers++;
		pipe->writers++;
		pipe->r_counter++;
		pipe->w_counter++;
		if (pipe->readers == 1 || pipe->writers == 1)
			wake_up_partner(pipe);
		break;

	default:
		ret = -EINVAL;
		goto err;
	}

	/* Ok! */
	__pipe_unlock(pipe);
	return 0;

err_rd:
	if (!--pipe->readers)
		wake_up_interruptible(&pipe->wr_wait);
	ret = -ERESTARTSYS;
	goto err;

err_wr:
	if (!--pipe->writers)
		wake_up_interruptible_all(&pipe->rd_wait);
	ret = -ERESTARTSYS;
	goto err;

err:
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return ret;
}

const struct file_operations pipefifo_fops = {
	.open		= fifo_open,
	.llseek		= no_llseek,
	.read_iter	= pipe_read,
	.write_iter	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.release	= pipe_release,
	.fasync		= pipe_fasync,
	.splice_write	= iter_file_splice_write,
};

/*
 * Currently we rely on the pipe array holding a power-of-2 number
 * of pages. Returns 0 on error.
 */
unsigned int round_pipe_size(unsigned int size)
{
	if (size > (1U << 31))
		return 0;

	/* Minimum pipe size, as required by POSIX */
	if (size < PAGE_SIZE)
		return PAGE_SIZE;

	return roundup_pow_of_two(size);
}

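/*
 * Illustrative (userspace) consequence of the rounding above: F_SETPIPE_SZ
 * rounds the request up to a power-of-two number of pages and fcntl()
 * returns the resulting capacity, e.g. on a 4K-page system:
 *
 *	int sz = fcntl(pipefd[1], F_SETPIPE_SZ, 100000);
 *	// sz == 131072 (100000 rounded up to 2^17)
 */
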
/*
 * Resize the pipe ring to a number of slots.
 *
 * Note the pipe can be reduced in capacity, but only if the current
 * occupancy doesn't exceed nr_slots; if it does, EBUSY will be
 * returned instead.
 */
int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
{
	struct pipe_buffer *bufs;
	unsigned int head, tail, mask, n;

	bufs = kcalloc(nr_slots, sizeof(*bufs),
		       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
	if (unlikely(!bufs))
		return -ENOMEM;

	spin_lock_irq(&pipe->rd_wait.lock);
	mask = pipe->ring_size - 1;
	head = pipe->head;
	tail = pipe->tail;

	n = pipe_occupancy(head, tail);
	if (nr_slots < n) {
		spin_unlock_irq(&pipe->rd_wait.lock);
		kfree(bufs);
		return -EBUSY;
	}

	/*
	 * The pipe array wraps around, so just start the new one at zero
	 * and adjust the indices.
	 */
	if (n > 0) {
		unsigned int h = head & mask;
		unsigned int t = tail & mask;
		if (h > t) {
			memcpy(bufs, pipe->bufs + t,
			       n * sizeof(struct pipe_buffer));
		} else {
			unsigned int tsize = pipe->ring_size - t;
			if (h > 0)
				memcpy(bufs + tsize, pipe->bufs,
				       h * sizeof(struct pipe_buffer));
			memcpy(bufs, pipe->bufs + t,
			       tsize * sizeof(struct pipe_buffer));
		}
	}

	head = n;
	tail = 0;

	kfree(pipe->bufs);
	pipe->bufs = bufs;
	pipe->ring_size = nr_slots;
	if (pipe->max_usage > nr_slots)
		pipe->max_usage = nr_slots;
	pipe->tail = tail;
	pipe->head = head;

	if (!pipe_has_watch_queue(pipe)) {
		pipe->max_usage = nr_slots;
		pipe->nr_accounted = nr_slots;
	}

	spin_unlock_irq(&pipe->rd_wait.lock);

	/* This might have made more room for writers */
	wake_up_interruptible(&pipe->wr_wait);
	return 0;
}

/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or -ERROR on error.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg)
{
	unsigned long user_bufs;
	unsigned int nr_slots, size;
	long ret = 0;

	if (pipe_has_watch_queue(pipe))
		return -EBUSY;

	size = round_pipe_size(arg);
	nr_slots = size >> PAGE_SHIFT;

	if (!nr_slots)
		return -EINVAL;

	/*
	 * If trying to increase the pipe capacity, check that an
	 * unprivileged user is not trying to exceed various limits
	 * (soft limit check here, hard limit check just below).
	 * Decreasing the pipe capacity is always permitted, even
	 * if the user is currently over a limit.
	 */
	if (nr_slots > pipe->max_usage &&
			size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
		return -EPERM;

	user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots);

	if (nr_slots > pipe->max_usage &&
			(too_many_pipe_buffers_hard(user_bufs) ||
			 too_many_pipe_buffers_soft(user_bufs)) &&
			pipe_is_unprivileged_user()) {
		ret = -EPERM;
		goto out_revert_acct;
	}

	ret = pipe_resize_ring(pipe, nr_slots);
	if (ret < 0)
		goto out_revert_acct;

	return pipe->max_usage * PAGE_SIZE;

out_revert_acct:
	(void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted);
	return ret;
}

/*
 * Note that i_pipe and i_cdev share the same location, so checking ->i_pipe is
 * not enough to verify that this is a pipe.
 */
struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
{
	struct pipe_inode_info *pipe = file->private_data;

	if (file->f_op != &pipefifo_fops || !pipe)
		return NULL;
	if (for_splice && pipe_has_watch_queue(pipe))
		return NULL;
	return pipe;
}

long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
{
	struct pipe_inode_info *pipe;
	long ret;

	pipe = get_pipe_info(file, false);
	if (!pipe)
		return -EBADF;

	__pipe_lock(pipe);

	switch (cmd) {
	case F_SETPIPE_SZ:
		ret = pipe_set_size(pipe, arg);
		break;
	case F_GETPIPE_SZ:
		ret = pipe->max_usage * PAGE_SIZE;
		break;
	default:
		ret = -EINVAL;
		break;
	}

	__pipe_unlock(pipe);
	return ret;
}

static const struct super_operations pipefs_ops = {
	.destroy_inode = free_inode_nonrcu,
	.statfs = simple_statfs,
};

/*
 * pipefs should _never_ be mounted by userland - too much of security hassle,
 * no real gain from having the whole whorehouse mounted. So we don't need
 * any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */

static int pipefs_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC);
	if (!ctx)
		return -ENOMEM;
	ctx->ops = &pipefs_ops;
	ctx->dops = &pipefs_dentry_operations;
	return 0;
}

static struct file_system_type pipe_fs_type = {
	.name		= "pipefs",
	.init_fs_context = pipefs_init_fs_context,
	.kill_sb	= kill_anon_super,
};

#ifdef CONFIG_SYSCTL
static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
					unsigned int *valp,
					int write, void *data)
{
	if (write) {
		unsigned int val;

		val = round_pipe_size(*lvalp);
		if (val == 0)
			return -EINVAL;

		*valp = val;
	} else {
		unsigned int val = *valp;
		*lvalp = (unsigned long) val;
	}

	return 0;
}

static int proc_dopipe_max_size(struct ctl_table *table, int write,
				void *buffer, size_t *lenp, loff_t *ppos)
{
	return do_proc_douintvec(table, write, buffer, lenp, ppos,
				 do_proc_dopipe_max_size_conv, NULL);
}

static struct ctl_table fs_pipe_sysctls[] = {
	{
		.procname	= "pipe-max-size",
		.data		= &pipe_max_size,
		.maxlen		= sizeof(pipe_max_size),
		.mode		= 0644,
		.proc_handler	= proc_dopipe_max_size,
	},
	{
		.procname	= "pipe-user-pages-hard",
		.data		= &pipe_user_pages_hard,
		.maxlen		= sizeof(pipe_user_pages_hard),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
	},
	{
		.procname	= "pipe-user-pages-soft",
		.data		= &pipe_user_pages_soft,
		.maxlen		= sizeof(pipe_user_pages_soft),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
	},
	{ }
};
#endif

static int __init init_pipe_fs(void)
{
	int err = register_filesystem(&pipe_fs_type);

	if (!err) {
		pipe_mnt = kern_mount(&pipe_fs_type);
		if (IS_ERR(pipe_mnt)) {
			err = PTR_ERR(pipe_mnt);
			unregister_filesystem(&pipe_fs_type);
		}
	}
#ifdef CONFIG_SYSCTL
	register_sysctl_init("fs", fs_pipe_sysctls);
#endif
	return err;
}

fs_initcall(init_pipe_fs);