// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <linux/memcontrol.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>

#include "internal.h"

/*
 * The max size that a non-root user is allowed to grow the pipe. Can
 * be set by root in /proc/sys/fs/pipe-max-size
 */
unsigned int pipe_max_size = 1048576;

/* Maximum allocatable pages per user. Hard limit is unset by default, soft
 * matches default values.
 */
unsigned long pipe_user_pages_hard;
unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;

/*
 * We use head and tail indices that aren't masked off, except at the point of
 * dereference, but rather they're allowed to wrap naturally. This means there
 * isn't a dead spot in the buffer, but the ring has to be a power of two and
 * <= 2^31.
 * -- David Howells 2019-09-23.
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
	if (pipe->files)
		mutex_lock_nested(&pipe->mutex, subclass);
}

void pipe_lock(struct pipe_inode_info *pipe)
{
	/*
	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
	 */
	pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);

void pipe_unlock(struct pipe_inode_info *pipe)
{
	if (pipe->files)
		mutex_unlock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_unlock);

static inline void __pipe_lock(struct pipe_inode_info *pipe)
{
	mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
}

static inline void __pipe_unlock(struct pipe_inode_info *pipe)
{
	mutex_unlock(&pipe->mutex);
}

void pipe_double_lock(struct pipe_inode_info *pipe1,
		      struct pipe_inode_info *pipe2)
{
	BUG_ON(pipe1 == pipe2);

	if (pipe1 < pipe2) {
		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
	} else {
		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
	}
}

/* Drop the inode semaphore and wait for a pipe event, atomically */
void pipe_wait(struct pipe_inode_info *pipe)
{
	DEFINE_WAIT(rdwait);
	DEFINE_WAIT(wrwait);

	/*
	 * Pipes are system-local resources, so sleeping on them
	 * is considered a noninteractive wait:
	 */
	prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
	prepare_to_wait(&pipe->wr_wait, &wrwait, TASK_INTERRUPTIBLE);
	pipe_unlock(pipe);
	schedule();
	finish_wait(&pipe->rd_wait, &rdwait);
	finish_wait(&pipe->wr_wait, &wrwait);
	pipe_lock(pipe);
}

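/*
 * A rough sketch of the ring helpers this file leans on (pipe_empty(),
 * pipe_full() and pipe_occupancy() from <linux/pipe_fs_i.h> are the
 * authoritative definitions); with the unmasked head/tail scheme described
 * above they reduce to plain unsigned arithmetic:
 *
 *	occupancy = head - tail;		// wraps safely, ring is 2^n
 *	empty     = (head == tail);
 *	full      = (head - tail >= limit);
 *	slot      = &pipe->bufs[tail & (pipe->ring_size - 1)];
 *
 * Only the final dereference masks the index; the counters themselves are
 * free to wrap around the unsigned range.
 */
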
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * If nobody else uses this page, and we don't already have a
	 * temporary page, let's keep track of it as a one-deep
	 * allocation cache. (Otherwise just release our reference to it)
	 */
	if (page_count(page) == 1 && !pipe->tmp_page)
		pipe->tmp_page = page;
	else
		put_page(page);
}

static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
			       struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	if (page_count(page) == 1) {
		memcg_kmem_uncharge(page, 0);
		__SetPageLocked(page);
		return 0;
	}
	return 1;
}

/**
 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to attempt to steal
 *
 * Description:
 *	This function attempts to steal the &struct page attached to
 *	@buf. If successful, this function returns 0 and returns with
 *	the page locked. The caller may then reuse the page for whatever
 *	he wishes; the typical use is insertion into a different file
 *	page cache.
 */
int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * A reference of one is golden, that means that the owner of this
	 * page is the only one holding a reference to it. lock the page
	 * and return OK.
	 */
	if (page_count(page) == 1) {
		lock_page(page);
		return 0;
	}

	return 1;
}
EXPORT_SYMBOL(generic_pipe_buf_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to get a reference to
 *
 * Description:
 *	This function grabs an extra reference to @buf. It's used in
 *	the tee() system call, when we duplicate the buffers in one
 *	pipe into another.
 */
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	return try_get_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_confirm - verify contents of the pipe buffer
 * @info:	the pipe that the buffer belongs to
 * @buf:	the buffer to confirm
 *
 * Description:
 *	This function does nothing, because the generic pipe code uses
 *	pages that are always good when inserted into the pipe.
 */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
			     struct pipe_buffer *buf)
{
	return 0;
}
EXPORT_SYMBOL(generic_pipe_buf_confirm);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to put a reference to
 *
 * Description:
 *	This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	put_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

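/*
 * Note on the three pipe_buf_operations tables below: they share the same
 * callbacks and differ only by identity.  pipe_buf_can_merge() and
 * pipe_buf_mark_unmergeable() compare the ops pointer itself, so giving a
 * buffer anon_pipe_buf_nomerge_ops or packet_pipe_buf_ops is what makes it
 * unmergeable; no callback behaves differently.
 */
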
/* New data written to a pipe may be appended to a buffer with this type. */
static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = anon_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = {
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = anon_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static const struct pipe_buf_operations packet_pipe_buf_ops = {
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = anon_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

/**
 * pipe_buf_mark_unmergeable - mark a &struct pipe_buffer as unmergeable
 * @buf:	the buffer to mark
 *
 * Description:
 *	This function ensures that no future writes will be merged into the
 *	given &struct pipe_buffer. This is necessary when multiple pipe buffers
 *	share the same backing page.
 */
void pipe_buf_mark_unmergeable(struct pipe_buffer *buf)
{
	if (buf->ops == &anon_pipe_buf_ops)
		buf->ops = &anon_pipe_buf_nomerge_ops;
}

static bool pipe_buf_can_merge(struct pipe_buffer *buf)
{
	return buf->ops == &anon_pipe_buf_ops;
}

/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_readable(const struct pipe_inode_info *pipe)
{
	unsigned int head = READ_ONCE(pipe->head);
	unsigned int tail = READ_ONCE(pipe->tail);
	unsigned int writers = READ_ONCE(pipe->writers);

	return !pipe_empty(head, tail) || !writers;
}

static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
	size_t total_len = iov_iter_count(to);
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	bool was_full, wake_next_reader = false;
	ssize_t ret;

	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	ret = 0;
	__pipe_lock(pipe);

	/*
	 * We only wake up writers if the pipe was full when we started
	 * reading in order to avoid unnecessary wakeups.
	 *
	 * But when we do wake up writers, we do so using a sync wakeup
	 * (WF_SYNC), because we want them to get going and generate more
	 * data for us.
	 */
	was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
	for (;;) {
		unsigned int head = pipe->head;
		unsigned int tail = pipe->tail;
		unsigned int mask = pipe->ring_size - 1;

		if (!pipe_empty(head, tail)) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			size_t chars = buf->len;
			size_t written;
			int error;

			if (chars > total_len)
				chars = total_len;

			error = pipe_buf_confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			written = copy_page_to_iter(buf->page, buf->offset, chars, to);
			if (unlikely(written < chars)) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += chars;
			buf->offset += chars;
			buf->len -= chars;

			/* Was it a packet buffer? Clean up and exit */
			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
				total_len = chars;
				buf->len = 0;
			}

			if (!buf->len) {
				pipe_buf_release(pipe, buf);
				spin_lock_irq(&pipe->rd_wait.lock);
				tail++;
				pipe->tail = tail;
				spin_unlock_irq(&pipe->rd_wait.lock);
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
			if (!pipe_empty(head, tail))	/* More to do? */
				continue;
		}

		if (!pipe->writers)
			break;
		if (ret)
			break;
		if (filp->f_flags & O_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		__pipe_unlock(pipe);

		/*
		 * We only get here if we didn't actually read anything.
		 *
		 * However, we could have seen (and removed) a zero-sized
		 * pipe buffer, and might have made space in the buffers
		 * that way.
		 *
		 * You can't make zero-sized pipe buffers by doing an empty
		 * write (not even in packet mode), but they can happen if
		 * the writer gets an EFAULT when trying to fill a buffer
		 * that already got allocated and inserted in the buffer
		 * array.
		 *
		 * So we still need to wake up any pending writers in the
		 * _very_ unlikely case that the pipe was full, but we got
		 * no data.
		 */
		if (unlikely(was_full)) {
			wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
		}

		/*
		 * But because we didn't read anything, at this point we can
		 * just return directly with -ERESTARTSYS if we're interrupted,
		 * since we've done any required wakeups and there's no need
		 * to mark anything accessed. And we've dropped the lock.
		 */
		if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
			return -ERESTARTSYS;

		__pipe_lock(pipe);
		was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
		wake_next_reader = true;
	}
	if (pipe_empty(pipe->head, pipe->tail))
		wake_next_reader = false;
	__pipe_unlock(pipe);

	if (was_full) {
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	if (wake_next_reader)
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
	if (ret > 0)
		file_accessed(filp);
	return ret;
}

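/*
 * Packetized pipes: a pipe created with O_DIRECT gets PIPE_BUF_FLAG_PACKET
 * on every buffer (see pipe_write() below), and pipe_read() above stops at
 * the packet boundary.  A rough userspace sketch of the resulting behaviour
 * (illustration only, see pipe(2) for the authoritative description):
 *
 *	int fds[2];
 *	pipe2(fds, O_DIRECT);		// each write() becomes one packet
 *	write(fds[1], "ab", 2);
 *	write(fds[1], "cd", 2);
 *	read(fds[0], buf, 4);		// returns 2 ("ab"), not 4
 */
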
static inline int is_packetized(struct file *file)
{
	return (file->f_flags & O_DIRECT) != 0;
}

/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_writable(const struct pipe_inode_info *pipe)
{
	unsigned int head = READ_ONCE(pipe->head);
	unsigned int tail = READ_ONCE(pipe->tail);
	unsigned int max_usage = READ_ONCE(pipe->max_usage);

	return !pipe_full(head, tail, max_usage) ||
	       !READ_ONCE(pipe->readers);
}

static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int head;
	ssize_t ret = 0;
	size_t total_len = iov_iter_count(from);
	ssize_t chars;
	bool was_empty = false;
	bool wake_next_writer = false;

	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	__pipe_lock(pipe);

	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	/*
	 * Only wake up if the pipe started out empty, since
	 * otherwise there should be no readers waiting.
	 *
	 * If it wasn't empty we try to merge new data into
	 * the last buffer.
	 *
	 * That naturally merges small writes, but it also
	 * page-aligns the rest of the writes for large writes
	 * spanning multiple pages.
	 */
	head = pipe->head;
	was_empty = pipe_empty(head, pipe->tail);
	chars = total_len & (PAGE_SIZE-1);
	if (chars && !was_empty) {
		unsigned int mask = pipe->ring_size - 1;
		struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
		int offset = buf->offset + buf->len;

		if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) {
			ret = pipe_buf_confirm(pipe, buf);
			if (ret)
				goto out;

			ret = copy_page_from_iter(buf->page, offset, chars, from);
			if (unlikely(ret < chars)) {
				ret = -EFAULT;
				goto out;
			}

			buf->len += ret;
			if (!iov_iter_count(from))
				goto out;
		}
	}

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		head = pipe->head;
		if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
			unsigned int mask = pipe->ring_size - 1;
			struct pipe_buffer *buf = &pipe->bufs[head & mask];
			struct page *page = pipe->tmp_page;
			int copied;

			if (!page) {
				page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}

			/* Allocate a slot in the ring in advance and attach an
			 * empty buffer.  If we fault or otherwise fail to use
			 * it, either the reader will consume it or it'll still
			 * be there for the next write.
			 */
			spin_lock_irq(&pipe->rd_wait.lock);

			head = pipe->head;
			if (pipe_full(head, pipe->tail, pipe->max_usage)) {
				spin_unlock_irq(&pipe->rd_wait.lock);
				continue;
			}

			pipe->head = head + 1;
			spin_unlock_irq(&pipe->rd_wait.lock);

			/* Insert it into the buffer array */
			buf = &pipe->bufs[head & mask];
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = 0;
			buf->flags = 0;
			if (is_packetized(filp)) {
				buf->ops = &packet_pipe_buf_ops;
				buf->flags = PIPE_BUF_FLAG_PACKET;
			}
			pipe->tmp_page = NULL;

			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += copied;
			buf->offset = 0;
			buf->len = copied;

			if (!iov_iter_count(from))
				break;
		}

		if (!pipe_full(head, pipe->tail, pipe->max_usage))
			continue;

		/* Wait for buffer space to become available. */
		if (filp->f_flags & O_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		/*
		 * We're going to release the pipe lock and wait for more
		 * space. We wake up any readers if necessary, and then
		 * after waiting we need to re-check whether the pipe
		 * became empty while we dropped the lock.
		 */
		__pipe_unlock(pipe);
		if (was_empty) {
			wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		}
		wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
		__pipe_lock(pipe);
		was_empty = pipe_empty(pipe->head, pipe->tail);
		wake_next_writer = true;
	}
out:
	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
		wake_next_writer = false;
	__pipe_unlock(pipe);

	/*
	 * If we do do a wakeup event, we do a 'sync' wakeup, because we
	 * want the reader to start processing things asap, rather than
	 * leave the data pending.
	 *
	 * This is particularly important for small writes, because of
	 * how (for example) the GNU make jobserver uses small writes to
	 * wake up pending jobs
	 */
	if (was_empty) {
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}
	if (wake_next_writer)
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
		int err = file_update_time(filp);
		if (err)
			ret = err;
		sb_end_write(file_inode(filp)->i_sb);
	}
	return ret;
}

static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int count, head, tail, mask;

	switch (cmd) {
	case FIONREAD:
		__pipe_lock(pipe);
		count = 0;
		head = pipe->head;
		tail = pipe->tail;
		mask = pipe->ring_size - 1;

		while (tail != head) {
			count += pipe->bufs[tail & mask].len;
			tail++;
		}
		__pipe_unlock(pipe);

		return put_user(count, (int __user *)arg);
	default:
		return -ENOIOCTLCMD;
	}
}

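/*
 * FIONREAD (handled by pipe_ioctl() above) reports the number of unread
 * bytes queued in the ring.  Rough userspace sketch (illustration only,
 * pipefd[0] is assumed to be the read end of a pipe):
 *
 *	int avail;
 *	ioctl(pipefd[0], FIONREAD, &avail);	// bytes readable right now
 */
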
/* No kernel lock held - fine */
static __poll_t
pipe_poll(struct file *filp, poll_table *wait)
{
	__poll_t mask;
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int head, tail;

	/*
	 * Reading pipe state only -- no need for acquiring the semaphore.
	 *
	 * But because this is racy, the code has to add the
	 * entry to the poll table _first_ ..
	 */
	if (filp->f_mode & FMODE_READ)
		poll_wait(filp, &pipe->rd_wait, wait);
	if (filp->f_mode & FMODE_WRITE)
		poll_wait(filp, &pipe->wr_wait, wait);

	/*
	 * .. and only then can you do the racy tests. That way,
	 * if something changes and you got it wrong, the poll
	 * table entry will wake you up and fix it.
	 */
	head = READ_ONCE(pipe->head);
	tail = READ_ONCE(pipe->tail);

	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		if (!pipe_empty(head, tail))
			mask |= EPOLLIN | EPOLLRDNORM;
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= EPOLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		if (!pipe_full(head, tail, pipe->max_usage))
			mask |= EPOLLOUT | EPOLLWRNORM;
		/*
		 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= EPOLLERR;
	}

	return mask;
}

static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
{
	int kill = 0;

	spin_lock(&inode->i_lock);
	if (!--pipe->files) {
		inode->i_pipe = NULL;
		kill = 1;
	}
	spin_unlock(&inode->i_lock);

	if (kill)
		free_pipe_info(pipe);
}

static int
pipe_release(struct inode *inode, struct file *file)
{
	struct pipe_inode_info *pipe = file->private_data;

	__pipe_lock(pipe);
	if (file->f_mode & FMODE_READ)
		pipe->readers--;
	if (file->f_mode & FMODE_WRITE)
		pipe->writers--;

	if (pipe->readers || pipe->writers) {
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLHUP);
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM | EPOLLERR | EPOLLHUP);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return 0;
}

static int
pipe_fasync(int fd, struct file *filp, int on)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int retval = 0;

	__pipe_lock(pipe);
	if (filp->f_mode & FMODE_READ)
		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
		if (retval < 0 && (filp->f_mode & FMODE_READ))
			/* this can happen only if on == T */
			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
	}
	__pipe_unlock(pipe);
	return retval;
}

static unsigned long account_pipe_buffers(struct user_struct *user,
					  unsigned long old, unsigned long new)
{
	return atomic_long_add_return(new - old, &user->pipe_bufs);
}

static bool too_many_pipe_buffers_soft(unsigned long user_bufs)
{
	unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);

	return soft_limit && user_bufs > soft_limit;
}

static bool too_many_pipe_buffers_hard(unsigned long user_bufs)
{
	unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);

	return hard_limit && user_bufs > hard_limit;
}

static bool is_unprivileged_user(void)
{
	return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
}

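/*
 * Per-user accounting: every ring slot is charged to the owning user via
 * account_pipe_buffers() above (user->pipe_bufs).  For unprivileged users,
 * exceeding the soft limit shrinks new pipes to a single slot in
 * alloc_pipe_info() below, and exceeding the hard limit fails the
 * allocation outright.  Both limits are expected to be tunable through
 * /proc/sys/fs/pipe-user-pages-soft and /proc/sys/fs/pipe-user-pages-hard.
 */
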
struct pipe_inode_info *alloc_pipe_info(void)
{
	struct pipe_inode_info *pipe;
	unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
	struct user_struct *user = get_current_user();
	unsigned long user_bufs;
	unsigned int max_size = READ_ONCE(pipe_max_size);

	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
	if (pipe == NULL)
		goto out_free_uid;

	if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
		pipe_bufs = max_size >> PAGE_SHIFT;

	user_bufs = account_pipe_buffers(user, 0, pipe_bufs);

	if (too_many_pipe_buffers_soft(user_bufs) && is_unprivileged_user()) {
		user_bufs = account_pipe_buffers(user, pipe_bufs, 1);
		pipe_bufs = 1;
	}

	if (too_many_pipe_buffers_hard(user_bufs) && is_unprivileged_user())
		goto out_revert_acct;

	pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
			     GFP_KERNEL_ACCOUNT);

	if (pipe->bufs) {
		init_waitqueue_head(&pipe->rd_wait);
		init_waitqueue_head(&pipe->wr_wait);
		pipe->r_counter = pipe->w_counter = 1;
		pipe->max_usage = pipe_bufs;
		pipe->ring_size = pipe_bufs;
		pipe->user = user;
		mutex_init(&pipe->mutex);
		return pipe;
	}

out_revert_acct:
	(void) account_pipe_buffers(user, pipe_bufs, 0);
	kfree(pipe);
out_free_uid:
	free_uid(user);
	return NULL;
}

void free_pipe_info(struct pipe_inode_info *pipe)
{
	int i;

	(void) account_pipe_buffers(pipe->user, pipe->ring_size, 0);
	free_uid(pipe->user);
	for (i = 0; i < pipe->ring_size; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;
		if (buf->ops)
			pipe_buf_release(pipe, buf);
	}
	if (pipe->tmp_page)
		__free_page(pipe->tmp_page);
	kfree(pipe->bufs);
	kfree(pipe);
}

static struct vfsmount *pipe_mnt __read_mostly;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
			     d_inode(dentry)->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname	= pipefs_dname,
};

static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	inode->i_ino = get_next_ino();

	pipe = alloc_pipe_info();
	if (!pipe)
		goto fail_iput;

	inode->i_pipe = pipe;
	pipe->files = 2;
	pipe->readers = pipe->writers = 1;
	inode->i_fop = &pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}

int create_pipe_files(struct file **res, int flags)
{
	struct inode *inode = get_pipe_inode();
	struct file *f;

	if (!inode)
		return -ENFILE;

	f = alloc_file_pseudo(inode, pipe_mnt, "",
			      O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
			      &pipefifo_fops);
	if (IS_ERR(f)) {
		free_pipe_info(inode->i_pipe);
		iput(inode);
		return PTR_ERR(f);
	}

	f->private_data = inode->i_pipe;

	res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
				  &pipefifo_fops);
	if (IS_ERR(res[0])) {
		put_pipe_info(inode, inode->i_pipe);
		fput(f);
		return PTR_ERR(res[0]);
	}
	res[0]->private_data = inode->i_pipe;
	res[1] = f;
	stream_open(inode, res[0]);
	stream_open(inode, res[1]);
	return 0;
}

static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
	int error;
	int fdw, fdr;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
		return -EINVAL;

	error = create_pipe_files(files, flags);
	if (error)
		return error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	fd[0] = fdr;
	fd[1] = fdw;
	return 0;

 err_fdr:
	put_unused_fd(fdr);
 err_read_pipe:
	fput(files[0]);
	fput(files[1]);
	return error;
}

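/*
 * Only O_CLOEXEC, O_NONBLOCK and O_DIRECT are accepted by pipe2(); anything
 * else is rejected with -EINVAL by __do_pipe_flags() above.  Rough userspace
 * sketch (illustration only):
 *
 *	int fds[2];
 *	if (pipe2(fds, O_CLOEXEC | O_NONBLOCK) == 0) {
 *		// fds[0] is the read end, fds[1] the write end
 *	}
 */
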
int do_pipe_flags(int *fd, int flags)
{
	struct file *files[2];
	int error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		fd_install(fd[0], files[0]);
		fd_install(fd[1], files[1]);
	}
	return error;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
static int do_pipe2(int __user *fildes, int flags)
{
	struct file *files[2];
	int fd[2];
	int error;

	error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
			fput(files[0]);
			fput(files[1]);
			put_unused_fd(fd[0]);
			put_unused_fd(fd[1]);
			error = -EFAULT;
		} else {
			fd_install(fd[0], files[0]);
			fd_install(fd[1], files[1]);
		}
	}
	return error;
}

SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	return do_pipe2(fildes, flags);
}

SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return do_pipe2(fildes, 0);
}

static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
	int cur = *cnt;

	while (cur == *cnt) {
		pipe_wait(pipe);
		if (signal_pending(current))
			break;
	}
	return cur == *cnt ? -ERESTARTSYS : 0;
}

static void wake_up_partner(struct pipe_inode_info *pipe)
{
	wake_up_interruptible(&pipe->rd_wait);
	wake_up_interruptible(&pipe->wr_wait);
}

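/*
 * r_counter and w_counter count how many times the FIFO has been opened for
 * reading and for writing.  wait_for_partner() above sleeps until the
 * relevant counter changes (i.e. until the other side performs an open),
 * while fifo_open() below bumps the counters and calls wake_up_partner().
 */
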
static int fifo_open(struct inode *inode, struct file *filp)
{
	struct pipe_inode_info *pipe;
	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
	int ret;

	filp->f_version = 0;

	spin_lock(&inode->i_lock);
	if (inode->i_pipe) {
		pipe = inode->i_pipe;
		pipe->files++;
		spin_unlock(&inode->i_lock);
	} else {
		spin_unlock(&inode->i_lock);
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;
		pipe->files = 1;
		spin_lock(&inode->i_lock);
		if (unlikely(inode->i_pipe)) {
			inode->i_pipe->files++;
			spin_unlock(&inode->i_lock);
			free_pipe_info(pipe);
			pipe = inode->i_pipe;
		} else {
			inode->i_pipe = pipe;
			spin_unlock(&inode->i_lock);
		}
	}
	filp->private_data = pipe;
	/* OK, we have a pipe and it's pinned down */

	__pipe_lock(pipe);

	/* We can only do regular read/write on fifos */
	stream_open(inode, filp);

	switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
	case FMODE_READ:
	/*
	 *  O_RDONLY
	 *  POSIX.1 says that O_NONBLOCK means return with the FIFO
	 *  opened, even when there is no process writing the FIFO.
	 */
		pipe->r_counter++;
		if (pipe->readers++ == 0)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->writers) {
			if ((filp->f_flags & O_NONBLOCK)) {
				/* suppress EPOLLHUP until we have
				 * seen a writer */
				filp->f_version = pipe->w_counter;
			} else {
				if (wait_for_partner(pipe, &pipe->w_counter))
					goto err_rd;
			}
		}
		break;

	case FMODE_WRITE:
	/*
	 *  O_WRONLY
	 *  POSIX.1 says that O_NONBLOCK means return -1 with
	 *  errno=ENXIO when there is no process reading the FIFO.
	 */
		ret = -ENXIO;
		if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
			goto err;

		pipe->w_counter++;
		if (!pipe->writers++)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->readers) {
			if (wait_for_partner(pipe, &pipe->r_counter))
				goto err_wr;
		}
		break;

	case FMODE_READ | FMODE_WRITE:
	/*
	 *  O_RDWR
	 *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
	 *  This implementation will NEVER block on an O_RDWR open, since
	 *  the process can at least talk to itself.
	 */

		pipe->readers++;
		pipe->writers++;
		pipe->r_counter++;
		pipe->w_counter++;
		if (pipe->readers == 1 || pipe->writers == 1)
			wake_up_partner(pipe);
		break;

	default:
		ret = -EINVAL;
		goto err;
	}

	/* Ok! */
	__pipe_unlock(pipe);
	return 0;

err_rd:
	if (!--pipe->readers)
		wake_up_interruptible(&pipe->wr_wait);
	ret = -ERESTARTSYS;
	goto err;

err_wr:
	if (!--pipe->writers)
		wake_up_interruptible(&pipe->rd_wait);
	ret = -ERESTARTSYS;
	goto err;

err:
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return ret;
}

const struct file_operations pipefifo_fops = {
	.open		= fifo_open,
	.llseek		= no_llseek,
	.read_iter	= pipe_read,
	.write_iter	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.release	= pipe_release,
	.fasync		= pipe_fasync,
};

/*
 * Currently we rely on the pipe array holding a power-of-2 number
 * of pages. Returns 0 on error.
 */
unsigned int round_pipe_size(unsigned long size)
{
	if (size > (1U << 31))
		return 0;

	/* Minimum pipe size, as required by POSIX */
	if (size < PAGE_SIZE)
		return PAGE_SIZE;

	return roundup_pow_of_two(size);
}

/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or -ERROR on error.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
{
	struct pipe_buffer *bufs;
	unsigned int size, nr_slots, head, tail, mask, n;
	unsigned long user_bufs;
	long ret = 0;

	size = round_pipe_size(arg);
	nr_slots = size >> PAGE_SHIFT;

	if (!nr_slots)
		return -EINVAL;

	/*
	 * If trying to increase the pipe capacity, check that an
	 * unprivileged user is not trying to exceed various limits
	 * (soft limit check here, hard limit check just below).
	 * Decreasing the pipe capacity is always permitted, even
	 * if the user is currently over a limit.
	 */
	if (nr_slots > pipe->ring_size &&
	    size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
		return -EPERM;

	user_bufs = account_pipe_buffers(pipe->user, pipe->ring_size, nr_slots);

	if (nr_slots > pipe->ring_size &&
	    (too_many_pipe_buffers_hard(user_bufs) ||
	     too_many_pipe_buffers_soft(user_bufs)) &&
	    is_unprivileged_user()) {
		ret = -EPERM;
		goto out_revert_acct;
	}

	/*
	 * We can shrink the pipe, if arg is greater than the ring occupancy.
	 * Since we don't expect a lot of shrink+grow operations, just free and
	 * allocate again like we would do for growing.  If the pipe currently
	 * contains more buffers than arg, then return busy.
	 */
	mask = pipe->ring_size - 1;
	head = pipe->head;
	tail = pipe->tail;
	n = pipe_occupancy(pipe->head, pipe->tail);
	if (nr_slots < n) {
		ret = -EBUSY;
		goto out_revert_acct;
	}

	bufs = kcalloc(nr_slots, sizeof(*bufs),
		       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
	if (unlikely(!bufs)) {
		ret = -ENOMEM;
		goto out_revert_acct;
	}

	/*
	 * The pipe array wraps around, so just start the new one at zero
	 * and adjust the indices.
	 */
	if (n > 0) {
		unsigned int h = head & mask;
		unsigned int t = tail & mask;
		if (h > t) {
			memcpy(bufs, pipe->bufs + t,
			       n * sizeof(struct pipe_buffer));
		} else {
			unsigned int tsize = pipe->ring_size - t;
			if (h > 0)
				memcpy(bufs + tsize, pipe->bufs,
				       h * sizeof(struct pipe_buffer));
			memcpy(bufs, pipe->bufs + t,
			       tsize * sizeof(struct pipe_buffer));
		}
	}

	head = n;
	tail = 0;

	kfree(pipe->bufs);
	pipe->bufs = bufs;
	pipe->ring_size = nr_slots;
	pipe->max_usage = nr_slots;
	pipe->tail = tail;
	pipe->head = head;
	wake_up_interruptible_all(&pipe->rd_wait);
	wake_up_interruptible_all(&pipe->wr_wait);
	return pipe->max_usage * PAGE_SIZE;

out_revert_acct:
	(void) account_pipe_buffers(pipe->user, nr_slots, pipe->ring_size);
	return ret;
}

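/*
 * Pipe capacity is changed from userspace with fcntl(): pipe_fcntl() below
 * routes F_SETPIPE_SZ to pipe_set_size() and F_GETPIPE_SZ to a plain
 * max_usage read.  Rough userspace sketch (illustration only, pipefd[1] is
 * assumed to be a pipe file descriptor):
 *
 *	fcntl(pipefd[1], F_SETPIPE_SZ, 1048576);	// request 1 MiB
 *	long cur = fcntl(pipefd[1], F_GETPIPE_SZ);	// rounded-up capacity
 *
 * The request is rounded up to a power-of-two number of pages, and growing
 * beyond /proc/sys/fs/pipe-max-size without CAP_SYS_RESOURCE fails with
 * EPERM.
 */
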
/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
struct pipe_inode_info *get_pipe_info(struct file *file)
{
	return file->f_op == &pipefifo_fops ? file->private_data : NULL;
}

long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe;
	long ret;

	pipe = get_pipe_info(file);
	if (!pipe)
		return -EBADF;

	__pipe_lock(pipe);

	switch (cmd) {
	case F_SETPIPE_SZ:
		ret = pipe_set_size(pipe, arg);
		break;
	case F_GETPIPE_SZ:
		ret = pipe->max_usage * PAGE_SIZE;
		break;
	default:
		ret = -EINVAL;
		break;
	}

	__pipe_unlock(pipe);
	return ret;
}

static const struct super_operations pipefs_ops = {
	.destroy_inode = free_inode_nonrcu,
	.statfs = simple_statfs,
};

/*
 * pipefs should _never_ be mounted by userland - too much of a security
 * hassle, no real gain from having the whole whorehouse mounted. So we don't
 * need any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */

static int pipefs_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC);
	if (!ctx)
		return -ENOMEM;
	ctx->ops = &pipefs_ops;
	ctx->dops = &pipefs_dentry_operations;
	return 0;
}

static struct file_system_type pipe_fs_type = {
	.name		= "pipefs",
	.init_fs_context = pipefs_init_fs_context,
	.kill_sb	= kill_anon_super,
};

static int __init init_pipe_fs(void)
{
	int err = register_filesystem(&pipe_fs_type);

	if (!err) {
		pipe_mnt = kern_mount(&pipe_fs_type);
		if (IS_ERR(pipe_mnt)) {
			err = PTR_ERR(pipe_mnt);
			unregister_filesystem(&pipe_fs_type);
		}
	}
	return err;
}

fs_initcall(init_pipe_fs);