// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/pipe.c
 *
 * Copyright (C) 1991, 1992, 1999 Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <linux/memcontrol.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>

#include "internal.h"

/*
 * The max size that a non-root user is allowed to grow the pipe to. Can
 * be set by root in /proc/sys/fs/pipe-max-size
 */
unsigned int pipe_max_size = 1048576;

/* Maximum allocatable pages per user. Hard limit is unset by default, soft
 * matches default values.
 */
unsigned long pipe_user_pages_hard;
unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;

/*
 * We use head and tail indices that aren't masked off, except at the point of
 * dereference, but rather they're allowed to wrap naturally. This means there
 * isn't a dead spot in the buffer, but the ring has to be a power of two and
 * <= 2^31.
 * -- David Howells 2019-09-23.
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
	if (pipe->files)
		mutex_lock_nested(&pipe->mutex, subclass);
}

void pipe_lock(struct pipe_inode_info *pipe)
{
	/*
	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
	 */
	pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);

void pipe_unlock(struct pipe_inode_info *pipe)
{
	if (pipe->files)
		mutex_unlock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_unlock);

static inline void __pipe_lock(struct pipe_inode_info *pipe)
{
	mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
}

static inline void __pipe_unlock(struct pipe_inode_info *pipe)
{
	mutex_unlock(&pipe->mutex);
}

void pipe_double_lock(struct pipe_inode_info *pipe1,
		      struct pipe_inode_info *pipe2)
{
	BUG_ON(pipe1 == pipe2);

	if (pipe1 < pipe2) {
		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
	} else {
		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
	}
}

/* Drop the inode semaphore and wait for a pipe event, atomically */
void pipe_wait(struct pipe_inode_info *pipe)
{
	DEFINE_WAIT(rdwait);
	DEFINE_WAIT(wrwait);

	/*
	 * Pipes are system-local resources, so sleeping on them
	 * is considered a noninteractive wait:
	 */
	prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
	prepare_to_wait(&pipe->wr_wait, &wrwait, TASK_INTERRUPTIBLE);
	pipe_unlock(pipe);
	schedule();
	finish_wait(&pipe->rd_wait, &rdwait);
	finish_wait(&pipe->wr_wait, &wrwait);
	pipe_lock(pipe);
}
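
/*
 * For reference (illustrative sketch, not part of this file): the ring
 * helpers used throughout the code below are defined in
 * include/linux/pipe_fs_i.h. Roughly, as of this kernel version:
 *
 *	pipe_empty(head, tail)       => head == tail
 *	pipe_occupancy(head, tail)   => head - tail      (wraps naturally)
 *	pipe_full(head, tail, limit) => pipe_occupancy(head, tail) >= limit
 *
 * Because head and tail are free-running unsigned counters, the subtraction
 * stays correct across wrap as long as ring_size is a power of two and a
 * slot is always found with "index & (ring_size - 1)"; see the header for
 * the authoritative definitions.
 */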

static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * If nobody else uses this page, and we don't already have a
	 * temporary page, let's keep track of it as a one-deep
	 * allocation cache. (Otherwise just release our reference to it)
	 */
	if (page_count(page) == 1 && !pipe->tmp_page)
		pipe->tmp_page = page;
	else
		put_page(page);
}

static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
			       struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	if (page_count(page) == 1) {
		memcg_kmem_uncharge(page, 0);
		__SetPageLocked(page);
		return 0;
	}
	return 1;
}

/**
 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to attempt to steal
 *
 * Description:
 *	This function attempts to steal the &struct page attached to
 *	@buf. If successful, this function returns 0 and returns with
 *	the page locked. The caller may then reuse the page for whatever
 *	he wishes; the typical use is insertion into a different file
 *	page cache.
 */
int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * A reference of one is golden, that means that the owner of this
	 * page is the only one holding a reference to it. lock the page
	 * and return OK.
	 */
	if (page_count(page) == 1) {
		lock_page(page);
		return 0;
	}

	return 1;
}
EXPORT_SYMBOL(generic_pipe_buf_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to get a reference to
 *
 * Description:
 *	This function grabs an extra reference to @buf. It's used in
 *	the tee() system call, when we duplicate the buffers in one
 *	pipe into another.
 */
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	return try_get_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_confirm - verify contents of the pipe buffer
 * @info:	the pipe that the buffer belongs to
 * @buf:	the buffer to confirm
 *
 * Description:
 *	This function does nothing, because the generic pipe code uses
 *	pages that are always good when inserted into the pipe.
 */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
			     struct pipe_buffer *buf)
{
	return 0;
}
EXPORT_SYMBOL(generic_pipe_buf_confirm);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to put a reference to
 *
 * Description:
 *	This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	put_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);
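
/*
 * For reference (illustrative, see include/linux/pipe_fs_i.h for the real
 * definitions): the ops tables below are not called directly by this file.
 * The pipe_buf_confirm(), pipe_buf_release(), pipe_buf_get() and
 * pipe_buf_steal() helpers dispatch through buf->ops, roughly:
 *
 *	pipe_buf_confirm(pipe, buf)  => buf->ops->confirm(pipe, buf)
 *	pipe_buf_release(pipe, buf)  => clears buf->ops, then ops->release()
 */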

/* New data written to a pipe may be appended to a buffer with this type. */
static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = anon_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = {
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = anon_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static const struct pipe_buf_operations packet_pipe_buf_ops = {
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = anon_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

/**
 * pipe_buf_mark_unmergeable - mark a &struct pipe_buffer as unmergeable
 * @buf:	the buffer to mark
 *
 * Description:
 *	This function ensures that no future writes will be merged into the
 *	given &struct pipe_buffer. This is necessary when multiple pipe buffers
 *	share the same backing page.
 */
void pipe_buf_mark_unmergeable(struct pipe_buffer *buf)
{
	if (buf->ops == &anon_pipe_buf_ops)
		buf->ops = &anon_pipe_buf_nomerge_ops;
}

static bool pipe_buf_can_merge(struct pipe_buffer *buf)
{
	return buf->ops == &anon_pipe_buf_ops;
}

/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_readable(const struct pipe_inode_info *pipe)
{
	unsigned int head = READ_ONCE(pipe->head);
	unsigned int tail = READ_ONCE(pipe->tail);
	unsigned int writers = READ_ONCE(pipe->writers);

	return !pipe_empty(head, tail) || !writers;
}

static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
	size_t total_len = iov_iter_count(to);
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	bool was_full, wake_next_reader = false;
	ssize_t ret;

	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	ret = 0;
	__pipe_lock(pipe);

	/*
	 * We only wake up writers if the pipe was full when we started
	 * reading in order to avoid unnecessary wakeups.
	 *
	 * But when we do wake up writers, we do so using a sync wakeup
	 * (WF_SYNC), because we want them to get going and generate more
	 * data for us.
	 */
	was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
	for (;;) {
		unsigned int head = pipe->head;
		unsigned int tail = pipe->tail;
		unsigned int mask = pipe->ring_size - 1;

		if (!pipe_empty(head, tail)) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			size_t chars = buf->len;
			size_t written;
			int error;

			if (chars > total_len)
				chars = total_len;

			error = pipe_buf_confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			written = copy_page_to_iter(buf->page, buf->offset, chars, to);
			if (unlikely(written < chars)) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += chars;
			buf->offset += chars;
			buf->len -= chars;
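
			/*
			 * A partially consumed buffer is left at the tail;
			 * offset/len above track what is still unread.
			 */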

			/* Was it a packet buffer? Clean up and exit */
			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
				total_len = chars;
				buf->len = 0;
			}

			if (!buf->len) {
				pipe_buf_release(pipe, buf);
				spin_lock_irq(&pipe->rd_wait.lock);
				tail++;
				pipe->tail = tail;
				spin_unlock_irq(&pipe->rd_wait.lock);
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
			if (!pipe_empty(head, tail))	/* More to do? */
				continue;
		}

		if (!pipe->writers)
			break;
		if (ret)
			break;
		if (filp->f_flags & O_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		__pipe_unlock(pipe);

		/*
		 * We only get here if we didn't actually read anything.
		 *
		 * However, we could have seen (and removed) a zero-sized
		 * pipe buffer, and might have made space in the buffers
		 * that way.
		 *
		 * You can't make zero-sized pipe buffers by doing an empty
		 * write (not even in packet mode), but they can happen if
		 * the writer gets an EFAULT when trying to fill a buffer
		 * that already got allocated and inserted in the buffer
		 * array.
		 *
		 * So we still need to wake up any pending writers in the
		 * _very_ unlikely case that the pipe was full, but we got
		 * no data.
		 */
		if (unlikely(was_full)) {
			wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
		}

		/*
		 * But because we didn't read anything, at this point we can
		 * just return directly with -ERESTARTSYS if we're interrupted,
		 * since we've done any required wakeups and there's no need
		 * to mark anything accessed. And we've dropped the lock.
		 */
		if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
			return -ERESTARTSYS;

		__pipe_lock(pipe);
		was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
		wake_next_reader = true;
	}
	if (pipe_empty(pipe->head, pipe->tail))
		wake_next_reader = false;
	__pipe_unlock(pipe);

	if (was_full) {
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	if (wake_next_reader)
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
	if (ret > 0)
		file_accessed(filp);
	return ret;
}

static inline int is_packetized(struct file *file)
{
	return (file->f_flags & O_DIRECT) != 0;
}

/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_writable(const struct pipe_inode_info *pipe)
{
	unsigned int head = READ_ONCE(pipe->head);
	unsigned int tail = READ_ONCE(pipe->tail);
	unsigned int max_usage = READ_ONCE(pipe->max_usage);

	return !pipe_full(head, tail, max_usage) ||
		!READ_ONCE(pipe->readers);
}

static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int head;
	ssize_t ret = 0;
	size_t total_len = iov_iter_count(from);
	ssize_t chars;
	bool was_empty = false;
	bool wake_next_writer = false;

	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	__pipe_lock(pipe);

	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	/*
	 * Only wake up if the pipe started out empty, since
	 * otherwise there should be no readers waiting.
	 *
	 * If it wasn't empty we try to merge new data into
	 * the last buffer.
	 *
	 * That naturally merges small writes, but it also
	 * page-aligns the rest of the writes for large writes
	 * spanning multiple pages.
	 */
	head = pipe->head;
	was_empty = pipe_empty(head, pipe->tail);
	chars = total_len & (PAGE_SIZE-1);
	if (chars && !was_empty) {
		unsigned int mask = pipe->ring_size - 1;
		struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
		int offset = buf->offset + buf->len;

		if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) {
			ret = pipe_buf_confirm(pipe, buf);
			if (ret)
				goto out;

			ret = copy_page_from_iter(buf->page, offset, chars, from);
			if (unlikely(ret < chars)) {
				ret = -EFAULT;
				goto out;
			}

			buf->len += ret;
			if (!iov_iter_count(from))
				goto out;
		}
	}

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		head = pipe->head;
		if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
			unsigned int mask = pipe->ring_size - 1;
			struct pipe_buffer *buf = &pipe->bufs[head & mask];
			struct page *page = pipe->tmp_page;
			int copied;

			if (!page) {
				page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}

			/* Allocate a slot in the ring in advance and attach an
			 * empty buffer. If we fault or otherwise fail to use
			 * it, either the reader will consume it or it'll still
			 * be there for the next write.
			 */
			spin_lock_irq(&pipe->rd_wait.lock);

			head = pipe->head;
			if (pipe_full(head, pipe->tail, pipe->max_usage)) {
				spin_unlock_irq(&pipe->rd_wait.lock);
				continue;
			}

			pipe->head = head + 1;
			spin_unlock_irq(&pipe->rd_wait.lock);

			/* Insert it into the buffer array */
			buf = &pipe->bufs[head & mask];
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = 0;
			buf->flags = 0;
			if (is_packetized(filp)) {
				buf->ops = &packet_pipe_buf_ops;
				buf->flags = PIPE_BUF_FLAG_PACKET;
			}
			pipe->tmp_page = NULL;

			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += copied;
			buf->offset = 0;
			buf->len = copied;

			if (!iov_iter_count(from))
				break;
		}

		if (!pipe_full(head, pipe->tail, pipe->max_usage))
			continue;

		/* Wait for buffer space to become available. */
		if (filp->f_flags & O_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		/*
		 * We're going to release the pipe lock and wait for more
		 * space. We wake up any readers if necessary, and then
		 * after waiting we need to re-check whether the pipe
		 * became empty while we dropped the lock.
		 */
		__pipe_unlock(pipe);
		if (was_empty) {
			wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		}
		wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
		__pipe_lock(pipe);
		was_empty = pipe_empty(pipe->head, pipe->tail);
		wake_next_writer = true;
	}
out:
	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
		wake_next_writer = false;
	__pipe_unlock(pipe);

	/*
	 * If we do a wakeup event, we do a 'sync' wakeup, because we
	 * want the reader to start processing things asap, rather than
	 * leave the data pending.
	 *
	 * This is particularly important for small writes, because of
	 * how (for example) the GNU make jobserver uses small writes to
	 * wake up pending jobs.
	 */
	if (was_empty) {
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}
	if (wake_next_writer)
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
		int err = file_update_time(filp);
		if (err)
			ret = err;
		sb_end_write(file_inode(filp)->i_sb);
	}
	return ret;
}

static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int count, head, tail, mask;

	switch (cmd) {
	case FIONREAD:
		__pipe_lock(pipe);
		count = 0;
		head = pipe->head;
		tail = pipe->tail;
		mask = pipe->ring_size - 1;

		while (tail != head) {
			count += pipe->bufs[tail & mask].len;
			tail++;
		}
		__pipe_unlock(pipe);

		return put_user(count, (int __user *)arg);
	default:
		return -ENOIOCTLCMD;
	}
}

/* No kernel lock held - fine */
static __poll_t
pipe_poll(struct file *filp, poll_table *wait)
{
	__poll_t mask;
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int head, tail;

	/*
	 * Reading pipe state only -- no need for acquiring the semaphore.
	 *
	 * But because this is racy, the code has to add the
	 * entry to the poll table _first_ ..
	 */
	if (filp->f_mode & FMODE_READ)
		poll_wait(filp, &pipe->rd_wait, wait);
	if (filp->f_mode & FMODE_WRITE)
		poll_wait(filp, &pipe->wr_wait, wait);

	/*
	 * .. and only then can you do the racy tests. That way,
	 * if something changes and you got it wrong, the poll
	 * table entry will wake you up and fix it.
	 */
	head = READ_ONCE(pipe->head);
	tail = READ_ONCE(pipe->tail);

	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		if (!pipe_empty(head, tail))
			mask |= EPOLLIN | EPOLLRDNORM;
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= EPOLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		if (!pipe_full(head, tail, pipe->max_usage))
			mask |= EPOLLOUT | EPOLLWRNORM;
		/*
		 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= EPOLLERR;
	}

	return mask;
}

static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
{
	int kill = 0;

	spin_lock(&inode->i_lock);
	if (!--pipe->files) {
		inode->i_pipe = NULL;
		kill = 1;
	}
	spin_unlock(&inode->i_lock);

	if (kill)
		free_pipe_info(pipe);
}

static int
pipe_release(struct inode *inode, struct file *file)
{
	struct pipe_inode_info *pipe = file->private_data;

	__pipe_lock(pipe);
	if (file->f_mode & FMODE_READ)
		pipe->readers--;
	if (file->f_mode & FMODE_WRITE)
		pipe->writers--;

	/* Was that the last reader or writer, but not the other side? */
	if (!pipe->readers != !pipe->writers) {
		wake_up_interruptible_all(&pipe->rd_wait);
		wake_up_interruptible_all(&pipe->wr_wait);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return 0;
}

static int
pipe_fasync(int fd, struct file *filp, int on)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int retval = 0;

	__pipe_lock(pipe);
	if (filp->f_mode & FMODE_READ)
		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
		if (retval < 0 && (filp->f_mode & FMODE_READ))
			/* this can happen only if on == T */
			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
	}
	__pipe_unlock(pipe);
	return retval;
}

static unsigned long account_pipe_buffers(struct user_struct *user,
					  unsigned long old, unsigned long new)
{
	return atomic_long_add_return(new - old, &user->pipe_bufs);
}

static bool too_many_pipe_buffers_soft(unsigned long user_bufs)
{
	unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);

	return soft_limit && user_bufs > soft_limit;
}

static bool too_many_pipe_buffers_hard(unsigned long user_bufs)
{
	unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);

	return hard_limit && user_bufs > hard_limit;
}

static bool is_unprivileged_user(void)
{
	return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
}

struct pipe_inode_info *alloc_pipe_info(void)
{
	struct pipe_inode_info *pipe;
	unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
	struct user_struct *user = get_current_user();
	unsigned long user_bufs;
	unsigned int max_size = READ_ONCE(pipe_max_size);

	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
	if (pipe == NULL)
		goto out_free_uid;

	if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
		pipe_bufs = max_size >> PAGE_SHIFT;

	user_bufs = account_pipe_buffers(user, 0, pipe_bufs);

	if (too_many_pipe_buffers_soft(user_bufs) && is_unprivileged_user()) {
		user_bufs = account_pipe_buffers(user, pipe_bufs, 1);
		pipe_bufs = 1;
	}

	if (too_many_pipe_buffers_hard(user_bufs) && is_unprivileged_user())
		goto out_revert_acct;

	pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
			     GFP_KERNEL_ACCOUNT);

	if (pipe->bufs) {
		init_waitqueue_head(&pipe->rd_wait);
		init_waitqueue_head(&pipe->wr_wait);
		pipe->r_counter = pipe->w_counter = 1;
		pipe->max_usage = pipe_bufs;
		pipe->ring_size = pipe_bufs;
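		/*
		 * Note: pipe_bufs should still be a power of two here
		 * (PIPE_DEF_BUFFERS, the sysctl-rounded pipe_max_size, or 1),
		 * so the "index & (ring_size - 1)" masking in the I/O paths
		 * above remains valid.
		 */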
		pipe->user = user;
		mutex_init(&pipe->mutex);
		return pipe;
	}

out_revert_acct:
	(void) account_pipe_buffers(user, pipe_bufs, 0);
	kfree(pipe);
out_free_uid:
	free_uid(user);
	return NULL;
}

void free_pipe_info(struct pipe_inode_info *pipe)
{
	int i;

	(void) account_pipe_buffers(pipe->user, pipe->ring_size, 0);
	free_uid(pipe->user);
	for (i = 0; i < pipe->ring_size; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;
		if (buf->ops)
			pipe_buf_release(pipe, buf);
	}
	if (pipe->tmp_page)
		__free_page(pipe->tmp_page);
	kfree(pipe->bufs);
	kfree(pipe);
}

static struct vfsmount *pipe_mnt __read_mostly;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
				d_inode(dentry)->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname	= pipefs_dname,
};

static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	inode->i_ino = get_next_ino();

	pipe = alloc_pipe_info();
	if (!pipe)
		goto fail_iput;

	inode->i_pipe = pipe;
	pipe->files = 2;
	pipe->readers = pipe->writers = 1;
	inode->i_fop = &pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}

int create_pipe_files(struct file **res, int flags)
{
	struct inode *inode = get_pipe_inode();
	struct file *f;

	if (!inode)
		return -ENFILE;

	f = alloc_file_pseudo(inode, pipe_mnt, "",
				O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
				&pipefifo_fops);
	if (IS_ERR(f)) {
		free_pipe_info(inode->i_pipe);
		iput(inode);
		return PTR_ERR(f);
	}

	f->private_data = inode->i_pipe;

	res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
				  &pipefifo_fops);
	if (IS_ERR(res[0])) {
		put_pipe_info(inode, inode->i_pipe);
		fput(f);
		return PTR_ERR(res[0]);
	}
	res[0]->private_data = inode->i_pipe;
	res[1] = f;
	stream_open(inode, res[0]);
	stream_open(inode, res[1]);
	return 0;
}

static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
	int error;
	int fdw, fdr;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
		return -EINVAL;

	error = create_pipe_files(files, flags);
	if (error)
		return error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	fd[0] = fdr;
	fd[1] = fdw;
	return 0;

err_fdr:
	put_unused_fd(fdr);
err_read_pipe:
	fput(files[0]);
	fput(files[1]);
	return error;
}
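
/*
 * Both do_pipe_flags() (for in-kernel callers) and the pipe2()/pipe()
 * syscalls below are thin wrappers around __do_pipe_flags(). From
 * userspace the result is the familiar descriptor pair, e.g.
 * (illustrative userspace usage only, not kernel code):
 *
 *	int fds[2];
 *
 *	if (pipe2(fds, O_CLOEXEC | O_NONBLOCK) == 0) {
 *		// fds[0] is the read end, fds[1] is the write end
 *	}
 */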
int do_pipe_flags(int *fd, int flags)
{
	struct file *files[2];
	int error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		fd_install(fd[0], files[0]);
		fd_install(fd[1], files[1]);
	}
	return error;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
static int do_pipe2(int __user *fildes, int flags)
{
	struct file *files[2];
	int fd[2];
	int error;

	error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
			fput(files[0]);
			fput(files[1]);
			put_unused_fd(fd[0]);
			put_unused_fd(fd[1]);
			error = -EFAULT;
		} else {
			fd_install(fd[0], files[0]);
			fd_install(fd[1], files[1]);
		}
	}
	return error;
}

SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	return do_pipe2(fildes, flags);
}

SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return do_pipe2(fildes, 0);
}

static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
	int cur = *cnt;

	while (cur == *cnt) {
		pipe_wait(pipe);
		if (signal_pending(current))
			break;
	}
	return cur == *cnt ? -ERESTARTSYS : 0;
}

static void wake_up_partner(struct pipe_inode_info *pipe)
{
	wake_up_interruptible_all(&pipe->rd_wait);
	wake_up_interruptible_all(&pipe->wr_wait);
}

static int fifo_open(struct inode *inode, struct file *filp)
{
	struct pipe_inode_info *pipe;
	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
	int ret;

	filp->f_version = 0;

	spin_lock(&inode->i_lock);
	if (inode->i_pipe) {
		pipe = inode->i_pipe;
		pipe->files++;
		spin_unlock(&inode->i_lock);
	} else {
		spin_unlock(&inode->i_lock);
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;
		pipe->files = 1;
		spin_lock(&inode->i_lock);
		if (unlikely(inode->i_pipe)) {
			inode->i_pipe->files++;
			spin_unlock(&inode->i_lock);
			free_pipe_info(pipe);
			pipe = inode->i_pipe;
		} else {
			inode->i_pipe = pipe;
			spin_unlock(&inode->i_lock);
		}
	}
	filp->private_data = pipe;
	/* OK, we have a pipe and it's pinned down */

	__pipe_lock(pipe);

	/* We can only do regular read/write on fifos */
	stream_open(inode, filp);

	switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
	case FMODE_READ:
	/*
	 * O_RDONLY
	 * POSIX.1 says that O_NONBLOCK means return with the FIFO
	 * opened, even when there is no process writing the FIFO.
	 */
		pipe->r_counter++;
		if (pipe->readers++ == 0)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->writers) {
			if ((filp->f_flags & O_NONBLOCK)) {
				/* suppress EPOLLHUP until we have
				 * seen a writer */
				filp->f_version = pipe->w_counter;
			} else {
				if (wait_for_partner(pipe, &pipe->w_counter))
					goto err_rd;
			}
		}
		break;

	case FMODE_WRITE:
	/*
	 * O_WRONLY
	 * POSIX.1 says that O_NONBLOCK means return -1 with
	 * errno=ENXIO when there is no process reading the FIFO.
	 */
		ret = -ENXIO;
		if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
			goto err;

		pipe->w_counter++;
		if (!pipe->writers++)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->readers) {
			if (wait_for_partner(pipe, &pipe->r_counter))
				goto err_wr;
		}
		break;

	case FMODE_READ | FMODE_WRITE:
	/*
	 * O_RDWR
	 * POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
	 * This implementation will NEVER block on a O_RDWR open, since
	 * the process can at least talk to itself.
	 */

		pipe->readers++;
		pipe->writers++;
		pipe->r_counter++;
		pipe->w_counter++;
		if (pipe->readers == 1 || pipe->writers == 1)
			wake_up_partner(pipe);
		break;

	default:
		ret = -EINVAL;
		goto err;
	}

	/* Ok! */
	__pipe_unlock(pipe);
	return 0;

err_rd:
	if (!--pipe->readers)
		wake_up_interruptible(&pipe->wr_wait);
	ret = -ERESTARTSYS;
	goto err;

err_wr:
	if (!--pipe->writers)
		wake_up_interruptible_all(&pipe->rd_wait);
	ret = -ERESTARTSYS;
	goto err;

err:
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return ret;
}

const struct file_operations pipefifo_fops = {
	.open		= fifo_open,
	.llseek		= no_llseek,
	.read_iter	= pipe_read,
	.write_iter	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.release	= pipe_release,
	.fasync		= pipe_fasync,
};

/*
 * Currently we rely on the pipe array holding a power-of-2 number
 * of pages. Returns 0 on error.
 */
unsigned int round_pipe_size(unsigned long size)
{
	if (size > (1U << 31))
		return 0;

	/* Minimum pipe size, as required by POSIX */
	if (size < PAGE_SIZE)
		return PAGE_SIZE;

	return roundup_pow_of_two(size);
}

/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or -ERROR on error.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
{
	struct pipe_buffer *bufs;
	unsigned int size, nr_slots, head, tail, mask, n;
	unsigned long user_bufs;
	long ret = 0;

	size = round_pipe_size(arg);
	nr_slots = size >> PAGE_SHIFT;

	if (!nr_slots)
		return -EINVAL;

	/*
	 * If trying to increase the pipe capacity, check that an
	 * unprivileged user is not trying to exceed various limits
	 * (soft limit check here, hard limit check just below).
	 * Decreasing the pipe capacity is always permitted, even
	 * if the user is currently over a limit.
	 */
	if (nr_slots > pipe->ring_size &&
			size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
		return -EPERM;

	user_bufs = account_pipe_buffers(pipe->user, pipe->ring_size, nr_slots);

	if (nr_slots > pipe->ring_size &&
			(too_many_pipe_buffers_hard(user_bufs) ||
			 too_many_pipe_buffers_soft(user_bufs)) &&
			is_unprivileged_user()) {
		ret = -EPERM;
		goto out_revert_acct;
	}

	/*
	 * We can shrink the pipe, if arg is greater than the ring occupancy.
	 * Since we don't expect a lot of shrink+grow operations, just free and
	 * allocate again like we would do for growing. If the pipe currently
	 * contains more buffers than arg, then return busy.
	 */
	mask = pipe->ring_size - 1;
	head = pipe->head;
	tail = pipe->tail;
	n = pipe_occupancy(pipe->head, pipe->tail);
	if (nr_slots < n) {
		ret = -EBUSY;
		goto out_revert_acct;
	}

	bufs = kcalloc(nr_slots, sizeof(*bufs),
		       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
	if (unlikely(!bufs)) {
		ret = -ENOMEM;
		goto out_revert_acct;
	}

	/*
	 * The pipe array wraps around, so just start the new one at zero
	 * and adjust the indices.
	 */
	if (n > 0) {
		unsigned int h = head & mask;
		unsigned int t = tail & mask;
		if (h > t) {
			memcpy(bufs, pipe->bufs + t,
			       n * sizeof(struct pipe_buffer));
		} else {
			unsigned int tsize = pipe->ring_size - t;
			if (h > 0)
				memcpy(bufs + tsize, pipe->bufs,
				       h * sizeof(struct pipe_buffer));
			memcpy(bufs, pipe->bufs + t,
			       tsize * sizeof(struct pipe_buffer));
		}
	}

	head = n;
	tail = 0;

	kfree(pipe->bufs);
	pipe->bufs = bufs;
	pipe->ring_size = nr_slots;
	pipe->max_usage = nr_slots;
	pipe->tail = tail;
	pipe->head = head;

	/* This might have made more room for writers */
	wake_up_interruptible(&pipe->wr_wait);
	return pipe->max_usage * PAGE_SIZE;

out_revert_acct:
	(void) account_pipe_buffers(pipe->user, nr_slots, pipe->ring_size);
	return ret;
}

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
struct pipe_inode_info *get_pipe_info(struct file *file)
{
	return file->f_op == &pipefifo_fops ? file->private_data : NULL;
}

long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe;
	long ret;

	pipe = get_pipe_info(file);
	if (!pipe)
		return -EBADF;

	__pipe_lock(pipe);

	switch (cmd) {
	case F_SETPIPE_SZ:
		ret = pipe_set_size(pipe, arg);
		break;
	case F_GETPIPE_SZ:
		ret = pipe->max_usage * PAGE_SIZE;
		break;
	default:
		ret = -EINVAL;
		break;
	}

	__pipe_unlock(pipe);
	return ret;
}

static const struct super_operations pipefs_ops = {
	.destroy_inode = free_inode_nonrcu,
	.statfs = simple_statfs,
};

/*
 * pipefs should _never_ be mounted by userland - too much of security hassle,
 * no real gain from having the whole whorehouse mounted. So we don't need
 * any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */

static int pipefs_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC);
	if (!ctx)
		return -ENOMEM;
	ctx->ops = &pipefs_ops;
	ctx->dops = &pipefs_dentry_operations;
	return 0;
}

static struct file_system_type pipe_fs_type = {
	.name		= "pipefs",
	.init_fs_context = pipefs_init_fs_context,
	.kill_sb	= kill_anon_super,
};

static int __init init_pipe_fs(void)
{
	int err = register_filesystem(&pipe_fs_type);

	if (!err) {
		pipe_mnt = kern_mount(&pipe_fs_type);
		if (IS_ERR(pipe_mnt)) {
			err = PTR_ERR(pipe_mnt);
			unregister_filesystem(&pipe_fs_type);
		}
	}
	return err;
}

fs_initcall(init_pipe_fs);
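
/*
 * For reference, pipe_fcntl() above backs the F_SETPIPE_SZ and F_GETPIPE_SZ
 * fcntl()s. A typical use from userspace looks roughly like this
 * (illustrative only, not kernel code):
 *
 *	long sz = fcntl(fd, F_SETPIPE_SZ, 1 << 20);	// request 1 MiB
 *	// sz is the granted capacity, rounded up to a power-of-two
 *	// number of pages by round_pipe_size()
 *	long cur = fcntl(fd, F_GETPIPE_SZ, 0);
 *
 * Growing beyond /proc/sys/fs/pipe-max-size needs CAP_SYS_RESOURCE, and
 * shrinking below the current ring occupancy fails with EBUSY (see
 * pipe_set_size()).
 */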