1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * linux/fs/pipe.c 4 * 5 * Copyright (C) 1991, 1992, 1999 Linus Torvalds 6 */ 7 8 #include <linux/mm.h> 9 #include <linux/file.h> 10 #include <linux/poll.h> 11 #include <linux/slab.h> 12 #include <linux/module.h> 13 #include <linux/init.h> 14 #include <linux/fs.h> 15 #include <linux/log2.h> 16 #include <linux/mount.h> 17 #include <linux/pseudo_fs.h> 18 #include <linux/magic.h> 19 #include <linux/pipe_fs_i.h> 20 #include <linux/uio.h> 21 #include <linux/highmem.h> 22 #include <linux/pagemap.h> 23 #include <linux/audit.h> 24 #include <linux/syscalls.h> 25 #include <linux/fcntl.h> 26 #include <linux/memcontrol.h> 27 28 #include <linux/uaccess.h> 29 #include <asm/ioctls.h> 30 31 #include "internal.h" 32 33 /* 34 * The max size that a non-root user is allowed to grow the pipe. Can 35 * be set by root in /proc/sys/fs/pipe-max-size 36 */ 37 unsigned int pipe_max_size = 1048576; 38 39 /* Maximum allocatable pages per user. Hard limit is unset by default, soft 40 * matches default values. 41 */ 42 unsigned long pipe_user_pages_hard; 43 unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; 44 45 /* 46 * We use a start+len construction, which provides full use of the 47 * allocated memory. 48 * -- Florian Coosmann (FGC) 49 * 50 * Reads with count = 0 should always return 0. 51 * -- Julian Bradfield 1999-06-07. 52 * 53 * FIFOs and Pipes now generate SIGIO for both readers and writers. 54 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16 55 * 56 * pipe_read & write cleanup 57 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 58 */ 59 60 static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) 61 { 62 if (pipe->files) 63 mutex_lock_nested(&pipe->mutex, subclass); 64 } 65 66 void pipe_lock(struct pipe_inode_info *pipe) 67 { 68 /* 69 * pipe_lock() nests non-pipe inode locks (for writing to a file) 70 */ 71 pipe_lock_nested(pipe, I_MUTEX_PARENT); 72 } 73 EXPORT_SYMBOL(pipe_lock); 74 75 void pipe_unlock(struct pipe_inode_info *pipe) 76 { 77 if (pipe->files) 78 mutex_unlock(&pipe->mutex); 79 } 80 EXPORT_SYMBOL(pipe_unlock); 81 82 static inline void __pipe_lock(struct pipe_inode_info *pipe) 83 { 84 mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT); 85 } 86 87 static inline void __pipe_unlock(struct pipe_inode_info *pipe) 88 { 89 mutex_unlock(&pipe->mutex); 90 } 91 92 void pipe_double_lock(struct pipe_inode_info *pipe1, 93 struct pipe_inode_info *pipe2) 94 { 95 BUG_ON(pipe1 == pipe2); 96 97 if (pipe1 < pipe2) { 98 pipe_lock_nested(pipe1, I_MUTEX_PARENT); 99 pipe_lock_nested(pipe2, I_MUTEX_CHILD); 100 } else { 101 pipe_lock_nested(pipe2, I_MUTEX_PARENT); 102 pipe_lock_nested(pipe1, I_MUTEX_CHILD); 103 } 104 } 105 106 /* Drop the inode semaphore and wait for a pipe event, atomically */ 107 void pipe_wait(struct pipe_inode_info *pipe) 108 { 109 DEFINE_WAIT(wait); 110 111 /* 112 * Pipes are system-local resources, so sleeping on them 113 * is considered a noninteractive wait: 114 */ 115 prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); 116 pipe_unlock(pipe); 117 schedule(); 118 finish_wait(&pipe->wait, &wait); 119 pipe_lock(pipe); 120 } 121 122 static void anon_pipe_buf_release(struct pipe_inode_info *pipe, 123 struct pipe_buffer *buf) 124 { 125 struct page *page = buf->page; 126 127 /* 128 * If nobody else uses this page, and we don't already have a 129 * temporary page, let's keep track of it as a one-deep 130 * allocation cache. (Otherwise just release our reference to it) 131 */ 132 if (page_count(page) == 1 && !pipe->tmp_page) 133 pipe->tmp_page = page; 134 else 135 put_page(page); 136 } 137 138 static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, 139 struct pipe_buffer *buf) 140 { 141 struct page *page = buf->page; 142 143 if (page_count(page) == 1) { 144 memcg_kmem_uncharge(page, 0); 145 __SetPageLocked(page); 146 return 0; 147 } 148 return 1; 149 } 150 151 /** 152 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer 153 * @pipe: the pipe that the buffer belongs to 154 * @buf: the buffer to attempt to steal 155 * 156 * Description: 157 * This function attempts to steal the &struct page attached to 158 * @buf. If successful, this function returns 0 and returns with 159 * the page locked. The caller may then reuse the page for whatever 160 * he wishes; the typical use is insertion into a different file 161 * page cache. 162 */ 163 int generic_pipe_buf_steal(struct pipe_inode_info *pipe, 164 struct pipe_buffer *buf) 165 { 166 struct page *page = buf->page; 167 168 /* 169 * A reference of one is golden, that means that the owner of this 170 * page is the only one holding a reference to it. lock the page 171 * and return OK. 172 */ 173 if (page_count(page) == 1) { 174 lock_page(page); 175 return 0; 176 } 177 178 return 1; 179 } 180 EXPORT_SYMBOL(generic_pipe_buf_steal); 181 182 /** 183 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer 184 * @pipe: the pipe that the buffer belongs to 185 * @buf: the buffer to get a reference to 186 * 187 * Description: 188 * This function grabs an extra reference to @buf. It's used in 189 * in the tee() system call, when we duplicate the buffers in one 190 * pipe into another. 191 */ 192 bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) 193 { 194 return try_get_page(buf->page); 195 } 196 EXPORT_SYMBOL(generic_pipe_buf_get); 197 198 /** 199 * generic_pipe_buf_confirm - verify contents of the pipe buffer 200 * @info: the pipe that the buffer belongs to 201 * @buf: the buffer to confirm 202 * 203 * Description: 204 * This function does nothing, because the generic pipe code uses 205 * pages that are always good when inserted into the pipe. 206 */ 207 int generic_pipe_buf_confirm(struct pipe_inode_info *info, 208 struct pipe_buffer *buf) 209 { 210 return 0; 211 } 212 EXPORT_SYMBOL(generic_pipe_buf_confirm); 213 214 /** 215 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer 216 * @pipe: the pipe that the buffer belongs to 217 * @buf: the buffer to put a reference to 218 * 219 * Description: 220 * This function releases a reference to @buf. 221 */ 222 void generic_pipe_buf_release(struct pipe_inode_info *pipe, 223 struct pipe_buffer *buf) 224 { 225 put_page(buf->page); 226 } 227 EXPORT_SYMBOL(generic_pipe_buf_release); 228 229 /* New data written to a pipe may be appended to a buffer with this type. */ 230 static const struct pipe_buf_operations anon_pipe_buf_ops = { 231 .confirm = generic_pipe_buf_confirm, 232 .release = anon_pipe_buf_release, 233 .steal = anon_pipe_buf_steal, 234 .get = generic_pipe_buf_get, 235 }; 236 237 static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = { 238 .confirm = generic_pipe_buf_confirm, 239 .release = anon_pipe_buf_release, 240 .steal = anon_pipe_buf_steal, 241 .get = generic_pipe_buf_get, 242 }; 243 244 static const struct pipe_buf_operations packet_pipe_buf_ops = { 245 .confirm = generic_pipe_buf_confirm, 246 .release = anon_pipe_buf_release, 247 .steal = anon_pipe_buf_steal, 248 .get = generic_pipe_buf_get, 249 }; 250 251 /** 252 * pipe_buf_mark_unmergeable - mark a &struct pipe_buffer as unmergeable 253 * @buf: the buffer to mark 254 * 255 * Description: 256 * This function ensures that no future writes will be merged into the 257 * given &struct pipe_buffer. This is necessary when multiple pipe buffers 258 * share the same backing page. 259 */ 260 void pipe_buf_mark_unmergeable(struct pipe_buffer *buf) 261 { 262 if (buf->ops == &anon_pipe_buf_ops) 263 buf->ops = &anon_pipe_buf_nomerge_ops; 264 } 265 266 static bool pipe_buf_can_merge(struct pipe_buffer *buf) 267 { 268 return buf->ops == &anon_pipe_buf_ops; 269 } 270 271 static ssize_t 272 pipe_read(struct kiocb *iocb, struct iov_iter *to) 273 { 274 size_t total_len = iov_iter_count(to); 275 struct file *filp = iocb->ki_filp; 276 struct pipe_inode_info *pipe = filp->private_data; 277 int do_wakeup; 278 ssize_t ret; 279 280 /* Null read succeeds. */ 281 if (unlikely(total_len == 0)) 282 return 0; 283 284 do_wakeup = 0; 285 ret = 0; 286 __pipe_lock(pipe); 287 for (;;) { 288 int bufs = pipe->nrbufs; 289 if (bufs) { 290 int curbuf = pipe->curbuf; 291 struct pipe_buffer *buf = pipe->bufs + curbuf; 292 size_t chars = buf->len; 293 size_t written; 294 int error; 295 296 if (chars > total_len) 297 chars = total_len; 298 299 error = pipe_buf_confirm(pipe, buf); 300 if (error) { 301 if (!ret) 302 ret = error; 303 break; 304 } 305 306 written = copy_page_to_iter(buf->page, buf->offset, chars, to); 307 if (unlikely(written < chars)) { 308 if (!ret) 309 ret = -EFAULT; 310 break; 311 } 312 ret += chars; 313 buf->offset += chars; 314 buf->len -= chars; 315 316 /* Was it a packet buffer? Clean up and exit */ 317 if (buf->flags & PIPE_BUF_FLAG_PACKET) { 318 total_len = chars; 319 buf->len = 0; 320 } 321 322 if (!buf->len) { 323 pipe_buf_release(pipe, buf); 324 curbuf = (curbuf + 1) & (pipe->buffers - 1); 325 pipe->curbuf = curbuf; 326 pipe->nrbufs = --bufs; 327 do_wakeup = 1; 328 } 329 total_len -= chars; 330 if (!total_len) 331 break; /* common path: read succeeded */ 332 } 333 if (bufs) /* More to do? */ 334 continue; 335 if (!pipe->writers) 336 break; 337 if (!pipe->waiting_writers) { 338 /* syscall merging: Usually we must not sleep 339 * if O_NONBLOCK is set, or if we got some data. 340 * But if a writer sleeps in kernel space, then 341 * we can wait for that data without violating POSIX. 342 */ 343 if (ret) 344 break; 345 if (filp->f_flags & O_NONBLOCK) { 346 ret = -EAGAIN; 347 break; 348 } 349 } 350 if (signal_pending(current)) { 351 if (!ret) 352 ret = -ERESTARTSYS; 353 break; 354 } 355 if (do_wakeup) { 356 wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM); 357 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 358 } 359 pipe_wait(pipe); 360 } 361 __pipe_unlock(pipe); 362 363 /* Signal writers asynchronously that there is more room. */ 364 if (do_wakeup) { 365 wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM); 366 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 367 } 368 if (ret > 0) 369 file_accessed(filp); 370 return ret; 371 } 372 373 static inline int is_packetized(struct file *file) 374 { 375 return (file->f_flags & O_DIRECT) != 0; 376 } 377 378 static ssize_t 379 pipe_write(struct kiocb *iocb, struct iov_iter *from) 380 { 381 struct file *filp = iocb->ki_filp; 382 struct pipe_inode_info *pipe = filp->private_data; 383 ssize_t ret = 0; 384 int do_wakeup = 0; 385 size_t total_len = iov_iter_count(from); 386 ssize_t chars; 387 388 /* Null write succeeds. */ 389 if (unlikely(total_len == 0)) 390 return 0; 391 392 __pipe_lock(pipe); 393 394 if (!pipe->readers) { 395 send_sig(SIGPIPE, current, 0); 396 ret = -EPIPE; 397 goto out; 398 } 399 400 /* We try to merge small writes */ 401 chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ 402 if (pipe->nrbufs && chars != 0) { 403 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & 404 (pipe->buffers - 1); 405 struct pipe_buffer *buf = pipe->bufs + lastbuf; 406 int offset = buf->offset + buf->len; 407 408 if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) { 409 ret = pipe_buf_confirm(pipe, buf); 410 if (ret) 411 goto out; 412 413 ret = copy_page_from_iter(buf->page, offset, chars, from); 414 if (unlikely(ret < chars)) { 415 ret = -EFAULT; 416 goto out; 417 } 418 do_wakeup = 1; 419 buf->len += ret; 420 if (!iov_iter_count(from)) 421 goto out; 422 } 423 } 424 425 for (;;) { 426 int bufs; 427 428 if (!pipe->readers) { 429 send_sig(SIGPIPE, current, 0); 430 if (!ret) 431 ret = -EPIPE; 432 break; 433 } 434 bufs = pipe->nrbufs; 435 if (bufs < pipe->buffers) { 436 int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1); 437 struct pipe_buffer *buf = pipe->bufs + newbuf; 438 struct page *page = pipe->tmp_page; 439 int copied; 440 441 if (!page) { 442 page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); 443 if (unlikely(!page)) { 444 ret = ret ? : -ENOMEM; 445 break; 446 } 447 pipe->tmp_page = page; 448 } 449 /* Always wake up, even if the copy fails. Otherwise 450 * we lock up (O_NONBLOCK-)readers that sleep due to 451 * syscall merging. 452 * FIXME! Is this really true? 453 */ 454 do_wakeup = 1; 455 copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); 456 if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { 457 if (!ret) 458 ret = -EFAULT; 459 break; 460 } 461 ret += copied; 462 463 /* Insert it into the buffer array */ 464 buf->page = page; 465 buf->ops = &anon_pipe_buf_ops; 466 buf->offset = 0; 467 buf->len = copied; 468 buf->flags = 0; 469 if (is_packetized(filp)) { 470 buf->ops = &packet_pipe_buf_ops; 471 buf->flags = PIPE_BUF_FLAG_PACKET; 472 } 473 pipe->nrbufs = ++bufs; 474 pipe->tmp_page = NULL; 475 476 if (!iov_iter_count(from)) 477 break; 478 } 479 if (bufs < pipe->buffers) 480 continue; 481 if (filp->f_flags & O_NONBLOCK) { 482 if (!ret) 483 ret = -EAGAIN; 484 break; 485 } 486 if (signal_pending(current)) { 487 if (!ret) 488 ret = -ERESTARTSYS; 489 break; 490 } 491 if (do_wakeup) { 492 wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM); 493 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 494 do_wakeup = 0; 495 } 496 pipe->waiting_writers++; 497 pipe_wait(pipe); 498 pipe->waiting_writers--; 499 } 500 out: 501 __pipe_unlock(pipe); 502 if (do_wakeup) { 503 wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM); 504 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 505 } 506 if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { 507 int err = file_update_time(filp); 508 if (err) 509 ret = err; 510 sb_end_write(file_inode(filp)->i_sb); 511 } 512 return ret; 513 } 514 515 static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 516 { 517 struct pipe_inode_info *pipe = filp->private_data; 518 int count, buf, nrbufs; 519 520 switch (cmd) { 521 case FIONREAD: 522 __pipe_lock(pipe); 523 count = 0; 524 buf = pipe->curbuf; 525 nrbufs = pipe->nrbufs; 526 while (--nrbufs >= 0) { 527 count += pipe->bufs[buf].len; 528 buf = (buf+1) & (pipe->buffers - 1); 529 } 530 __pipe_unlock(pipe); 531 532 return put_user(count, (int __user *)arg); 533 default: 534 return -ENOIOCTLCMD; 535 } 536 } 537 538 /* No kernel lock held - fine */ 539 static __poll_t 540 pipe_poll(struct file *filp, poll_table *wait) 541 { 542 __poll_t mask; 543 struct pipe_inode_info *pipe = filp->private_data; 544 int nrbufs; 545 546 poll_wait(filp, &pipe->wait, wait); 547 548 /* Reading only -- no need for acquiring the semaphore. */ 549 nrbufs = pipe->nrbufs; 550 mask = 0; 551 if (filp->f_mode & FMODE_READ) { 552 mask = (nrbufs > 0) ? EPOLLIN | EPOLLRDNORM : 0; 553 if (!pipe->writers && filp->f_version != pipe->w_counter) 554 mask |= EPOLLHUP; 555 } 556 557 if (filp->f_mode & FMODE_WRITE) { 558 mask |= (nrbufs < pipe->buffers) ? EPOLLOUT | EPOLLWRNORM : 0; 559 /* 560 * Most Unices do not set EPOLLERR for FIFOs but on Linux they 561 * behave exactly like pipes for poll(). 562 */ 563 if (!pipe->readers) 564 mask |= EPOLLERR; 565 } 566 567 return mask; 568 } 569 570 static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe) 571 { 572 int kill = 0; 573 574 spin_lock(&inode->i_lock); 575 if (!--pipe->files) { 576 inode->i_pipe = NULL; 577 kill = 1; 578 } 579 spin_unlock(&inode->i_lock); 580 581 if (kill) 582 free_pipe_info(pipe); 583 } 584 585 static int 586 pipe_release(struct inode *inode, struct file *file) 587 { 588 struct pipe_inode_info *pipe = file->private_data; 589 590 __pipe_lock(pipe); 591 if (file->f_mode & FMODE_READ) 592 pipe->readers--; 593 if (file->f_mode & FMODE_WRITE) 594 pipe->writers--; 595 596 if (pipe->readers || pipe->writers) { 597 wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM | EPOLLERR | EPOLLHUP); 598 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 599 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 600 } 601 __pipe_unlock(pipe); 602 603 put_pipe_info(inode, pipe); 604 return 0; 605 } 606 607 static int 608 pipe_fasync(int fd, struct file *filp, int on) 609 { 610 struct pipe_inode_info *pipe = filp->private_data; 611 int retval = 0; 612 613 __pipe_lock(pipe); 614 if (filp->f_mode & FMODE_READ) 615 retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); 616 if ((filp->f_mode & FMODE_WRITE) && retval >= 0) { 617 retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); 618 if (retval < 0 && (filp->f_mode & FMODE_READ)) 619 /* this can happen only if on == T */ 620 fasync_helper(-1, filp, 0, &pipe->fasync_readers); 621 } 622 __pipe_unlock(pipe); 623 return retval; 624 } 625 626 static unsigned long account_pipe_buffers(struct user_struct *user, 627 unsigned long old, unsigned long new) 628 { 629 return atomic_long_add_return(new - old, &user->pipe_bufs); 630 } 631 632 static bool too_many_pipe_buffers_soft(unsigned long user_bufs) 633 { 634 unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft); 635 636 return soft_limit && user_bufs > soft_limit; 637 } 638 639 static bool too_many_pipe_buffers_hard(unsigned long user_bufs) 640 { 641 unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard); 642 643 return hard_limit && user_bufs > hard_limit; 644 } 645 646 static bool is_unprivileged_user(void) 647 { 648 return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); 649 } 650 651 struct pipe_inode_info *alloc_pipe_info(void) 652 { 653 struct pipe_inode_info *pipe; 654 unsigned long pipe_bufs = PIPE_DEF_BUFFERS; 655 struct user_struct *user = get_current_user(); 656 unsigned long user_bufs; 657 unsigned int max_size = READ_ONCE(pipe_max_size); 658 659 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT); 660 if (pipe == NULL) 661 goto out_free_uid; 662 663 if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE)) 664 pipe_bufs = max_size >> PAGE_SHIFT; 665 666 user_bufs = account_pipe_buffers(user, 0, pipe_bufs); 667 668 if (too_many_pipe_buffers_soft(user_bufs) && is_unprivileged_user()) { 669 user_bufs = account_pipe_buffers(user, pipe_bufs, 1); 670 pipe_bufs = 1; 671 } 672 673 if (too_many_pipe_buffers_hard(user_bufs) && is_unprivileged_user()) 674 goto out_revert_acct; 675 676 pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer), 677 GFP_KERNEL_ACCOUNT); 678 679 if (pipe->bufs) { 680 init_waitqueue_head(&pipe->wait); 681 pipe->r_counter = pipe->w_counter = 1; 682 pipe->buffers = pipe_bufs; 683 pipe->user = user; 684 mutex_init(&pipe->mutex); 685 return pipe; 686 } 687 688 out_revert_acct: 689 (void) account_pipe_buffers(user, pipe_bufs, 0); 690 kfree(pipe); 691 out_free_uid: 692 free_uid(user); 693 return NULL; 694 } 695 696 void free_pipe_info(struct pipe_inode_info *pipe) 697 { 698 int i; 699 700 (void) account_pipe_buffers(pipe->user, pipe->buffers, 0); 701 free_uid(pipe->user); 702 for (i = 0; i < pipe->buffers; i++) { 703 struct pipe_buffer *buf = pipe->bufs + i; 704 if (buf->ops) 705 pipe_buf_release(pipe, buf); 706 } 707 if (pipe->tmp_page) 708 __free_page(pipe->tmp_page); 709 kfree(pipe->bufs); 710 kfree(pipe); 711 } 712 713 static struct vfsmount *pipe_mnt __read_mostly; 714 715 /* 716 * pipefs_dname() is called from d_path(). 717 */ 718 static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) 719 { 720 return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", 721 d_inode(dentry)->i_ino); 722 } 723 724 static const struct dentry_operations pipefs_dentry_operations = { 725 .d_dname = pipefs_dname, 726 }; 727 728 static struct inode * get_pipe_inode(void) 729 { 730 struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb); 731 struct pipe_inode_info *pipe; 732 733 if (!inode) 734 goto fail_inode; 735 736 inode->i_ino = get_next_ino(); 737 738 pipe = alloc_pipe_info(); 739 if (!pipe) 740 goto fail_iput; 741 742 inode->i_pipe = pipe; 743 pipe->files = 2; 744 pipe->readers = pipe->writers = 1; 745 inode->i_fop = &pipefifo_fops; 746 747 /* 748 * Mark the inode dirty from the very beginning, 749 * that way it will never be moved to the dirty 750 * list because "mark_inode_dirty()" will think 751 * that it already _is_ on the dirty list. 752 */ 753 inode->i_state = I_DIRTY; 754 inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; 755 inode->i_uid = current_fsuid(); 756 inode->i_gid = current_fsgid(); 757 inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); 758 759 return inode; 760 761 fail_iput: 762 iput(inode); 763 764 fail_inode: 765 return NULL; 766 } 767 768 int create_pipe_files(struct file **res, int flags) 769 { 770 struct inode *inode = get_pipe_inode(); 771 struct file *f; 772 773 if (!inode) 774 return -ENFILE; 775 776 f = alloc_file_pseudo(inode, pipe_mnt, "", 777 O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)), 778 &pipefifo_fops); 779 if (IS_ERR(f)) { 780 free_pipe_info(inode->i_pipe); 781 iput(inode); 782 return PTR_ERR(f); 783 } 784 785 f->private_data = inode->i_pipe; 786 787 res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK), 788 &pipefifo_fops); 789 if (IS_ERR(res[0])) { 790 put_pipe_info(inode, inode->i_pipe); 791 fput(f); 792 return PTR_ERR(res[0]); 793 } 794 res[0]->private_data = inode->i_pipe; 795 res[1] = f; 796 return 0; 797 } 798 799 static int __do_pipe_flags(int *fd, struct file **files, int flags) 800 { 801 int error; 802 int fdw, fdr; 803 804 if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT)) 805 return -EINVAL; 806 807 error = create_pipe_files(files, flags); 808 if (error) 809 return error; 810 811 error = get_unused_fd_flags(flags); 812 if (error < 0) 813 goto err_read_pipe; 814 fdr = error; 815 816 error = get_unused_fd_flags(flags); 817 if (error < 0) 818 goto err_fdr; 819 fdw = error; 820 821 audit_fd_pair(fdr, fdw); 822 fd[0] = fdr; 823 fd[1] = fdw; 824 return 0; 825 826 err_fdr: 827 put_unused_fd(fdr); 828 err_read_pipe: 829 fput(files[0]); 830 fput(files[1]); 831 return error; 832 } 833 834 int do_pipe_flags(int *fd, int flags) 835 { 836 struct file *files[2]; 837 int error = __do_pipe_flags(fd, files, flags); 838 if (!error) { 839 fd_install(fd[0], files[0]); 840 fd_install(fd[1], files[1]); 841 } 842 return error; 843 } 844 845 /* 846 * sys_pipe() is the normal C calling standard for creating 847 * a pipe. It's not the way Unix traditionally does this, though. 848 */ 849 static int do_pipe2(int __user *fildes, int flags) 850 { 851 struct file *files[2]; 852 int fd[2]; 853 int error; 854 855 error = __do_pipe_flags(fd, files, flags); 856 if (!error) { 857 if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) { 858 fput(files[0]); 859 fput(files[1]); 860 put_unused_fd(fd[0]); 861 put_unused_fd(fd[1]); 862 error = -EFAULT; 863 } else { 864 fd_install(fd[0], files[0]); 865 fd_install(fd[1], files[1]); 866 } 867 } 868 return error; 869 } 870 871 SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags) 872 { 873 return do_pipe2(fildes, flags); 874 } 875 876 SYSCALL_DEFINE1(pipe, int __user *, fildes) 877 { 878 return do_pipe2(fildes, 0); 879 } 880 881 static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) 882 { 883 int cur = *cnt; 884 885 while (cur == *cnt) { 886 pipe_wait(pipe); 887 if (signal_pending(current)) 888 break; 889 } 890 return cur == *cnt ? -ERESTARTSYS : 0; 891 } 892 893 static void wake_up_partner(struct pipe_inode_info *pipe) 894 { 895 wake_up_interruptible(&pipe->wait); 896 } 897 898 static int fifo_open(struct inode *inode, struct file *filp) 899 { 900 struct pipe_inode_info *pipe; 901 bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; 902 int ret; 903 904 filp->f_version = 0; 905 906 spin_lock(&inode->i_lock); 907 if (inode->i_pipe) { 908 pipe = inode->i_pipe; 909 pipe->files++; 910 spin_unlock(&inode->i_lock); 911 } else { 912 spin_unlock(&inode->i_lock); 913 pipe = alloc_pipe_info(); 914 if (!pipe) 915 return -ENOMEM; 916 pipe->files = 1; 917 spin_lock(&inode->i_lock); 918 if (unlikely(inode->i_pipe)) { 919 inode->i_pipe->files++; 920 spin_unlock(&inode->i_lock); 921 free_pipe_info(pipe); 922 pipe = inode->i_pipe; 923 } else { 924 inode->i_pipe = pipe; 925 spin_unlock(&inode->i_lock); 926 } 927 } 928 filp->private_data = pipe; 929 /* OK, we have a pipe and it's pinned down */ 930 931 __pipe_lock(pipe); 932 933 /* We can only do regular read/write on fifos */ 934 filp->f_mode &= (FMODE_READ | FMODE_WRITE); 935 936 switch (filp->f_mode) { 937 case FMODE_READ: 938 /* 939 * O_RDONLY 940 * POSIX.1 says that O_NONBLOCK means return with the FIFO 941 * opened, even when there is no process writing the FIFO. 942 */ 943 pipe->r_counter++; 944 if (pipe->readers++ == 0) 945 wake_up_partner(pipe); 946 947 if (!is_pipe && !pipe->writers) { 948 if ((filp->f_flags & O_NONBLOCK)) { 949 /* suppress EPOLLHUP until we have 950 * seen a writer */ 951 filp->f_version = pipe->w_counter; 952 } else { 953 if (wait_for_partner(pipe, &pipe->w_counter)) 954 goto err_rd; 955 } 956 } 957 break; 958 959 case FMODE_WRITE: 960 /* 961 * O_WRONLY 962 * POSIX.1 says that O_NONBLOCK means return -1 with 963 * errno=ENXIO when there is no process reading the FIFO. 964 */ 965 ret = -ENXIO; 966 if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers) 967 goto err; 968 969 pipe->w_counter++; 970 if (!pipe->writers++) 971 wake_up_partner(pipe); 972 973 if (!is_pipe && !pipe->readers) { 974 if (wait_for_partner(pipe, &pipe->r_counter)) 975 goto err_wr; 976 } 977 break; 978 979 case FMODE_READ | FMODE_WRITE: 980 /* 981 * O_RDWR 982 * POSIX.1 leaves this case "undefined" when O_NONBLOCK is set. 983 * This implementation will NEVER block on a O_RDWR open, since 984 * the process can at least talk to itself. 985 */ 986 987 pipe->readers++; 988 pipe->writers++; 989 pipe->r_counter++; 990 pipe->w_counter++; 991 if (pipe->readers == 1 || pipe->writers == 1) 992 wake_up_partner(pipe); 993 break; 994 995 default: 996 ret = -EINVAL; 997 goto err; 998 } 999 1000 /* Ok! */ 1001 __pipe_unlock(pipe); 1002 return 0; 1003 1004 err_rd: 1005 if (!--pipe->readers) 1006 wake_up_interruptible(&pipe->wait); 1007 ret = -ERESTARTSYS; 1008 goto err; 1009 1010 err_wr: 1011 if (!--pipe->writers) 1012 wake_up_interruptible(&pipe->wait); 1013 ret = -ERESTARTSYS; 1014 goto err; 1015 1016 err: 1017 __pipe_unlock(pipe); 1018 1019 put_pipe_info(inode, pipe); 1020 return ret; 1021 } 1022 1023 const struct file_operations pipefifo_fops = { 1024 .open = fifo_open, 1025 .llseek = no_llseek, 1026 .read_iter = pipe_read, 1027 .write_iter = pipe_write, 1028 .poll = pipe_poll, 1029 .unlocked_ioctl = pipe_ioctl, 1030 .release = pipe_release, 1031 .fasync = pipe_fasync, 1032 }; 1033 1034 /* 1035 * Currently we rely on the pipe array holding a power-of-2 number 1036 * of pages. Returns 0 on error. 1037 */ 1038 unsigned int round_pipe_size(unsigned long size) 1039 { 1040 if (size > (1U << 31)) 1041 return 0; 1042 1043 /* Minimum pipe size, as required by POSIX */ 1044 if (size < PAGE_SIZE) 1045 return PAGE_SIZE; 1046 1047 return roundup_pow_of_two(size); 1048 } 1049 1050 /* 1051 * Allocate a new array of pipe buffers and copy the info over. Returns the 1052 * pipe size if successful, or return -ERROR on error. 1053 */ 1054 static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) 1055 { 1056 struct pipe_buffer *bufs; 1057 unsigned int size, nr_pages; 1058 unsigned long user_bufs; 1059 long ret = 0; 1060 1061 size = round_pipe_size(arg); 1062 nr_pages = size >> PAGE_SHIFT; 1063 1064 if (!nr_pages) 1065 return -EINVAL; 1066 1067 /* 1068 * If trying to increase the pipe capacity, check that an 1069 * unprivileged user is not trying to exceed various limits 1070 * (soft limit check here, hard limit check just below). 1071 * Decreasing the pipe capacity is always permitted, even 1072 * if the user is currently over a limit. 1073 */ 1074 if (nr_pages > pipe->buffers && 1075 size > pipe_max_size && !capable(CAP_SYS_RESOURCE)) 1076 return -EPERM; 1077 1078 user_bufs = account_pipe_buffers(pipe->user, pipe->buffers, nr_pages); 1079 1080 if (nr_pages > pipe->buffers && 1081 (too_many_pipe_buffers_hard(user_bufs) || 1082 too_many_pipe_buffers_soft(user_bufs)) && 1083 is_unprivileged_user()) { 1084 ret = -EPERM; 1085 goto out_revert_acct; 1086 } 1087 1088 /* 1089 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't 1090 * expect a lot of shrink+grow operations, just free and allocate 1091 * again like we would do for growing. If the pipe currently 1092 * contains more buffers than arg, then return busy. 1093 */ 1094 if (nr_pages < pipe->nrbufs) { 1095 ret = -EBUSY; 1096 goto out_revert_acct; 1097 } 1098 1099 bufs = kcalloc(nr_pages, sizeof(*bufs), 1100 GFP_KERNEL_ACCOUNT | __GFP_NOWARN); 1101 if (unlikely(!bufs)) { 1102 ret = -ENOMEM; 1103 goto out_revert_acct; 1104 } 1105 1106 /* 1107 * The pipe array wraps around, so just start the new one at zero 1108 * and adjust the indexes. 1109 */ 1110 if (pipe->nrbufs) { 1111 unsigned int tail; 1112 unsigned int head; 1113 1114 tail = pipe->curbuf + pipe->nrbufs; 1115 if (tail < pipe->buffers) 1116 tail = 0; 1117 else 1118 tail &= (pipe->buffers - 1); 1119 1120 head = pipe->nrbufs - tail; 1121 if (head) 1122 memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer)); 1123 if (tail) 1124 memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer)); 1125 } 1126 1127 pipe->curbuf = 0; 1128 kfree(pipe->bufs); 1129 pipe->bufs = bufs; 1130 pipe->buffers = nr_pages; 1131 return nr_pages * PAGE_SIZE; 1132 1133 out_revert_acct: 1134 (void) account_pipe_buffers(pipe->user, nr_pages, pipe->buffers); 1135 return ret; 1136 } 1137 1138 /* 1139 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same 1140 * location, so checking ->i_pipe is not enough to verify that this is a 1141 * pipe. 1142 */ 1143 struct pipe_inode_info *get_pipe_info(struct file *file) 1144 { 1145 return file->f_op == &pipefifo_fops ? file->private_data : NULL; 1146 } 1147 1148 long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) 1149 { 1150 struct pipe_inode_info *pipe; 1151 long ret; 1152 1153 pipe = get_pipe_info(file); 1154 if (!pipe) 1155 return -EBADF; 1156 1157 __pipe_lock(pipe); 1158 1159 switch (cmd) { 1160 case F_SETPIPE_SZ: 1161 ret = pipe_set_size(pipe, arg); 1162 break; 1163 case F_GETPIPE_SZ: 1164 ret = pipe->buffers * PAGE_SIZE; 1165 break; 1166 default: 1167 ret = -EINVAL; 1168 break; 1169 } 1170 1171 __pipe_unlock(pipe); 1172 return ret; 1173 } 1174 1175 static const struct super_operations pipefs_ops = { 1176 .destroy_inode = free_inode_nonrcu, 1177 .statfs = simple_statfs, 1178 }; 1179 1180 /* 1181 * pipefs should _never_ be mounted by userland - too much of security hassle, 1182 * no real gain from having the whole whorehouse mounted. So we don't need 1183 * any operations on the root directory. However, we need a non-trivial 1184 * d_name - pipe: will go nicely and kill the special-casing in procfs. 1185 */ 1186 1187 static int pipefs_init_fs_context(struct fs_context *fc) 1188 { 1189 struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC); 1190 if (!ctx) 1191 return -ENOMEM; 1192 ctx->ops = &pipefs_ops; 1193 ctx->dops = &pipefs_dentry_operations; 1194 return 0; 1195 } 1196 1197 static struct file_system_type pipe_fs_type = { 1198 .name = "pipefs", 1199 .init_fs_context = pipefs_init_fs_context, 1200 .kill_sb = kill_anon_super, 1201 }; 1202 1203 static int __init init_pipe_fs(void) 1204 { 1205 int err = register_filesystem(&pipe_fs_type); 1206 1207 if (!err) { 1208 pipe_mnt = kern_mount(&pipe_fs_type); 1209 if (IS_ERR(pipe_mnt)) { 1210 err = PTR_ERR(pipe_mnt); 1211 unregister_filesystem(&pipe_fs_type); 1212 } 1213 } 1214 return err; 1215 } 1216 1217 fs_initcall(init_pipe_fs); 1218