// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <linux/memcontrol.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>

#include "internal.h"

/*
 * The max size that a non-root user is allowed to grow the pipe to.
 * Can be set by root in /proc/sys/fs/pipe-max-size
 */
unsigned int pipe_max_size = 1048576;

/*
 * Maximum allocatable pages per user. The hard limit is unset by default;
 * the soft limit matches the default number of buffers per pipe times the
 * default limit on open files.
 */
unsigned long pipe_user_pages_hard;
unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;

/*
 * We use a start+len construction, which provides full use of the
 * allocated memory.
 * -- Florian Coosmann (FGC)
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
	if (pipe->files)
		mutex_lock_nested(&pipe->mutex, subclass);
}

void pipe_lock(struct pipe_inode_info *pipe)
{
	/*
	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
	 */
	pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);

void pipe_unlock(struct pipe_inode_info *pipe)
{
	if (pipe->files)
		mutex_unlock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_unlock);

static inline void __pipe_lock(struct pipe_inode_info *pipe)
{
	mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
}

static inline void __pipe_unlock(struct pipe_inode_info *pipe)
{
	mutex_unlock(&pipe->mutex);
}

void pipe_double_lock(struct pipe_inode_info *pipe1,
		      struct pipe_inode_info *pipe2)
{
	BUG_ON(pipe1 == pipe2);

	if (pipe1 < pipe2) {
		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
	} else {
		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
	}
}

/* Drop the inode semaphore and wait for a pipe event, atomically */
void pipe_wait(struct pipe_inode_info *pipe)
{
	DEFINE_WAIT(wait);

	/*
	 * Pipes are system-local resources, so sleeping on them
	 * is considered a noninteractive wait:
	 */
	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
	pipe_unlock(pipe);
	schedule();
	finish_wait(&pipe->wait, &wait);
	pipe_lock(pipe);
}
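/*
 * Illustrative sketch (not part of this file): pipe_double_lock() above
 * avoids ABBA deadlocks by always locking the lower-addressed pipe first,
 * so two tasks moving data between the same pair of pipes in opposite
 * directions acquire the mutexes in the same order. A caller such as the
 * tee()/splice() code would use it roughly like this:
 *
 *	pipe_double_lock(ipipe, opipe);
 *	...move or duplicate buffers between ipipe and opipe...
 *	pipe_unlock(ipipe);
 *	pipe_unlock(opipe);
 */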
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * If nobody else uses this page, and we don't already have a
	 * temporary page, let's keep track of it as a one-deep
	 * allocation cache. (Otherwise just release our reference to it)
	 */
	if (page_count(page) == 1 && !pipe->tmp_page)
		pipe->tmp_page = page;
	else
		put_page(page);
}

static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
			       struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	if (page_count(page) == 1) {
		memcg_kmem_uncharge(page, 0);
		__SetPageLocked(page);
		return 0;
	}
	return 1;
}

/**
 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to attempt to steal
 *
 * Description:
 *	This function attempts to steal the &struct page attached to
 *	@buf. If successful, this function returns 0 and returns with
 *	the page locked. The caller may then reuse the page for whatever
 *	he wishes; the typical use is insertion into a different file
 *	page cache.
 */
int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * A reference of one is golden, that means that the owner of this
	 * page is the only one holding a reference to it. Lock the page
	 * and return OK.
	 */
	if (page_count(page) == 1) {
		lock_page(page);
		return 0;
	}

	return 1;
}
EXPORT_SYMBOL(generic_pipe_buf_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to get a reference to
 *
 * Description:
 *	This function grabs an extra reference to @buf. It's used in
 *	the tee() system call, when we duplicate the buffers in one
 *	pipe into another.
 */
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	return try_get_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_confirm - verify contents of the pipe buffer
 * @info:	the pipe that the buffer belongs to
 * @buf:	the buffer to confirm
 *
 * Description:
 *	This function does nothing, because the generic pipe code uses
 *	pages that are always good when inserted into the pipe.
 */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
			     struct pipe_buffer *buf)
{
	return 0;
}
EXPORT_SYMBOL(generic_pipe_buf_confirm);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to put a reference to
 *
 * Description:
 *	This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	put_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);
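/*
 * Illustrative sketch (hypothetical, not part of this file): a pipe buffer
 * provider only needs to fill in the ops documented above. A provider whose
 * pages are always valid and individually referenced could reuse the
 * generic helpers wholesale:
 *
 *	static const struct pipe_buf_operations example_buf_ops = {
 *		.confirm = generic_pipe_buf_confirm,
 *		.release = generic_pipe_buf_release,
 *		.steal   = generic_pipe_buf_steal,
 *		.get     = generic_pipe_buf_get,
 *	};
 */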
/* New data written to a pipe may be appended to a buffer with this type. */
static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = anon_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = {
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = anon_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static const struct pipe_buf_operations packet_pipe_buf_ops = {
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = anon_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

/**
 * pipe_buf_mark_unmergeable - mark a &struct pipe_buffer as unmergeable
 * @buf:	the buffer to mark
 *
 * Description:
 *	This function ensures that no future writes will be merged into the
 *	given &struct pipe_buffer. This is necessary when multiple pipe buffers
 *	share the same backing page.
 */
void pipe_buf_mark_unmergeable(struct pipe_buffer *buf)
{
	if (buf->ops == &anon_pipe_buf_ops)
		buf->ops = &anon_pipe_buf_nomerge_ops;
}

static bool pipe_buf_can_merge(struct pipe_buffer *buf)
{
	return buf->ops == &anon_pipe_buf_ops;
}
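/*
 * The three ops tables above are intentionally identical field-for-field;
 * what distinguishes them is the pointer value itself. pipe_buf_can_merge()
 * compares buf->ops against &anon_pipe_buf_ops, so switching a buffer to
 * the nomerge or packet table is enough to stop pipe_write() from
 * appending to it, with no extra flag needed.
 */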
static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
	size_t total_len = iov_iter_count(to);
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	int do_wakeup;
	ssize_t ret;

	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	__pipe_lock(pipe);
	for (;;) {
		int bufs = pipe->nrbufs;
		if (bufs) {
			int curbuf = pipe->curbuf;
			struct pipe_buffer *buf = pipe->bufs + curbuf;
			size_t chars = buf->len;
			size_t written;
			int error;

			if (chars > total_len)
				chars = total_len;

			error = pipe_buf_confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			written = copy_page_to_iter(buf->page, buf->offset, chars, to);
			if (unlikely(written < chars)) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += chars;
			buf->offset += chars;
			buf->len -= chars;

			/* Was it a packet buffer? Clean up and exit */
			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
				total_len = chars;
				buf->len = 0;
			}

			if (!buf->len) {
				pipe_buf_release(pipe, buf);
				curbuf = (curbuf + 1) & (pipe->buffers - 1);
				pipe->curbuf = curbuf;
				pipe->nrbufs = --bufs;
				do_wakeup = 1;
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
		}
		if (bufs)	/* More to do? */
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			/* syscall merging: Usually we must not sleep
			 * if O_NONBLOCK is set, or if we got some data.
			 * But if a writer sleeps in kernel space, then
			 * we can wait for that data without violating POSIX.
			 */
			if (ret)
				break;
			if (filp->f_flags & O_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
		}
		pipe_wait(pipe);
	}
	__pipe_unlock(pipe);

	/* Signal writers asynchronously that there is more room. */
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	if (ret > 0)
		file_accessed(filp);
	return ret;
}

static inline int is_packetized(struct file *file)
{
	return (file->f_flags & O_DIRECT) != 0;
}
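/*
 * Illustrative userspace sketch (assumption, not part of this file):
 * opening a pipe with O_DIRECT puts it in packet mode. Each write() of up
 * to PAGE_SIZE bytes becomes one packet, and each read() consumes exactly
 * one packet, discarding any unread remainder (see the
 * PIPE_BUF_FLAG_PACKET handling in pipe_read() above):
 *
 *	int fds[2];
 *	pipe2(fds, O_DIRECT);
 *	write(fds[1], "hello world", 11);	// queues one 11-byte packet
 *	char buf[4];
 *	read(fds[0], buf, 4);			// returns 4, drops "o world"
 */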
static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	ssize_t ret = 0;
	int do_wakeup = 0;
	size_t total_len = iov_iter_count(from);
	ssize_t chars;

	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	__pipe_lock(pipe);

	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	/* We try to merge small writes */
	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
	if (pipe->nrbufs && chars != 0) {
		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
							(pipe->buffers - 1);
		struct pipe_buffer *buf = pipe->bufs + lastbuf;
		int offset = buf->offset + buf->len;

		if (pipe_buf_can_merge(buf) && offset + chars <= PAGE_SIZE) {
			ret = pipe_buf_confirm(pipe, buf);
			if (ret)
				goto out;

			ret = copy_page_from_iter(buf->page, offset, chars, from);
			if (unlikely(ret < chars)) {
				ret = -EFAULT;
				goto out;
			}
			do_wakeup = 1;
			buf->len += ret;
			if (!iov_iter_count(from))
				goto out;
		}
	}

	for (;;) {
		int bufs;

		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}
		bufs = pipe->nrbufs;
		if (bufs < pipe->buffers) {
			int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;
			struct page *page = pipe->tmp_page;
			int copied;

			if (!page) {
				page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}
			/* Always wake up, even if the copy fails. Otherwise
			 * we lock up (O_NONBLOCK-)readers that sleep due to
			 * syscall merging.
			 * FIXME! Is this really true?
			 */
			do_wakeup = 1;
			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += copied;

			/* Insert it into the buffer array */
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = copied;
			buf->flags = 0;
			if (is_packetized(filp)) {
				buf->ops = &packet_pipe_buf_ops;
				buf->flags = PIPE_BUF_FLAG_PACKET;
			}
			pipe->nrbufs = ++bufs;
			pipe->tmp_page = NULL;

			if (!iov_iter_count(from))
				break;
		}
		if (bufs < pipe->buffers)
			continue;
		if (filp->f_flags & O_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}
out:
	__pipe_unlock(pipe);
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}
	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
		int err = file_update_time(filp);
		if (err)
			ret = err;
		sb_end_write(file_inode(filp)->i_sb);
	}
	return ret;
}

static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int count, buf, nrbufs;

	switch (cmd) {
	case FIONREAD:
		__pipe_lock(pipe);
		count = 0;
		buf = pipe->curbuf;
		nrbufs = pipe->nrbufs;
		while (--nrbufs >= 0) {
			count += pipe->bufs[buf].len;
			buf = (buf+1) & (pipe->buffers - 1);
		}
		__pipe_unlock(pipe);

		return put_user(count, (int __user *)arg);
	default:
		return -ENOIOCTLCMD;
	}
}
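/*
 * Illustrative userspace sketch (assumption, not part of this file):
 * FIONREAD reports how many bytes are currently queued in the pipe,
 * which is simply the sum of the buf->len fields walked above:
 *
 *	int avail;
 *	if (ioctl(fds[0], FIONREAD, &avail) == 0)
 *		printf("%d bytes ready to read\n", avail);
 */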
/* No kernel lock held - fine */
static __poll_t
pipe_poll(struct file *filp, poll_table *wait)
{
	__poll_t mask;
	struct pipe_inode_info *pipe = filp->private_data;
	int nrbufs;

	poll_wait(filp, &pipe->wait, wait);

	/* Reading only -- no need for acquiring the semaphore. */
	nrbufs = pipe->nrbufs;
	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		mask = (nrbufs > 0) ? EPOLLIN | EPOLLRDNORM : 0;
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= EPOLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		mask |= (nrbufs < pipe->buffers) ? EPOLLOUT | EPOLLWRNORM : 0;
		/*
		 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= EPOLLERR;
	}

	return mask;
}

static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
{
	int kill = 0;

	spin_lock(&inode->i_lock);
	if (!--pipe->files) {
		inode->i_pipe = NULL;
		kill = 1;
	}
	spin_unlock(&inode->i_lock);

	if (kill)
		free_pipe_info(pipe);
}

static int
pipe_release(struct inode *inode, struct file *file)
{
	struct pipe_inode_info *pipe = file->private_data;

	__pipe_lock(pipe);
	if (file->f_mode & FMODE_READ)
		pipe->readers--;
	if (file->f_mode & FMODE_WRITE)
		pipe->writers--;

	if (pipe->readers || pipe->writers) {
		wake_up_interruptible_sync_poll(&pipe->wait, EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM | EPOLLERR | EPOLLHUP);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return 0;
}

static int
pipe_fasync(int fd, struct file *filp, int on)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int retval = 0;

	__pipe_lock(pipe);
	if (filp->f_mode & FMODE_READ)
		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
		if (retval < 0 && (filp->f_mode & FMODE_READ))
			/* this can happen only if on == T */
			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
	}
	__pipe_unlock(pipe);
	return retval;
}

static unsigned long account_pipe_buffers(struct user_struct *user,
					  unsigned long old, unsigned long new)
{
	return atomic_long_add_return(new - old, &user->pipe_bufs);
}

static bool too_many_pipe_buffers_soft(unsigned long user_bufs)
{
	unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);

	return soft_limit && user_bufs > soft_limit;
}

static bool too_many_pipe_buffers_hard(unsigned long user_bufs)
{
	unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);

	return hard_limit && user_bufs > hard_limit;
}

static bool is_unprivileged_user(void)
{
	return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
}
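/*
 * Worked example (assuming the usual defaults): pipe_user_pages_soft is
 * PIPE_DEF_BUFFERS * INR_OPEN_CUR = 16 * 1024 = 16384 pages, i.e. 64 MiB
 * with 4 KiB pages. An unprivileged user already holding that many pipe
 * pages gets new pipes shrunk to a single buffer in alloc_pipe_info()
 * below, and is refused outright only if the (default-off) hard limit
 * is exceeded.
 */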
struct pipe_inode_info *alloc_pipe_info(void)
{
	struct pipe_inode_info *pipe;
	unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
	struct user_struct *user = get_current_user();
	unsigned long user_bufs;
	unsigned int max_size = READ_ONCE(pipe_max_size);

	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
	if (pipe == NULL)
		goto out_free_uid;

	if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
		pipe_bufs = max_size >> PAGE_SHIFT;

	user_bufs = account_pipe_buffers(user, 0, pipe_bufs);

	if (too_many_pipe_buffers_soft(user_bufs) && is_unprivileged_user()) {
		user_bufs = account_pipe_buffers(user, pipe_bufs, 1);
		pipe_bufs = 1;
	}

	if (too_many_pipe_buffers_hard(user_bufs) && is_unprivileged_user())
		goto out_revert_acct;

	pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
			     GFP_KERNEL_ACCOUNT);

	if (pipe->bufs) {
		init_waitqueue_head(&pipe->wait);
		pipe->r_counter = pipe->w_counter = 1;
		pipe->buffers = pipe_bufs;
		pipe->user = user;
		mutex_init(&pipe->mutex);
		return pipe;
	}

out_revert_acct:
	(void) account_pipe_buffers(user, pipe_bufs, 0);
	kfree(pipe);
out_free_uid:
	free_uid(user);
	return NULL;
}

void free_pipe_info(struct pipe_inode_info *pipe)
{
	int i;

	(void) account_pipe_buffers(pipe->user, pipe->buffers, 0);
	free_uid(pipe->user);
	for (i = 0; i < pipe->buffers; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;
		if (buf->ops)
			pipe_buf_release(pipe, buf);
	}
	if (pipe->tmp_page)
		__free_page(pipe->tmp_page);
	kfree(pipe->bufs);
	kfree(pipe);
}

static struct vfsmount *pipe_mnt __read_mostly;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
				d_inode(dentry)->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname	= pipefs_dname,
};

static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	inode->i_ino = get_next_ino();

	pipe = alloc_pipe_info();
	if (!pipe)
		goto fail_iput;

	inode->i_pipe = pipe;
	pipe->files = 2;
	pipe->readers = pipe->writers = 1;
	inode->i_fop = &pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}

int create_pipe_files(struct file **res, int flags)
{
	struct inode *inode = get_pipe_inode();
	struct file *f;

	if (!inode)
		return -ENFILE;

	f = alloc_file_pseudo(inode, pipe_mnt, "",
				O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
				&pipefifo_fops);
	if (IS_ERR(f)) {
		free_pipe_info(inode->i_pipe);
		iput(inode);
		return PTR_ERR(f);
	}

	f->private_data = inode->i_pipe;

	res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
				  &pipefifo_fops);
	if (IS_ERR(res[0])) {
		put_pipe_info(inode, inode->i_pipe);
		fput(f);
		return PTR_ERR(res[0]);
	}
	res[0]->private_data = inode->i_pipe;
	res[1] = f;
	stream_open(inode, res[0]);
	stream_open(inode, res[1]);
	return 0;
}

static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
	int error;
	int fdw, fdr;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
		return -EINVAL;

	error = create_pipe_files(files, flags);
	if (error)
		return error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	fd[0] = fdr;
	fd[1] = fdw;
	return 0;

 err_fdr:
	put_unused_fd(fdr);
 err_read_pipe:
	fput(files[0]);
	fput(files[1]);
	return error;
}
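/*
 * __do_pipe_flags() deliberately stops short of fd_install(): it only
 * reserves the descriptor numbers. The callers below publish them with
 * fd_install() once nothing else can fail, because an installed fd is
 * immediately visible to the task (and its other threads) and can no
 * longer be quietly revoked with put_unused_fd().
 */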
int do_pipe_flags(int *fd, int flags)
{
	struct file *files[2];
	int error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		fd_install(fd[0], files[0]);
		fd_install(fd[1], files[1]);
	}
	return error;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
static int do_pipe2(int __user *fildes, int flags)
{
	struct file *files[2];
	int fd[2];
	int error;

	error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
			fput(files[0]);
			fput(files[1]);
			put_unused_fd(fd[0]);
			put_unused_fd(fd[1]);
			error = -EFAULT;
		} else {
			fd_install(fd[0], files[0]);
			fd_install(fd[1], files[1]);
		}
	}
	return error;
}

SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	return do_pipe2(fildes, flags);
}

SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return do_pipe2(fildes, 0);
}

static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
	int cur = *cnt;

	while (cur == *cnt) {
		pipe_wait(pipe);
		if (signal_pending(current))
			break;
	}
	return cur == *cnt ? -ERESTARTSYS : 0;
}

static void wake_up_partner(struct pipe_inode_info *pipe)
{
	wake_up_interruptible(&pipe->wait);
}
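/*
 * Note on the open handshake used below: r_counter and w_counter only ever
 * increase, once per open of the respective end. wait_for_partner()
 * snapshots the counter and sleeps until it changes, so a blocking FIFO
 * open returns as soon as at least one peer has opened the other end --
 * even if that peer closes again before the waiter wakes up.
 */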
static int fifo_open(struct inode *inode, struct file *filp)
{
	struct pipe_inode_info *pipe;
	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
	int ret;

	filp->f_version = 0;

	spin_lock(&inode->i_lock);
	if (inode->i_pipe) {
		pipe = inode->i_pipe;
		pipe->files++;
		spin_unlock(&inode->i_lock);
	} else {
		spin_unlock(&inode->i_lock);
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;
		pipe->files = 1;
		spin_lock(&inode->i_lock);
		if (unlikely(inode->i_pipe)) {
			inode->i_pipe->files++;
			spin_unlock(&inode->i_lock);
			free_pipe_info(pipe);
			pipe = inode->i_pipe;
		} else {
			inode->i_pipe = pipe;
			spin_unlock(&inode->i_lock);
		}
	}
	filp->private_data = pipe;
	/* OK, we have a pipe and it's pinned down */

	__pipe_lock(pipe);

	/* We can only do regular read/write on fifos */
	stream_open(inode, filp);

	switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
	case FMODE_READ:
	/*
	 *  O_RDONLY
	 *  POSIX.1 says that O_NONBLOCK means return with the FIFO
	 *  opened, even when there is no process writing the FIFO.
	 */
		pipe->r_counter++;
		if (pipe->readers++ == 0)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->writers) {
			if ((filp->f_flags & O_NONBLOCK)) {
				/* suppress EPOLLHUP until we have
				 * seen a writer */
				filp->f_version = pipe->w_counter;
			} else {
				if (wait_for_partner(pipe, &pipe->w_counter))
					goto err_rd;
			}
		}
		break;

	case FMODE_WRITE:
	/*
	 *  O_WRONLY
	 *  POSIX.1 says that O_NONBLOCK means return -1 with
	 *  errno=ENXIO when there is no process reading the FIFO.
	 */
		ret = -ENXIO;
		if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
			goto err;

		pipe->w_counter++;
		if (!pipe->writers++)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->readers) {
			if (wait_for_partner(pipe, &pipe->r_counter))
				goto err_wr;
		}
		break;

	case FMODE_READ | FMODE_WRITE:
	/*
	 *  O_RDWR
	 *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
	 *  This implementation will NEVER block on an O_RDWR open, since
	 *  the process can at least talk to itself.
	 */
		pipe->readers++;
		pipe->writers++;
		pipe->r_counter++;
		pipe->w_counter++;
		if (pipe->readers == 1 || pipe->writers == 1)
			wake_up_partner(pipe);
		break;

	default:
		ret = -EINVAL;
		goto err;
	}

	/* Ok! */
	__pipe_unlock(pipe);
	return 0;

err_rd:
	if (!--pipe->readers)
		wake_up_interruptible(&pipe->wait);
	ret = -ERESTARTSYS;
	goto err;

err_wr:
	if (!--pipe->writers)
		wake_up_interruptible(&pipe->wait);
	ret = -ERESTARTSYS;
	goto err;

err:
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return ret;
}

const struct file_operations pipefifo_fops = {
	.open		= fifo_open,
	.llseek		= no_llseek,
	.read_iter	= pipe_read,
	.write_iter	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.release	= pipe_release,
	.fasync		= pipe_fasync,
};

/*
 * Currently we rely on the pipe array holding a power-of-2 number
 * of pages. Returns 0 on error.
 */
unsigned int round_pipe_size(unsigned long size)
{
	if (size > (1U << 31))
		return 0;

	/* Minimum pipe size, as required by POSIX */
	if (size < PAGE_SIZE)
		return PAGE_SIZE;

	return roundup_pow_of_two(size);
}
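/*
 * Worked example (assuming 4 KiB pages): round_pipe_size(0) -> 4096,
 * round_pipe_size(12000) -> 16384 (the next power of two), and anything
 * above 2^31 is rejected with 0, so the resulting page count always fits
 * the power-of-two index masking used throughout this file.
 */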
/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * new pipe size if successful, or a negative error code on failure.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
{
	struct pipe_buffer *bufs;
	unsigned int size, nr_pages;
	unsigned long user_bufs;
	long ret = 0;

	size = round_pipe_size(arg);
	nr_pages = size >> PAGE_SHIFT;

	if (!nr_pages)
		return -EINVAL;

	/*
	 * If trying to increase the pipe capacity, check that an
	 * unprivileged user is not trying to exceed various limits
	 * (soft limit check here, hard limit check just below).
	 * Decreasing the pipe capacity is always permitted, even
	 * if the user is currently over a limit.
	 */
	if (nr_pages > pipe->buffers &&
			size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
		return -EPERM;

	user_bufs = account_pipe_buffers(pipe->user, pipe->buffers, nr_pages);

	if (nr_pages > pipe->buffers &&
			(too_many_pipe_buffers_hard(user_bufs) ||
			 too_many_pipe_buffers_soft(user_bufs)) &&
			is_unprivileged_user()) {
		ret = -EPERM;
		goto out_revert_acct;
	}

	/*
	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
	 * expect a lot of shrink+grow operations, just free and allocate
	 * again like we would do for growing. If the pipe currently
	 * contains more buffers than arg, then return busy.
	 */
	if (nr_pages < pipe->nrbufs) {
		ret = -EBUSY;
		goto out_revert_acct;
	}

	bufs = kcalloc(nr_pages, sizeof(*bufs),
		       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
	if (unlikely(!bufs)) {
		ret = -ENOMEM;
		goto out_revert_acct;
	}

	/*
	 * The pipe array wraps around, so just start the new one at zero
	 * and adjust the indexes.
	 */
	if (pipe->nrbufs) {
		unsigned int tail;
		unsigned int head;

		tail = pipe->curbuf + pipe->nrbufs;
		if (tail < pipe->buffers)
			tail = 0;
		else
			tail &= (pipe->buffers - 1);

		head = pipe->nrbufs - tail;
		if (head)
			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
		if (tail)
			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
	}

	pipe->curbuf = 0;
	kfree(pipe->bufs);
	pipe->bufs = bufs;
	pipe->buffers = nr_pages;
	return nr_pages * PAGE_SIZE;

out_revert_acct:
	(void) account_pipe_buffers(pipe->user, nr_pages, pipe->buffers);
	return ret;
}

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
struct pipe_inode_info *get_pipe_info(struct file *file)
{
	return file->f_op == &pipefifo_fops ? file->private_data : NULL;
}

long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe;
	long ret;

	pipe = get_pipe_info(file);
	if (!pipe)
		return -EBADF;

	__pipe_lock(pipe);

	switch (cmd) {
	case F_SETPIPE_SZ:
		ret = pipe_set_size(pipe, arg);
		break;
	case F_GETPIPE_SZ:
		ret = pipe->buffers * PAGE_SIZE;
		break;
	default:
		ret = -EINVAL;
		break;
	}

	__pipe_unlock(pipe);
	return ret;
}

static const struct super_operations pipefs_ops = {
	.destroy_inode = free_inode_nonrcu,
	.statfs = simple_statfs,
};

/*
 * pipefs should _never_ be mounted by userland - too much of security hassle,
 * no real gain from having the whole whorehouse mounted. So we don't need
 * any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */

static int pipefs_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC);
	if (!ctx)
		return -ENOMEM;
	ctx->ops = &pipefs_ops;
	ctx->dops = &pipefs_dentry_operations;
	return 0;
}

static struct file_system_type pipe_fs_type = {
	.name		= "pipefs",
	.init_fs_context = pipefs_init_fs_context,
	.kill_sb	= kill_anon_super,
};

static int __init init_pipe_fs(void)
{
	int err = register_filesystem(&pipe_fs_type);

	if (!err) {
		pipe_mnt = kern_mount(&pipe_fs_type);
		if (IS_ERR(pipe_mnt)) {
			err = PTR_ERR(pipe_mnt);
			unregister_filesystem(&pipe_fs_type);
		}
	}
	return err;
}

fs_initcall(init_pipe_fs);
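/*
 * Illustrative userspace sketch (assumption, not part of this file):
 * pipe capacity is queried and resized via fcntl(). The value passed to
 * F_SETPIPE_SZ is rounded up by round_pipe_size() above, so the return
 * value is the size actually granted:
 *
 *	long sz  = fcntl(fds[1], F_GETPIPE_SZ);		// typically 65536
 *	long nsz = fcntl(fds[1], F_SETPIPE_SZ, 100000);	// granted: 131072
 */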