/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <linux/memcontrol.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>

#include "internal.h"

/*
 * The max size that a non-root user is allowed to grow the pipe. Can
 * be set by root in /proc/sys/fs/pipe-max-size
 */
unsigned int pipe_max_size = 1048576;

/*
 * Minimum pipe size, as required by POSIX
 */
unsigned int pipe_min_size = PAGE_SIZE;

/* Maximum number of pipe-buffer pages a single user may allocate. The
 * hard limit is unset (0) by default; the soft limit defaults to the
 * default pipe depth times the default open-file limit.
 */
unsigned long pipe_user_pages_hard;
unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;

/*
 * We use a start+len construction, which provides full use of the
 * allocated memory.
 * -- Florian Coosmann (FGC)
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
	if (pipe->files)
		mutex_lock_nested(&pipe->mutex, subclass);
}

void pipe_lock(struct pipe_inode_info *pipe)
{
	/*
	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
	 */
	pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);

void pipe_unlock(struct pipe_inode_info *pipe)
{
	if (pipe->files)
		mutex_unlock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_unlock);

static inline void __pipe_lock(struct pipe_inode_info *pipe)
{
	mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
}

static inline void __pipe_unlock(struct pipe_inode_info *pipe)
{
	mutex_unlock(&pipe->mutex);
}

void pipe_double_lock(struct pipe_inode_info *pipe1,
		      struct pipe_inode_info *pipe2)
{
	BUG_ON(pipe1 == pipe2);

	if (pipe1 < pipe2) {
		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
	} else {
		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
	}
}

/* Drop the inode semaphore and wait for a pipe event, atomically */
void pipe_wait(struct pipe_inode_info *pipe)
{
	DEFINE_WAIT(wait);

	/*
	 * Pipes are system-local resources, so sleeping on them
	 * is considered a noninteractive wait:
	 */
	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
	pipe_unlock(pipe);
	schedule();
	finish_wait(&pipe->wait, &wait);
	pipe_lock(pipe);
}
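/*
 * Note the ordering in pipe_wait(): the task is queued on pipe->wait
 * (via prepare_to_wait()) *before* the pipe lock is dropped.  A wakeup
 * issued by the other end between pipe_unlock() and schedule() is
 * therefore not lost -- it simply makes schedule() return immediately.
 * Callers must hold the pipe lock on entry and must recheck their wait
 * condition after pipe_wait() returns.
 */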
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * If nobody else uses this page, and we don't already have a
	 * temporary page, let's keep track of it as a one-deep
	 * allocation cache. (Otherwise just release our reference to it)
	 */
	if (page_count(page) == 1 && !pipe->tmp_page)
		pipe->tmp_page = page;
	else
		put_page(page);
}

static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
			       struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	if (page_count(page) == 1) {
		if (memcg_kmem_enabled())
			memcg_kmem_uncharge(page, 0);
		__SetPageLocked(page);
		return 0;
	}
	return 1;
}

/**
 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to attempt to steal
 *
 * Description:
 *	This function attempts to steal the &struct page attached to
 *	@buf. If successful, this function returns 0 and returns with
 *	the page locked. The caller may then reuse the page for whatever
 *	he wishes; the typical use is insertion into a different file
 *	page cache.
 */
int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * A reference of one is golden, that means that the owner of this
	 * page is the only one holding a reference to it. Lock the page
	 * and return OK.
	 */
	if (page_count(page) == 1) {
		lock_page(page);
		return 0;
	}

	return 1;
}
EXPORT_SYMBOL(generic_pipe_buf_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to get a reference to
 *
 * Description:
 *	This function grabs an extra reference to @buf. It's used in
 *	the tee() system call, when we duplicate the buffers in one
 *	pipe into another.
 */
void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	get_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_confirm - verify contents of the pipe buffer
 * @info:	the pipe that the buffer belongs to
 * @buf:	the buffer to confirm
 *
 * Description:
 *	This function does nothing, because the generic pipe code uses
 *	pages that are always good when inserted into the pipe.
 */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
			     struct pipe_buffer *buf)
{
	return 0;
}
EXPORT_SYMBOL(generic_pipe_buf_confirm);
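/*
 * Together these helpers implement the &pipe_buf_operations contract for
 * the common case: ->confirm validates a buffer before its contents are
 * touched (a no-op for always-valid anonymous pages), ->steal tries to
 * take exclusive ownership of the backing page (page_count() == 1 means
 * nobody else holds a reference), and ->get/->release manage the page
 * reference held while the data sits in the pipe.
 */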
/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to put a reference to
 *
 * Description:
 *	This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	put_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.can_merge = 1,
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = anon_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static const struct pipe_buf_operations packet_pipe_buf_ops = {
	.can_merge = 0,
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = anon_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
	size_t total_len = iov_iter_count(to);
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	int do_wakeup;
	ssize_t ret;

	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	__pipe_lock(pipe);
	for (;;) {
		int bufs = pipe->nrbufs;
		if (bufs) {
			int curbuf = pipe->curbuf;
			struct pipe_buffer *buf = pipe->bufs + curbuf;
			const struct pipe_buf_operations *ops = buf->ops;
			size_t chars = buf->len;
			size_t written;
			int error;

			if (chars > total_len)
				chars = total_len;

			error = ops->confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			written = copy_page_to_iter(buf->page, buf->offset, chars, to);
			if (unlikely(written < chars)) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += chars;
			buf->offset += chars;
			buf->len -= chars;

			/* Was it a packet buffer? Clean up and exit */
			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
				total_len = chars;
				buf->len = 0;
			}

			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				curbuf = (curbuf + 1) & (pipe->buffers - 1);
				pipe->curbuf = curbuf;
				pipe->nrbufs = --bufs;
				do_wakeup = 1;
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
		}
		if (bufs)	/* More to do? */
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			/* syscall merging: Usually we must not sleep
			 * if O_NONBLOCK is set, or if we got some data.
			 * But if a writer sleeps in kernel space, then
			 * we can wait for that data without violating POSIX.
			 */
			if (ret)
				break;
			if (filp->f_flags & O_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
		}
		pipe_wait(pipe);
	}
	__pipe_unlock(pipe);

	/* Signal writers asynchronously that there is more room. */
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	if (ret > 0)
		file_accessed(filp);
	return ret;
}
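/*
 * Packet mode (PIPE_BUF_FLAG_PACKET, set for pipes opened with O_DIRECT)
 * gives the pipe datagram-like semantics: each write produces a discrete
 * packet, and a read consumes at most one packet, discarding any tail the
 * reader did not ask for -- that is the "total_len = chars; buf->len = 0"
 * case in pipe_read() above.  A minimal userspace sketch (illustrative
 * only, not part of this file):
 *
 *	int fds[2];
 *	pipe2(fds, O_DIRECT);
 *	write(fds[1], "hello", 5);	// queues one 5-byte packet
 *	char c;
 *	read(fds[0], &c, 1);		// returns 1; the other 4 bytes are dropped
 */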
static inline int is_packetized(struct file *file)
{
	return (file->f_flags & O_DIRECT) != 0;
}

static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	ssize_t ret = 0;
	int do_wakeup = 0;
	size_t total_len = iov_iter_count(from);
	ssize_t chars;

	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	__pipe_lock(pipe);

	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	/* We try to merge small writes */
	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
	if (pipe->nrbufs && chars != 0) {
		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
							(pipe->buffers - 1);
		struct pipe_buffer *buf = pipe->bufs + lastbuf;
		const struct pipe_buf_operations *ops = buf->ops;
		int offset = buf->offset + buf->len;

		if (ops->can_merge && offset + chars <= PAGE_SIZE) {
			ret = ops->confirm(pipe, buf);
			if (ret)
				goto out;

			ret = copy_page_from_iter(buf->page, offset, chars, from);
			if (unlikely(ret < chars)) {
				ret = -EFAULT;
				goto out;
			}
			do_wakeup = 1;
			buf->len += ret;
			if (!iov_iter_count(from))
				goto out;
		}
	}

	for (;;) {
		int bufs;

		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}
		bufs = pipe->nrbufs;
		if (bufs < pipe->buffers) {
			int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;
			struct page *page = pipe->tmp_page;
			int copied;

			if (!page) {
				page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}
			/* Always wake up, even if the copy fails. Otherwise
			 * we lock up (O_NONBLOCK-)readers that sleep due to
			 * syscall merging.
			 * FIXME! Is this really true?
			 */
			do_wakeup = 1;
			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += copied;

			/* Insert it into the buffer array */
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = copied;
			buf->flags = 0;
			if (is_packetized(filp)) {
				buf->ops = &packet_pipe_buf_ops;
				buf->flags = PIPE_BUF_FLAG_PACKET;
			}
			pipe->nrbufs = ++bufs;
			pipe->tmp_page = NULL;

			if (!iov_iter_count(from))
				break;
		}
		if (bufs < pipe->buffers)
			continue;
		if (filp->f_flags & O_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}
out:
	__pipe_unlock(pipe);
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}
	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
		int err = file_update_time(filp);
		if (err)
			ret = err;
		sb_end_write(file_inode(filp)->i_sb);
	}
	return ret;
}
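/*
 * A note on the index arithmetic in pipe_read()/pipe_write(): pipe->buffers
 * is always a power of two, so "(curbuf + nrbufs) & (pipe->buffers - 1)" is
 * a cheap modulo that wraps the circular buffer array.  A write of up to
 * PIPE_BUF bytes (one page here) lands in a single buffer while the pipe
 * lock is held -- the writer either finds a free slot or sleeps before
 * copying anything -- which is what makes such writes atomic with respect
 * to concurrent writers, as POSIX requires.
 */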
static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int count, buf, nrbufs;

	switch (cmd) {
	case FIONREAD:
		__pipe_lock(pipe);
		count = 0;
		buf = pipe->curbuf;
		nrbufs = pipe->nrbufs;
		while (--nrbufs >= 0) {
			count += pipe->bufs[buf].len;
			buf = (buf+1) & (pipe->buffers - 1);
		}
		__pipe_unlock(pipe);

		return put_user(count, (int __user *)arg);
	default:
		return -ENOIOCTLCMD;
	}
}

/* No kernel lock held - fine */
static unsigned int
pipe_poll(struct file *filp, poll_table *wait)
{
	unsigned int mask;
	struct pipe_inode_info *pipe = filp->private_data;
	int nrbufs;

	poll_wait(filp, &pipe->wait, wait);

	/* Reading only -- no need for acquiring the semaphore. */
	nrbufs = pipe->nrbufs;
	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= POLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
		/*
		 * Most Unices do not set POLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= POLLERR;
	}

	return mask;
}
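/*
 * pipe_poll() deliberately reads pipe->nrbufs without taking the pipe
 * lock: poll() results are advisory snapshots by nature, and callers must
 * be prepared for a subsequent read() or write() to block (or return
 * -EAGAIN) anyway.  poll_wait() has already queued us on pipe->wait, so
 * any state change after the snapshot re-wakes the poller.
 */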
static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
{
	int kill = 0;

	spin_lock(&inode->i_lock);
	if (!--pipe->files) {
		inode->i_pipe = NULL;
		kill = 1;
	}
	spin_unlock(&inode->i_lock);

	if (kill)
		free_pipe_info(pipe);
}

static int
pipe_release(struct inode *inode, struct file *file)
{
	struct pipe_inode_info *pipe = file->private_data;

	__pipe_lock(pipe);
	if (file->f_mode & FMODE_READ)
		pipe->readers--;
	if (file->f_mode & FMODE_WRITE)
		pipe->writers--;

	if (pipe->readers || pipe->writers) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return 0;
}

static int
pipe_fasync(int fd, struct file *filp, int on)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int retval = 0;

	__pipe_lock(pipe);
	if (filp->f_mode & FMODE_READ)
		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
		if (retval < 0 && (filp->f_mode & FMODE_READ))
			/* this can happen only if on is nonzero */
			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
	}
	__pipe_unlock(pipe);
	return retval;
}

static void account_pipe_buffers(struct pipe_inode_info *pipe,
				 unsigned long old, unsigned long new)
{
	atomic_long_add(new - old, &pipe->user->pipe_bufs);
}

static bool too_many_pipe_buffers_soft(struct user_struct *user)
{
	return pipe_user_pages_soft &&
	       atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_soft;
}

static bool too_many_pipe_buffers_hard(struct user_struct *user)
{
	return pipe_user_pages_hard &&
	       atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_hard;
}

struct pipe_inode_info *alloc_pipe_info(void)
{
	struct pipe_inode_info *pipe;

	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
	if (pipe) {
		unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
		struct user_struct *user = get_current_user();

		if (!too_many_pipe_buffers_hard(user)) {
			if (too_many_pipe_buffers_soft(user))
				pipe_bufs = 1;
			pipe->bufs = kcalloc(pipe_bufs,
					     sizeof(struct pipe_buffer),
					     GFP_KERNEL_ACCOUNT);
		}

		if (pipe->bufs) {
			init_waitqueue_head(&pipe->wait);
			pipe->r_counter = pipe->w_counter = 1;
			pipe->buffers = pipe_bufs;
			pipe->user = user;
			account_pipe_buffers(pipe, 0, pipe_bufs);
			mutex_init(&pipe->mutex);
			return pipe;
		}
		free_uid(user);
		kfree(pipe);
	}

	return NULL;
}

void free_pipe_info(struct pipe_inode_info *pipe)
{
	int i;

	account_pipe_buffers(pipe, pipe->buffers, 0);
	free_uid(pipe->user);
	for (i = 0; i < pipe->buffers; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;
		if (buf->ops)
			buf->ops->release(pipe, buf);
	}
	if (pipe->tmp_page)
		__free_page(pipe->tmp_page);
	kfree(pipe->bufs);
	kfree(pipe);
}
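/*
 * Every successful alloc_pipe_info() is balanced by free_pipe_info():
 * the buffer pages are charged to the allocating user through
 * account_pipe_buffers(), and the user_struct reference taken with
 * get_current_user() is dropped with free_uid().  Note the asymmetry of
 * the two limits above: the soft limit only degrades new pipes to a
 * single buffer, while the hard limit (when set) refuses the allocation
 * outright.
 */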
static struct vfsmount *pipe_mnt __read_mostly;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
				d_inode(dentry)->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname	= pipefs_dname,
};

static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	inode->i_ino = get_next_ino();

	pipe = alloc_pipe_info();
	if (!pipe)
		goto fail_iput;

	inode->i_pipe = pipe;
	pipe->files = 2;
	pipe->readers = pipe->writers = 1;
	inode->i_fop = &pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}

int create_pipe_files(struct file **res, int flags)
{
	int err;
	struct inode *inode = get_pipe_inode();
	struct file *f;
	struct path path;
	static struct qstr name = { .name = "" };

	if (!inode)
		return -ENFILE;

	err = -ENOMEM;
	path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
	if (!path.dentry)
		goto err_inode;
	path.mnt = mntget(pipe_mnt);

	d_instantiate(path.dentry, inode);

	f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops);
	if (IS_ERR(f)) {
		err = PTR_ERR(f);
		goto err_dentry;
	}

	f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
	f->private_data = inode->i_pipe;

	res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops);
	if (IS_ERR(res[0])) {
		err = PTR_ERR(res[0]);
		goto err_file;
	}

	path_get(&path);
	res[0]->private_data = inode->i_pipe;
	res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK);
	res[1] = f;
	return 0;

err_file:
	put_filp(f);
err_dentry:
	free_pipe_info(inode->i_pipe);
	path_put(&path);
	return err;

err_inode:
	free_pipe_info(inode->i_pipe);
	iput(inode);
	return err;
}

static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
	int error;
	int fdw, fdr;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
		return -EINVAL;

	error = create_pipe_files(files, flags);
	if (error)
		return error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	fd[0] = fdr;
	fd[1] = fdw;
	return 0;

err_fdr:
	put_unused_fd(fdr);
err_read_pipe:
	fput(files[0]);
	fput(files[1]);
	return error;
}

int do_pipe_flags(int *fd, int flags)
{
	struct file *files[2];
	int error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		fd_install(fd[0], files[0]);
		fd_install(fd[1], files[1]);
	}
	return error;
}
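/*
 * The fd ordering established above is the contract userspace relies on:
 * fd[0] is always the read end (files[0], FMODE_READ) and fd[1] the write
 * end.  A minimal userspace sketch (illustrative only, not part of this
 * file):
 *
 *	int fds[2];
 *	if (pipe2(fds, O_CLOEXEC) == 0) {
 *		write(fds[1], "x", 1);	// write end
 *		char c;
 *		read(fds[0], &c, 1);	// read end
 *	}
 */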
/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	struct file *files[2];
	int fd[2];
	int error;

	error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
			fput(files[0]);
			fput(files[1]);
			put_unused_fd(fd[0]);
			put_unused_fd(fd[1]);
			error = -EFAULT;
		} else {
			fd_install(fd[0], files[0]);
			fd_install(fd[1], files[1]);
		}
	}
	return error;
}

SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return sys_pipe2(fildes, 0);
}

static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
	int cur = *cnt;

	while (cur == *cnt) {
		pipe_wait(pipe);
		if (signal_pending(current))
			break;
	}
	return cur == *cnt ? -ERESTARTSYS : 0;
}

static void wake_up_partner(struct pipe_inode_info *pipe)
{
	wake_up_interruptible(&pipe->wait);
}

static int fifo_open(struct inode *inode, struct file *filp)
{
	struct pipe_inode_info *pipe;
	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
	int ret;

	filp->f_version = 0;

	spin_lock(&inode->i_lock);
	if (inode->i_pipe) {
		pipe = inode->i_pipe;
		pipe->files++;
		spin_unlock(&inode->i_lock);
	} else {
		spin_unlock(&inode->i_lock);
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;
		pipe->files = 1;
		spin_lock(&inode->i_lock);
		if (unlikely(inode->i_pipe)) {
			inode->i_pipe->files++;
			spin_unlock(&inode->i_lock);
			free_pipe_info(pipe);
			pipe = inode->i_pipe;
		} else {
			inode->i_pipe = pipe;
			spin_unlock(&inode->i_lock);
		}
	}
	filp->private_data = pipe;
	/* OK, we have a pipe and it's pinned down */

	__pipe_lock(pipe);

	/* We can only do regular read/write on fifos */
	filp->f_mode &= (FMODE_READ | FMODE_WRITE);

	switch (filp->f_mode) {
	case FMODE_READ:
		/*
		 * O_RDONLY
		 * POSIX.1 says that O_NONBLOCK means return with the FIFO
		 * opened, even when there is no process writing the FIFO.
		 */
		pipe->r_counter++;
		if (pipe->readers++ == 0)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->writers) {
			if ((filp->f_flags & O_NONBLOCK)) {
				/* suppress POLLHUP until we have
				 * seen a writer */
				filp->f_version = pipe->w_counter;
			} else {
				if (wait_for_partner(pipe, &pipe->w_counter))
					goto err_rd;
			}
		}
		break;

	case FMODE_WRITE:
		/*
		 * O_WRONLY
		 * POSIX.1 says that O_NONBLOCK means return -1 with
		 * errno=ENXIO when there is no process reading the FIFO.
		 */
		ret = -ENXIO;
		if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
			goto err;

		pipe->w_counter++;
		if (!pipe->writers++)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->readers) {
			if (wait_for_partner(pipe, &pipe->r_counter))
				goto err_wr;
		}
		break;

	case FMODE_READ | FMODE_WRITE:
		/*
		 * O_RDWR
		 * POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
		 * This implementation will NEVER block on an O_RDWR open, since
		 * the process can at least talk to itself.
		 */
		pipe->readers++;
		pipe->writers++;
		pipe->r_counter++;
		pipe->w_counter++;
		if (pipe->readers == 1 || pipe->writers == 1)
			wake_up_partner(pipe);
		break;

	default:
		ret = -EINVAL;
		goto err;
	}

	/* Ok! */
	__pipe_unlock(pipe);
	return 0;

err_rd:
	if (!--pipe->readers)
		wake_up_interruptible(&pipe->wait);
	ret = -ERESTARTSYS;
	goto err;

err_wr:
	if (!--pipe->writers)
		wake_up_interruptible(&pipe->wait);
	ret = -ERESTARTSYS;
	goto err;

err:
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return ret;
}
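/*
 * A note on the r_counter/w_counter rendezvous used by fifo_open(): the
 * counters only ever increase, once per open of that side.  A blocked
 * opener samples the partner's counter and sleeps in wait_for_partner()
 * until it changes, so it notices a partner that opened (and possibly
 * already closed again) while it slept -- something a simple "is there a
 * reader/writer right now?" test would miss.
 */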
const struct file_operations pipefifo_fops = {
	.open		= fifo_open,
	.llseek		= no_llseek,
	.read_iter	= pipe_read,
	.write_iter	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.release	= pipe_release,
	.fasync		= pipe_fasync,
};

/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * new pipe size if successful, or -ERROR on error.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
{
	struct pipe_buffer *bufs;

	/*
	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
	 * expect a lot of shrink+grow operations, just free and allocate
	 * again like we would do for growing. If the pipe currently
	 * contains more buffers than arg, then return busy.
	 */
	if (nr_pages < pipe->nrbufs)
		return -EBUSY;

	bufs = kcalloc(nr_pages, sizeof(*bufs),
		       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
	if (unlikely(!bufs))
		return -ENOMEM;

	/*
	 * The pipe array wraps around, so just start the new one at zero
	 * and adjust the indexes.
	 */
	if (pipe->nrbufs) {
		unsigned int tail;
		unsigned int head;

		tail = pipe->curbuf + pipe->nrbufs;
		if (tail < pipe->buffers)
			tail = 0;
		else
			tail &= (pipe->buffers - 1);

		head = pipe->nrbufs - tail;
		if (head)
			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
		if (tail)
			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
	}

	account_pipe_buffers(pipe, pipe->buffers, nr_pages);
	pipe->curbuf = 0;
	kfree(pipe->bufs);
	pipe->bufs = bufs;
	pipe->buffers = nr_pages;
	return nr_pages * PAGE_SIZE;
}

/*
 * Currently we rely on the pipe array holding a power-of-2 number
 * of pages.
 */
static inline unsigned int round_pipe_size(unsigned int size)
{
	unsigned long nr_pages;

	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
}

/*
 * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
 * will return an error.
 */
int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
		 size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
	if (ret < 0 || !write)
		return ret;

	pipe_max_size = round_pipe_size(pipe_max_size);
	return ret;
}

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
struct pipe_inode_info *get_pipe_info(struct file *file)
{
	return file->f_op == &pipefifo_fops ? file->private_data : NULL;
}
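/*
 * Resizing is driven from userspace through fcntl(); pipe_fcntl() below
 * rounds the request up to a power-of-two number of pages and enforces
 * pipe_max_size and the per-user limits for unprivileged callers.  A
 * minimal userspace sketch (illustrative only, not part of this file):
 *
 *	int newsz = fcntl(fds[1], F_SETPIPE_SZ, 1 << 20);  // request 1 MiB
 *	int cursz = fcntl(fds[1], F_GETPIPE_SZ);           // read back
 *	// both report the (rounded) size in bytes, or -1 with errno set
 */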
long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe;
	long ret;

	pipe = get_pipe_info(file);
	if (!pipe)
		return -EBADF;

	__pipe_lock(pipe);

	switch (cmd) {
	case F_SETPIPE_SZ: {
		unsigned int size, nr_pages;

		size = round_pipe_size(arg);
		nr_pages = size >> PAGE_SHIFT;

		ret = -EINVAL;
		if (!nr_pages)
			goto out;

		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
			ret = -EPERM;
			goto out;
		} else if ((too_many_pipe_buffers_hard(pipe->user) ||
			    too_many_pipe_buffers_soft(pipe->user)) &&
			   !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
			ret = -EPERM;
			goto out;
		}
		ret = pipe_set_size(pipe, nr_pages);
		break;
	}
	case F_GETPIPE_SZ:
		ret = pipe->buffers * PAGE_SIZE;
		break;
	default:
		ret = -EINVAL;
		break;
	}

out:
	__pipe_unlock(pipe);
	return ret;
}

static const struct super_operations pipefs_ops = {
	.destroy_inode = free_inode_nonrcu,
	.statfs = simple_statfs,
};

/*
 * pipefs should _never_ be mounted by userland - too much of a security
 * hassle, no real gain from having the whole whorehouse mounted. So we
 * don't need any operations on the root directory. However, we need a
 * non-trivial d_name - pipe: will go nicely and kill the special-casing
 * in procfs.
 */
static struct dentry *pipefs_mount(struct file_system_type *fs_type,
			 int flags, const char *dev_name, void *data)
{
	return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
			&pipefs_dentry_operations, PIPEFS_MAGIC);
}

static struct file_system_type pipe_fs_type = {
	.name		= "pipefs",
	.mount		= pipefs_mount,
	.kill_sb	= kill_anon_super,
};

static int __init init_pipe_fs(void)
{
	int err = register_filesystem(&pipe_fs_type);

	if (!err) {
		pipe_mnt = kern_mount(&pipe_fs_type);
		if (IS_ERR(pipe_mnt)) {
			err = PTR_ERR(pipe_mnt);
			unregister_filesystem(&pipe_fs_type);
		}
	}
	return err;
}

fs_initcall(init_pipe_fs);