/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <linux/aio.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>

#include "internal.h"

/*
 * The max size that a non-root user is allowed to grow the pipe to.
 * Can be set by root in /proc/sys/fs/pipe-max-size
 */
unsigned int pipe_max_size = 1048576;

/*
 * Minimum pipe size, as required by POSIX
 */
unsigned int pipe_min_size = PAGE_SIZE;

/*
 * We use a start+len construction, which provides full use of the
 * allocated memory.
 * -- Florian Coosmann (FGC)
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
	if (pipe->files)
		mutex_lock_nested(&pipe->mutex, subclass);
}

void pipe_lock(struct pipe_inode_info *pipe)
{
	/*
	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
	 */
	pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);

void pipe_unlock(struct pipe_inode_info *pipe)
{
	if (pipe->files)
		mutex_unlock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_unlock);

static inline void __pipe_lock(struct pipe_inode_info *pipe)
{
	mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
}

static inline void __pipe_unlock(struct pipe_inode_info *pipe)
{
	mutex_unlock(&pipe->mutex);
}

void pipe_double_lock(struct pipe_inode_info *pipe1,
		      struct pipe_inode_info *pipe2)
{
	BUG_ON(pipe1 == pipe2);

	if (pipe1 < pipe2) {
		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
	} else {
		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
	}
}

/* Drop the inode semaphore and wait for a pipe event, atomically */
void pipe_wait(struct pipe_inode_info *pipe)
{
	DEFINE_WAIT(wait);

	/*
	 * Pipes are system-local resources, so sleeping on them
	 * is considered a noninteractive wait:
	 */
	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
	pipe_unlock(pipe);
	schedule();
	finish_wait(&pipe->wait, &wait);
	pipe_lock(pipe);
}
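/*
 * Editorial note (not part of the original file): pipe_double_lock()
 * avoids ABBA deadlock by always taking the lower-addressed pipe's
 * mutex first, so two tasks locking the same pair in opposite argument
 * order still agree on the acquisition order:
 *
 *	Task A: pipe_double_lock(p1, p2);
 *	Task B: pipe_double_lock(p2, p1);
 *
 * Both lock min(p1, p2) before max(p1, p2), so neither can hold one
 * mutex while waiting forever on the other.  Similarly, callers of
 * pipe_wait() must re-check their wait condition after it returns,
 * because the pipe lock is dropped while sleeping.
 */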
static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * If nobody else uses this page, and we don't already have a
	 * temporary page, let's keep track of it as a one-deep
	 * allocation cache. (Otherwise just release our reference to it)
	 */
	if (page_count(page) == 1 && !pipe->tmp_page)
		pipe->tmp_page = page;
	else
		page_cache_release(page);
}

/**
 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to attempt to steal
 *
 * Description:
 *	This function attempts to steal the &struct page attached to
 *	@buf. If successful, this function returns 0 and returns with
 *	the page locked. The caller may then reuse the page for whatever
 *	he wishes; the typical use is insertion into a different file
 *	page cache.
 */
int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * A reference of one is golden, that means that the owner of this
	 * page is the only one holding a reference to it. Lock the page
	 * and return OK.
	 */
	if (page_count(page) == 1) {
		lock_page(page);
		return 0;
	}

	return 1;
}
EXPORT_SYMBOL(generic_pipe_buf_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to get a reference to
 *
 * Description:
 *	This function grabs an extra reference to @buf. It's used in
 *	the tee() system call, when we duplicate the buffers in one
 *	pipe into another.
 */
void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	page_cache_get(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_confirm - verify contents of the pipe buffer
 * @info:	the pipe that the buffer belongs to
 * @buf:	the buffer to confirm
 *
 * Description:
 *	This function does nothing, because the generic pipe code uses
 *	pages that are always good when inserted into the pipe.
 */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
			     struct pipe_buffer *buf)
{
	return 0;
}
EXPORT_SYMBOL(generic_pipe_buf_confirm);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to put a reference to
 *
 * Description:
 *	This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.can_merge = 1,
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = generic_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static const struct pipe_buf_operations packet_pipe_buf_ops = {
	.can_merge = 0,
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = generic_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};
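/*
 * Editorial sketch (hypothetical, for exposition only): the two ops
 * tables above differ only in .can_merge.  A backend that wants packet
 * boundaries preserved, but has no special page handling of its own,
 * could be assembled entirely from the generic helpers:
 *
 *	static const struct pipe_buf_operations example_buf_ops = {
 *		.can_merge = 0,				// never append to a buffer
 *		.confirm = generic_pipe_buf_confirm,	// pages are always valid
 *		.release = generic_pipe_buf_release,	// plain page refcount drop
 *		.steal = generic_pipe_buf_steal,	// may still fail (returns 1)
 *		.get = generic_pipe_buf_get,
 *	};
 */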
static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
	size_t total_len = iov_iter_count(to);
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	int do_wakeup;
	ssize_t ret;

	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	__pipe_lock(pipe);
	for (;;) {
		int bufs = pipe->nrbufs;
		if (bufs) {
			int curbuf = pipe->curbuf;
			struct pipe_buffer *buf = pipe->bufs + curbuf;
			const struct pipe_buf_operations *ops = buf->ops;
			size_t chars = buf->len;
			size_t written;
			int error;

			if (chars > total_len)
				chars = total_len;

			error = ops->confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			written = copy_page_to_iter(buf->page, buf->offset, chars, to);
			if (unlikely(written < chars)) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += chars;
			buf->offset += chars;
			buf->len -= chars;

			/* Was it a packet buffer? Clean up and exit */
			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
				total_len = chars;
				buf->len = 0;
			}

			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				curbuf = (curbuf + 1) & (pipe->buffers - 1);
				pipe->curbuf = curbuf;
				pipe->nrbufs = --bufs;
				do_wakeup = 1;
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
		}
		if (bufs)	/* More to do? */
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			/* syscall merging: Usually we must not sleep
			 * if O_NONBLOCK is set, or if we got some data.
			 * But if a writer sleeps in kernel space, then
			 * we can wait for that data without violating POSIX.
			 */
			if (ret)
				break;
			if (filp->f_flags & O_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
		}
		pipe_wait(pipe);
	}
	__pipe_unlock(pipe);

	/* Signal writers asynchronously that there is more room. */
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	if (ret > 0)
		file_accessed(filp);
	return ret;
}

static inline int is_packetized(struct file *file)
{
	return (file->f_flags & O_DIRECT) != 0;
}
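/*
 * Editorial sketch (userspace, illustrative only): with O_DIRECT the
 * pipe operates in packet mode.  Each write() becomes its own
 * non-mergeable buffer, read() returns at most one packet, and a read
 * shorter than the packet discards the remainder (pipe_read() zeroes
 * buf->len when PIPE_BUF_FLAG_PACKET is set):
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int fds[2];
 *	char buf[64];
 *
 *	pipe2(fds, O_DIRECT);
 *	write(fds[1], "ab", 2);			// packet 1
 *	write(fds[1], "cd", 2);			// packet 2
 *	read(fds[0], buf, sizeof(buf));		// returns 2 ("ab"), not 4
 */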
static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	ssize_t ret = 0;
	int do_wakeup = 0;
	size_t total_len = iov_iter_count(from);
	ssize_t chars;

	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	__pipe_lock(pipe);

	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	/* We try to merge small writes */
	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
	if (pipe->nrbufs && chars != 0) {
		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
							(pipe->buffers - 1);
		struct pipe_buffer *buf = pipe->bufs + lastbuf;
		const struct pipe_buf_operations *ops = buf->ops;
		int offset = buf->offset + buf->len;

		if (ops->can_merge && offset + chars <= PAGE_SIZE) {
			int error = ops->confirm(pipe, buf);
			if (error) {
				ret = error;
				goto out;
			}

			ret = copy_page_from_iter(buf->page, offset, chars, from);
			if (unlikely(ret < chars)) {
				ret = -EFAULT;
				goto out;
			}
			do_wakeup = 1;
			buf->len += chars;
			ret = chars;
			if (!iov_iter_count(from))
				goto out;
		}
	}

	for (;;) {
		int bufs;

		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}
		bufs = pipe->nrbufs;
		if (bufs < pipe->buffers) {
			int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;
			struct page *page = pipe->tmp_page;
			int copied;

			if (!page) {
				page = alloc_page(GFP_HIGHUSER);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}
			/* Always wake up, even if the copy fails. Otherwise
			 * we lock up (O_NONBLOCK-)readers that sleep due to
			 * syscall merging.
			 * FIXME! Is this really true?
			 */
			do_wakeup = 1;
			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += copied;

			/* Insert it into the buffer array */
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = copied;
			buf->flags = 0;
			if (is_packetized(filp)) {
				buf->ops = &packet_pipe_buf_ops;
				buf->flags = PIPE_BUF_FLAG_PACKET;
			}
			pipe->nrbufs = ++bufs;
			pipe->tmp_page = NULL;

			if (!iov_iter_count(from))
				break;
		}
		if (bufs < pipe->buffers)
			continue;
		if (filp->f_flags & O_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}
out:
	__pipe_unlock(pipe);
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}
	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
		int err = file_update_time(filp);
		if (err)
			ret = err;
		sb_end_write(file_inode(filp)->i_sb);
	}
	return ret;
}

static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int count, buf, nrbufs;

	switch (cmd) {
	case FIONREAD:
		__pipe_lock(pipe);
		count = 0;
		buf = pipe->curbuf;
		nrbufs = pipe->nrbufs;
		while (--nrbufs >= 0) {
			count += pipe->bufs[buf].len;
			buf = (buf+1) & (pipe->buffers - 1);
		}
		__pipe_unlock(pipe);

		return put_user(count, (int __user *)arg);
	default:
		return -ENOIOCTLCMD;
	}
}
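/*
 * Editorial sketch (userspace, illustrative only): FIONREAD, handled in
 * pipe_ioctl() above, reports how many bytes are currently buffered in
 * the pipe by summing the len of every occupied pipe_buffer:
 *
 *	#include <sys/ioctl.h>
 *	#include <unistd.h>
 *
 *	int fds[2], avail;
 *
 *	pipe(fds);
 *	write(fds[1], "hello", 5);
 *	ioctl(fds[0], FIONREAD, &avail);	// avail == 5
 */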
/* No kernel lock held - fine */
static unsigned int
pipe_poll(struct file *filp, poll_table *wait)
{
	unsigned int mask;
	struct pipe_inode_info *pipe = filp->private_data;
	int nrbufs;

	poll_wait(filp, &pipe->wait, wait);

	/* Reading only -- no need for acquiring the semaphore. */
	nrbufs = pipe->nrbufs;
	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= POLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
		/*
		 * Most Unices do not set POLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= POLLERR;
	}

	return mask;
}

static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
{
	int kill = 0;

	spin_lock(&inode->i_lock);
	if (!--pipe->files) {
		inode->i_pipe = NULL;
		kill = 1;
	}
	spin_unlock(&inode->i_lock);

	if (kill)
		free_pipe_info(pipe);
}

static int
pipe_release(struct inode *inode, struct file *file)
{
	struct pipe_inode_info *pipe = file->private_data;

	__pipe_lock(pipe);
	if (file->f_mode & FMODE_READ)
		pipe->readers--;
	if (file->f_mode & FMODE_WRITE)
		pipe->writers--;

	if (pipe->readers || pipe->writers) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return 0;
}

static int
pipe_fasync(int fd, struct file *filp, int on)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int retval = 0;

	__pipe_lock(pipe);
	if (filp->f_mode & FMODE_READ)
		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
		if (retval < 0 && (filp->f_mode & FMODE_READ))
			/* this can happen only if on == T */
			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
	}
	__pipe_unlock(pipe);
	return retval;
}

struct pipe_inode_info *alloc_pipe_info(void)
{
	struct pipe_inode_info *pipe;

	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
	if (pipe) {
		pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
		if (pipe->bufs) {
			init_waitqueue_head(&pipe->wait);
			pipe->r_counter = pipe->w_counter = 1;
			pipe->buffers = PIPE_DEF_BUFFERS;
			mutex_init(&pipe->mutex);
			return pipe;
		}
		kfree(pipe);
	}

	return NULL;
}

void free_pipe_info(struct pipe_inode_info *pipe)
{
	int i;

	for (i = 0; i < pipe->buffers; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;
		if (buf->ops)
			buf->ops->release(pipe, buf);
	}
	if (pipe->tmp_page)
		__free_page(pipe->tmp_page);
	kfree(pipe->bufs);
	kfree(pipe);
}
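/*
 * Editorial note (not part of the original file): two distinct
 * reference counts govern a pipe's lifetime.  pipe->files counts the
 * struct file instances attached to the pipe and is protected by
 * inode->i_lock (see put_pipe_info() above and fifo_open() below),
 * while pipe->readers and pipe->writers count the open ends under
 * pipe->mutex.  Only when pipe->files drops to zero does
 * free_pipe_info() run, releasing any buffers still in flight.
 */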
static struct vfsmount *pipe_mnt __read_mostly;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
				dentry->d_inode->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname	= pipefs_dname,
};

static struct inode *get_pipe_inode(void)
{
	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	inode->i_ino = get_next_ino();

	pipe = alloc_pipe_info();
	if (!pipe)
		goto fail_iput;

	inode->i_pipe = pipe;
	pipe->files = 2;
	pipe->readers = pipe->writers = 1;
	inode->i_fop = &pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}

int create_pipe_files(struct file **res, int flags)
{
	int err;
	struct inode *inode = get_pipe_inode();
	struct file *f;
	struct path path;
	static struct qstr name = { .name = "" };

	if (!inode)
		return -ENFILE;

	err = -ENOMEM;
	path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
	if (!path.dentry)
		goto err_inode;
	path.mnt = mntget(pipe_mnt);

	d_instantiate(path.dentry, inode);

	err = -ENFILE;
	f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops);
	if (IS_ERR(f))
		goto err_dentry;

	f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
	f->private_data = inode->i_pipe;

	res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops);
	if (IS_ERR(res[0]))
		goto err_file;

	path_get(&path);
	res[0]->private_data = inode->i_pipe;
	res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK);
	res[1] = f;
	return 0;

err_file:
	put_filp(f);
err_dentry:
	free_pipe_info(inode->i_pipe);
	path_put(&path);
	return err;

err_inode:
	free_pipe_info(inode->i_pipe);
	iput(inode);
	return err;
}

static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
	int error;
	int fdw, fdr;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
		return -EINVAL;

	error = create_pipe_files(files, flags);
	if (error)
		return error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	fd[0] = fdr;
	fd[1] = fdw;
	return 0;

err_fdr:
	put_unused_fd(fdr);
err_read_pipe:
	fput(files[0]);
	fput(files[1]);
	return error;
}

int do_pipe_flags(int *fd, int flags)
{
	struct file *files[2];
	int error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		fd_install(fd[0], files[0]);
		fd_install(fd[1], files[1]);
	}
	return error;
}
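/*
 * Editorial sketch (userspace, illustrative only): the flag mask
 * enforced by __do_pipe_flags() above is exactly what userspace may
 * pass to pipe2(); anything outside O_CLOEXEC | O_NONBLOCK | O_DIRECT
 * is rejected with -EINVAL:
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int fds[2];
 *
 *	if (pipe2(fds, O_CLOEXEC | O_NONBLOCK) < 0)
 *		perror("pipe2");
 *	// pipe2(fds, O_APPEND) would fail with EINVAL.
 */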
/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	struct file *files[2];
	int fd[2];
	int error;

	error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
			fput(files[0]);
			fput(files[1]);
			put_unused_fd(fd[0]);
			put_unused_fd(fd[1]);
			error = -EFAULT;
		} else {
			fd_install(fd[0], files[0]);
			fd_install(fd[1], files[1]);
		}
	}
	return error;
}

SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return sys_pipe2(fildes, 0);
}

static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
	int cur = *cnt;

	while (cur == *cnt) {
		pipe_wait(pipe);
		if (signal_pending(current))
			break;
	}
	return cur == *cnt ? -ERESTARTSYS : 0;
}

static void wake_up_partner(struct pipe_inode_info *pipe)
{
	wake_up_interruptible(&pipe->wait);
}
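/*
 * Editorial sketch (userspace, illustrative only): wait_for_partner()
 * snapshots an open counter and sleeps until another opener bumps it;
 * fifo_open() below uses it to implement the classic FIFO semantics:
 *
 *	#include <sys/stat.h>
 *	#include <fcntl.h>
 *
 *	mkfifo("/tmp/f", 0600);
 *	int fd = open("/tmp/f", O_WRONLY | O_NONBLOCK);
 *	// fd == -1 with errno == ENXIO while no reader has the FIFO
 *	// open; without O_NONBLOCK the open would block in
 *	// wait_for_partner() instead.  O_RDONLY | O_NONBLOCK succeeds
 *	// immediately, as POSIX requires.
 */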
static int fifo_open(struct inode *inode, struct file *filp)
{
	struct pipe_inode_info *pipe;
	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
	int ret;

	filp->f_version = 0;

	spin_lock(&inode->i_lock);
	if (inode->i_pipe) {
		pipe = inode->i_pipe;
		pipe->files++;
		spin_unlock(&inode->i_lock);
	} else {
		spin_unlock(&inode->i_lock);
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;
		pipe->files = 1;
		spin_lock(&inode->i_lock);
		if (unlikely(inode->i_pipe)) {
			inode->i_pipe->files++;
			spin_unlock(&inode->i_lock);
			free_pipe_info(pipe);
			pipe = inode->i_pipe;
		} else {
			inode->i_pipe = pipe;
			spin_unlock(&inode->i_lock);
		}
	}
	filp->private_data = pipe;
	/* OK, we have a pipe and it's pinned down */

	__pipe_lock(pipe);

	/* We can only do regular read/write on fifos */
	filp->f_mode &= (FMODE_READ | FMODE_WRITE);

	switch (filp->f_mode) {
	case FMODE_READ:
		/*
		 * O_RDONLY
		 * POSIX.1 says that O_NONBLOCK means return with the FIFO
		 * opened, even when there is no process writing the FIFO.
		 */
		pipe->r_counter++;
		if (pipe->readers++ == 0)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->writers) {
			if ((filp->f_flags & O_NONBLOCK)) {
				/* suppress POLLHUP until we have
				 * seen a writer */
				filp->f_version = pipe->w_counter;
			} else {
				if (wait_for_partner(pipe, &pipe->w_counter))
					goto err_rd;
			}
		}
		break;

	case FMODE_WRITE:
		/*
		 * O_WRONLY
		 * POSIX.1 says that O_NONBLOCK means return -1 with
		 * errno=ENXIO when there is no process reading the FIFO.
		 */
		ret = -ENXIO;
		if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
			goto err;

		pipe->w_counter++;
		if (!pipe->writers++)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->readers) {
			if (wait_for_partner(pipe, &pipe->r_counter))
				goto err_wr;
		}
		break;

	case FMODE_READ | FMODE_WRITE:
		/*
		 * O_RDWR
		 * POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
		 * This implementation will NEVER block on a O_RDWR open, since
		 * the process can at least talk to itself.
		 */

		pipe->readers++;
		pipe->writers++;
		pipe->r_counter++;
		pipe->w_counter++;
		if (pipe->readers == 1 || pipe->writers == 1)
			wake_up_partner(pipe);
		break;

	default:
		ret = -EINVAL;
		goto err;
	}

	/* Ok! */
	__pipe_unlock(pipe);
	return 0;

err_rd:
	if (!--pipe->readers)
		wake_up_interruptible(&pipe->wait);
	ret = -ERESTARTSYS;
	goto err;

err_wr:
	if (!--pipe->writers)
		wake_up_interruptible(&pipe->wait);
	ret = -ERESTARTSYS;
	goto err;

err:
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return ret;
}

const struct file_operations pipefifo_fops = {
	.open		= fifo_open,
	.llseek		= no_llseek,
	.read		= new_sync_read,
	.read_iter	= pipe_read,
	.write		= new_sync_write,
	.write_iter	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.release	= pipe_release,
	.fasync		= pipe_fasync,
};

/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or an error code on failure.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
{
	struct pipe_buffer *bufs;

	/*
	 * We can shrink the pipe, if nr_pages >= pipe->nrbufs. Since we don't
	 * expect a lot of shrink+grow operations, just free and allocate
	 * again like we would do for growing. If the pipe currently
	 * contains more buffers than nr_pages, then return busy.
	 */
	if (nr_pages < pipe->nrbufs)
		return -EBUSY;

	bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN);
	if (unlikely(!bufs))
		return -ENOMEM;

	/*
	 * The pipe array wraps around, so just start the new one at zero
	 * and adjust the indexes.
	 */
	if (pipe->nrbufs) {
		unsigned int tail;
		unsigned int head;

		tail = pipe->curbuf + pipe->nrbufs;
		if (tail < pipe->buffers)
			tail = 0;
		else
			tail &= (pipe->buffers - 1);

		head = pipe->nrbufs - tail;
		if (head)
			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
		if (tail)
			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
	}

	pipe->curbuf = 0;
	kfree(pipe->bufs);
	pipe->bufs = bufs;
	pipe->buffers = nr_pages;
	return nr_pages * PAGE_SIZE;
}

/*
 * Currently we rely on the pipe array holding a power-of-2 number
 * of pages.
 */
static inline unsigned int round_pipe_size(unsigned int size)
{
	unsigned long nr_pages;

	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
}

/*
 * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
 * will return an error.
 */
int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
		 size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
	if (ret < 0 || !write)
		return ret;

	pipe_max_size = round_pipe_size(pipe_max_size);
	return ret;
}

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
struct pipe_inode_info *get_pipe_info(struct file *file)
{
	return file->f_op == &pipefifo_fops ? file->private_data : NULL;
}

long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe;
	long ret;

	pipe = get_pipe_info(file);
	if (!pipe)
		return -EBADF;

	__pipe_lock(pipe);

	switch (cmd) {
	case F_SETPIPE_SZ: {
		unsigned int size, nr_pages;

		size = round_pipe_size(arg);
		nr_pages = size >> PAGE_SHIFT;

		ret = -EINVAL;
		if (!nr_pages)
			goto out;

		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
			ret = -EPERM;
			goto out;
		}
		ret = pipe_set_size(pipe, nr_pages);
		break;
	}
	case F_GETPIPE_SZ:
		ret = pipe->buffers * PAGE_SIZE;
		break;
	default:
		ret = -EINVAL;
		break;
	}

out:
	__pipe_unlock(pipe);
	return ret;
}
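/*
 * Editorial sketch (userspace, illustrative only): F_SETPIPE_SZ rounds
 * the request up via round_pipe_size(), which keeps pipe->buffers a
 * power of two -- that is what makes the "& (pipe->buffers - 1)" ring
 * indexing in pipe_read()/pipe_write() a valid modulo.  On a 4 KiB-page
 * system, asking for 20000 bytes needs 5 pages, which rounds up to 8
 * pages = 32768 bytes:
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int fds[2];
 *
 *	pipe(fds);
 *	fcntl(fds[1], F_SETPIPE_SZ, 20000);	// returns 32768
 *	fcntl(fds[1], F_GETPIPE_SZ);		// also 32768
 */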
static const struct super_operations pipefs_ops = {
	.destroy_inode = free_inode_nonrcu,
	.statfs = simple_statfs,
};

/*
 * pipefs should _never_ be mounted by userland - too much of a security
 * hassle, no real gain from having the whole whorehouse mounted. So we
 * don't need any operations on the root directory. However, we need a
 * non-trivial d_name - pipe: will go nicely and kill the special-casing
 * in procfs.
 */
static struct dentry *pipefs_mount(struct file_system_type *fs_type,
			 int flags, const char *dev_name, void *data)
{
	return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
			&pipefs_dentry_operations, PIPEFS_MAGIC);
}

static struct file_system_type pipe_fs_type = {
	.name		= "pipefs",
	.mount		= pipefs_mount,
	.kill_sb	= kill_anon_super,
};

static int __init init_pipe_fs(void)
{
	int err = register_filesystem(&pipe_fs_type);

	if (!err) {
		pipe_mnt = kern_mount(&pipe_fs_type);
		if (IS_ERR(pipe_mnt)) {
			err = PTR_ERR(pipe_mnt);
			unregister_filesystem(&pipe_fs_type);
		}
	}
	return err;
}

fs_initcall(init_pipe_fs);