/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>

/*
 * The max size that a non-root user is allowed to grow the pipe. Can
 * be set by root in /proc/sys/fs/pipe-max-size
 *
 * The value is in bytes (1048576 == 1 MiB).  pipe_fcntl() compares the
 * rounded F_SETPIPE_SZ request against it for callers that lack
 * CAP_SYS_RESOURCE, and pipe_proc_fn() re-rounds it after a sysctl write.
 */
unsigned int pipe_max_size = 1048576;

/*
 * Minimum pipe size, as required by POSIX
 *
 * In bytes; initialised to one page.
 */
unsigned int pipe_min_size = PAGE_SIZE;

/*
 * We use a start+len construction, which provides full use of the
 * allocated memory.
 * -- Florian Coosmann (FGC)
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
48 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16 49 * 50 * pipe_read & write cleanup 51 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 52 */ 53 54 static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) 55 { 56 if (pipe->inode) 57 mutex_lock_nested(&pipe->inode->i_mutex, subclass); 58 } 59 60 void pipe_lock(struct pipe_inode_info *pipe) 61 { 62 /* 63 * pipe_lock() nests non-pipe inode locks (for writing to a file) 64 */ 65 pipe_lock_nested(pipe, I_MUTEX_PARENT); 66 } 67 EXPORT_SYMBOL(pipe_lock); 68 69 void pipe_unlock(struct pipe_inode_info *pipe) 70 { 71 if (pipe->inode) 72 mutex_unlock(&pipe->inode->i_mutex); 73 } 74 EXPORT_SYMBOL(pipe_unlock); 75 76 void pipe_double_lock(struct pipe_inode_info *pipe1, 77 struct pipe_inode_info *pipe2) 78 { 79 BUG_ON(pipe1 == pipe2); 80 81 if (pipe1 < pipe2) { 82 pipe_lock_nested(pipe1, I_MUTEX_PARENT); 83 pipe_lock_nested(pipe2, I_MUTEX_CHILD); 84 } else { 85 pipe_lock_nested(pipe2, I_MUTEX_PARENT); 86 pipe_lock_nested(pipe1, I_MUTEX_CHILD); 87 } 88 } 89 90 /* Drop the inode semaphore and wait for a pipe event, atomically */ 91 void pipe_wait(struct pipe_inode_info *pipe) 92 { 93 DEFINE_WAIT(wait); 94 95 /* 96 * Pipes are system-local resources, so sleeping on them 97 * is considered a noninteractive wait: 98 */ 99 prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); 100 pipe_unlock(pipe); 101 schedule(); 102 finish_wait(&pipe->wait, &wait); 103 pipe_lock(pipe); 104 } 105 106 static int 107 pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len, 108 int atomic) 109 { 110 unsigned long copy; 111 112 while (len > 0) { 113 while (!iov->iov_len) 114 iov++; 115 copy = min_t(unsigned long, len, iov->iov_len); 116 117 if (atomic) { 118 if (__copy_from_user_inatomic(to, iov->iov_base, copy)) 119 return -EFAULT; 120 } else { 121 if (copy_from_user(to, iov->iov_base, copy)) 122 return -EFAULT; 123 } 124 to += copy; 125 len -= copy; 126 iov->iov_base += copy; 127 iov->iov_len -= copy; 
128 } 129 return 0; 130 } 131 132 static int 133 pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len, 134 int atomic) 135 { 136 unsigned long copy; 137 138 while (len > 0) { 139 while (!iov->iov_len) 140 iov++; 141 copy = min_t(unsigned long, len, iov->iov_len); 142 143 if (atomic) { 144 if (__copy_to_user_inatomic(iov->iov_base, from, copy)) 145 return -EFAULT; 146 } else { 147 if (copy_to_user(iov->iov_base, from, copy)) 148 return -EFAULT; 149 } 150 from += copy; 151 len -= copy; 152 iov->iov_base += copy; 153 iov->iov_len -= copy; 154 } 155 return 0; 156 } 157 158 /* 159 * Attempt to pre-fault in the user memory, so we can use atomic copies. 160 * Returns the number of bytes not faulted in. 161 */ 162 static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len) 163 { 164 while (!iov->iov_len) 165 iov++; 166 167 while (len > 0) { 168 unsigned long this_len; 169 170 this_len = min_t(unsigned long, len, iov->iov_len); 171 if (fault_in_pages_writeable(iov->iov_base, this_len)) 172 break; 173 174 len -= this_len; 175 iov++; 176 } 177 178 return len; 179 } 180 181 /* 182 * Pre-fault in the user memory, so we can use atomic copies. 183 */ 184 static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len) 185 { 186 while (!iov->iov_len) 187 iov++; 188 189 while (len > 0) { 190 unsigned long this_len; 191 192 this_len = min_t(unsigned long, len, iov->iov_len); 193 fault_in_pages_readable(iov->iov_base, this_len); 194 len -= this_len; 195 iov++; 196 } 197 } 198 199 static void anon_pipe_buf_release(struct pipe_inode_info *pipe, 200 struct pipe_buffer *buf) 201 { 202 struct page *page = buf->page; 203 204 /* 205 * If nobody else uses this page, and we don't already have a 206 * temporary page, let's keep track of it as a one-deep 207 * allocation cache. 
(Otherwise just release our reference to it) 208 */ 209 if (page_count(page) == 1 && !pipe->tmp_page) 210 pipe->tmp_page = page; 211 else 212 page_cache_release(page); 213 } 214 215 /** 216 * generic_pipe_buf_map - virtually map a pipe buffer 217 * @pipe: the pipe that the buffer belongs to 218 * @buf: the buffer that should be mapped 219 * @atomic: whether to use an atomic map 220 * 221 * Description: 222 * This function returns a kernel virtual address mapping for the 223 * pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided 224 * and the caller has to be careful not to fault before calling 225 * the unmap function. 226 * 227 * Note that this function occupies KM_USER0 if @atomic != 0. 228 */ 229 void *generic_pipe_buf_map(struct pipe_inode_info *pipe, 230 struct pipe_buffer *buf, int atomic) 231 { 232 if (atomic) { 233 buf->flags |= PIPE_BUF_FLAG_ATOMIC; 234 return kmap_atomic(buf->page); 235 } 236 237 return kmap(buf->page); 238 } 239 EXPORT_SYMBOL(generic_pipe_buf_map); 240 241 /** 242 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer 243 * @pipe: the pipe that the buffer belongs to 244 * @buf: the buffer that should be unmapped 245 * @map_data: the data that the mapping function returned 246 * 247 * Description: 248 * This function undoes the mapping that ->map() provided. 249 */ 250 void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, 251 struct pipe_buffer *buf, void *map_data) 252 { 253 if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { 254 buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; 255 kunmap_atomic(map_data); 256 } else 257 kunmap(buf->page); 258 } 259 EXPORT_SYMBOL(generic_pipe_buf_unmap); 260 261 /** 262 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer 263 * @pipe: the pipe that the buffer belongs to 264 * @buf: the buffer to attempt to steal 265 * 266 * Description: 267 * This function attempts to steal the &struct page attached to 268 * @buf. 
If successful, this function returns 0 and returns with 269 * the page locked. The caller may then reuse the page for whatever 270 * he wishes; the typical use is insertion into a different file 271 * page cache. 272 */ 273 int generic_pipe_buf_steal(struct pipe_inode_info *pipe, 274 struct pipe_buffer *buf) 275 { 276 struct page *page = buf->page; 277 278 /* 279 * A reference of one is golden, that means that the owner of this 280 * page is the only one holding a reference to it. lock the page 281 * and return OK. 282 */ 283 if (page_count(page) == 1) { 284 lock_page(page); 285 return 0; 286 } 287 288 return 1; 289 } 290 EXPORT_SYMBOL(generic_pipe_buf_steal); 291 292 /** 293 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer 294 * @pipe: the pipe that the buffer belongs to 295 * @buf: the buffer to get a reference to 296 * 297 * Description: 298 * This function grabs an extra reference to @buf. It's used in 299 * in the tee() system call, when we duplicate the buffers in one 300 * pipe into another. 301 */ 302 void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) 303 { 304 page_cache_get(buf->page); 305 } 306 EXPORT_SYMBOL(generic_pipe_buf_get); 307 308 /** 309 * generic_pipe_buf_confirm - verify contents of the pipe buffer 310 * @info: the pipe that the buffer belongs to 311 * @buf: the buffer to confirm 312 * 313 * Description: 314 * This function does nothing, because the generic pipe code uses 315 * pages that are always good when inserted into the pipe. 316 */ 317 int generic_pipe_buf_confirm(struct pipe_inode_info *info, 318 struct pipe_buffer *buf) 319 { 320 return 0; 321 } 322 EXPORT_SYMBOL(generic_pipe_buf_confirm); 323 324 /** 325 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer 326 * @pipe: the pipe that the buffer belongs to 327 * @buf: the buffer to put a reference to 328 * 329 * Description: 330 * This function releases a reference to @buf. 
331 */ 332 void generic_pipe_buf_release(struct pipe_inode_info *pipe, 333 struct pipe_buffer *buf) 334 { 335 page_cache_release(buf->page); 336 } 337 EXPORT_SYMBOL(generic_pipe_buf_release); 338 339 static const struct pipe_buf_operations anon_pipe_buf_ops = { 340 .can_merge = 1, 341 .map = generic_pipe_buf_map, 342 .unmap = generic_pipe_buf_unmap, 343 .confirm = generic_pipe_buf_confirm, 344 .release = anon_pipe_buf_release, 345 .steal = generic_pipe_buf_steal, 346 .get = generic_pipe_buf_get, 347 }; 348 349 static const struct pipe_buf_operations packet_pipe_buf_ops = { 350 .can_merge = 0, 351 .map = generic_pipe_buf_map, 352 .unmap = generic_pipe_buf_unmap, 353 .confirm = generic_pipe_buf_confirm, 354 .release = anon_pipe_buf_release, 355 .steal = generic_pipe_buf_steal, 356 .get = generic_pipe_buf_get, 357 }; 358 359 static ssize_t 360 pipe_read(struct kiocb *iocb, const struct iovec *_iov, 361 unsigned long nr_segs, loff_t pos) 362 { 363 struct file *filp = iocb->ki_filp; 364 struct inode *inode = filp->f_path.dentry->d_inode; 365 struct pipe_inode_info *pipe; 366 int do_wakeup; 367 ssize_t ret; 368 struct iovec *iov = (struct iovec *)_iov; 369 size_t total_len; 370 371 total_len = iov_length(iov, nr_segs); 372 /* Null read succeeds. 
*/ 373 if (unlikely(total_len == 0)) 374 return 0; 375 376 do_wakeup = 0; 377 ret = 0; 378 mutex_lock(&inode->i_mutex); 379 pipe = inode->i_pipe; 380 for (;;) { 381 int bufs = pipe->nrbufs; 382 if (bufs) { 383 int curbuf = pipe->curbuf; 384 struct pipe_buffer *buf = pipe->bufs + curbuf; 385 const struct pipe_buf_operations *ops = buf->ops; 386 void *addr; 387 size_t chars = buf->len; 388 int error, atomic; 389 390 if (chars > total_len) 391 chars = total_len; 392 393 error = ops->confirm(pipe, buf); 394 if (error) { 395 if (!ret) 396 ret = error; 397 break; 398 } 399 400 atomic = !iov_fault_in_pages_write(iov, chars); 401 redo: 402 addr = ops->map(pipe, buf, atomic); 403 error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic); 404 ops->unmap(pipe, buf, addr); 405 if (unlikely(error)) { 406 /* 407 * Just retry with the slow path if we failed. 408 */ 409 if (atomic) { 410 atomic = 0; 411 goto redo; 412 } 413 if (!ret) 414 ret = error; 415 break; 416 } 417 ret += chars; 418 buf->offset += chars; 419 buf->len -= chars; 420 421 /* Was it a packet buffer? Clean up and exit */ 422 if (buf->flags & PIPE_BUF_FLAG_PACKET) { 423 total_len = chars; 424 buf->len = 0; 425 } 426 427 if (!buf->len) { 428 buf->ops = NULL; 429 ops->release(pipe, buf); 430 curbuf = (curbuf + 1) & (pipe->buffers - 1); 431 pipe->curbuf = curbuf; 432 pipe->nrbufs = --bufs; 433 do_wakeup = 1; 434 } 435 total_len -= chars; 436 if (!total_len) 437 break; /* common path: read succeeded */ 438 } 439 if (bufs) /* More to do? */ 440 continue; 441 if (!pipe->writers) 442 break; 443 if (!pipe->waiting_writers) { 444 /* syscall merging: Usually we must not sleep 445 * if O_NONBLOCK is set, or if we got some data. 446 * But if a writer sleeps in kernel space, then 447 * we can wait for that data without violating POSIX. 
448 */ 449 if (ret) 450 break; 451 if (filp->f_flags & O_NONBLOCK) { 452 ret = -EAGAIN; 453 break; 454 } 455 } 456 if (signal_pending(current)) { 457 if (!ret) 458 ret = -ERESTARTSYS; 459 break; 460 } 461 if (do_wakeup) { 462 wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM); 463 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 464 } 465 pipe_wait(pipe); 466 } 467 mutex_unlock(&inode->i_mutex); 468 469 /* Signal writers asynchronously that there is more room. */ 470 if (do_wakeup) { 471 wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM); 472 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 473 } 474 if (ret > 0) 475 file_accessed(filp); 476 return ret; 477 } 478 479 static inline int is_packetized(struct file *file) 480 { 481 return (file->f_flags & O_DIRECT) != 0; 482 } 483 484 static ssize_t 485 pipe_write(struct kiocb *iocb, const struct iovec *_iov, 486 unsigned long nr_segs, loff_t ppos) 487 { 488 struct file *filp = iocb->ki_filp; 489 struct inode *inode = filp->f_path.dentry->d_inode; 490 struct pipe_inode_info *pipe; 491 ssize_t ret; 492 int do_wakeup; 493 struct iovec *iov = (struct iovec *)_iov; 494 size_t total_len; 495 ssize_t chars; 496 497 total_len = iov_length(iov, nr_segs); 498 /* Null write succeeds. 
*/ 499 if (unlikely(total_len == 0)) 500 return 0; 501 502 do_wakeup = 0; 503 ret = 0; 504 mutex_lock(&inode->i_mutex); 505 pipe = inode->i_pipe; 506 507 if (!pipe->readers) { 508 send_sig(SIGPIPE, current, 0); 509 ret = -EPIPE; 510 goto out; 511 } 512 513 /* We try to merge small writes */ 514 chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ 515 if (pipe->nrbufs && chars != 0) { 516 int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & 517 (pipe->buffers - 1); 518 struct pipe_buffer *buf = pipe->bufs + lastbuf; 519 const struct pipe_buf_operations *ops = buf->ops; 520 int offset = buf->offset + buf->len; 521 522 if (ops->can_merge && offset + chars <= PAGE_SIZE) { 523 int error, atomic = 1; 524 void *addr; 525 526 error = ops->confirm(pipe, buf); 527 if (error) 528 goto out; 529 530 iov_fault_in_pages_read(iov, chars); 531 redo1: 532 addr = ops->map(pipe, buf, atomic); 533 error = pipe_iov_copy_from_user(offset + addr, iov, 534 chars, atomic); 535 ops->unmap(pipe, buf, addr); 536 ret = error; 537 do_wakeup = 1; 538 if (error) { 539 if (atomic) { 540 atomic = 0; 541 goto redo1; 542 } 543 goto out; 544 } 545 buf->len += chars; 546 total_len -= chars; 547 ret = chars; 548 if (!total_len) 549 goto out; 550 } 551 } 552 553 for (;;) { 554 int bufs; 555 556 if (!pipe->readers) { 557 send_sig(SIGPIPE, current, 0); 558 if (!ret) 559 ret = -EPIPE; 560 break; 561 } 562 bufs = pipe->nrbufs; 563 if (bufs < pipe->buffers) { 564 int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1); 565 struct pipe_buffer *buf = pipe->bufs + newbuf; 566 struct page *page = pipe->tmp_page; 567 char *src; 568 int error, atomic = 1; 569 570 if (!page) { 571 page = alloc_page(GFP_HIGHUSER); 572 if (unlikely(!page)) { 573 ret = ret ? : -ENOMEM; 574 break; 575 } 576 pipe->tmp_page = page; 577 } 578 /* Always wake up, even if the copy fails. Otherwise 579 * we lock up (O_NONBLOCK-)readers that sleep due to 580 * syscall merging. 581 * FIXME! Is this really true? 
582 */ 583 do_wakeup = 1; 584 chars = PAGE_SIZE; 585 if (chars > total_len) 586 chars = total_len; 587 588 iov_fault_in_pages_read(iov, chars); 589 redo2: 590 if (atomic) 591 src = kmap_atomic(page); 592 else 593 src = kmap(page); 594 595 error = pipe_iov_copy_from_user(src, iov, chars, 596 atomic); 597 if (atomic) 598 kunmap_atomic(src); 599 else 600 kunmap(page); 601 602 if (unlikely(error)) { 603 if (atomic) { 604 atomic = 0; 605 goto redo2; 606 } 607 if (!ret) 608 ret = error; 609 break; 610 } 611 ret += chars; 612 613 /* Insert it into the buffer array */ 614 buf->page = page; 615 buf->ops = &anon_pipe_buf_ops; 616 buf->offset = 0; 617 buf->len = chars; 618 buf->flags = 0; 619 if (is_packetized(filp)) { 620 buf->ops = &packet_pipe_buf_ops; 621 buf->flags = PIPE_BUF_FLAG_PACKET; 622 } 623 pipe->nrbufs = ++bufs; 624 pipe->tmp_page = NULL; 625 626 total_len -= chars; 627 if (!total_len) 628 break; 629 } 630 if (bufs < pipe->buffers) 631 continue; 632 if (filp->f_flags & O_NONBLOCK) { 633 if (!ret) 634 ret = -EAGAIN; 635 break; 636 } 637 if (signal_pending(current)) { 638 if (!ret) 639 ret = -ERESTARTSYS; 640 break; 641 } 642 if (do_wakeup) { 643 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); 644 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 645 do_wakeup = 0; 646 } 647 pipe->waiting_writers++; 648 pipe_wait(pipe); 649 pipe->waiting_writers--; 650 } 651 out: 652 mutex_unlock(&inode->i_mutex); 653 if (do_wakeup) { 654 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); 655 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 656 } 657 if (ret > 0) 658 file_update_time(filp); 659 return ret; 660 } 661 662 static ssize_t 663 bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos) 664 { 665 return -EBADF; 666 } 667 668 static ssize_t 669 bad_pipe_w(struct file *filp, const char __user *buf, size_t count, 670 loff_t *ppos) 671 { 672 return -EBADF; 673 } 674 675 static long pipe_ioctl(struct file *filp, 
unsigned int cmd, unsigned long arg) 676 { 677 struct inode *inode = filp->f_path.dentry->d_inode; 678 struct pipe_inode_info *pipe; 679 int count, buf, nrbufs; 680 681 switch (cmd) { 682 case FIONREAD: 683 mutex_lock(&inode->i_mutex); 684 pipe = inode->i_pipe; 685 count = 0; 686 buf = pipe->curbuf; 687 nrbufs = pipe->nrbufs; 688 while (--nrbufs >= 0) { 689 count += pipe->bufs[buf].len; 690 buf = (buf+1) & (pipe->buffers - 1); 691 } 692 mutex_unlock(&inode->i_mutex); 693 694 return put_user(count, (int __user *)arg); 695 default: 696 return -EINVAL; 697 } 698 } 699 700 /* No kernel lock held - fine */ 701 static unsigned int 702 pipe_poll(struct file *filp, poll_table *wait) 703 { 704 unsigned int mask; 705 struct inode *inode = filp->f_path.dentry->d_inode; 706 struct pipe_inode_info *pipe = inode->i_pipe; 707 int nrbufs; 708 709 poll_wait(filp, &pipe->wait, wait); 710 711 /* Reading only -- no need for acquiring the semaphore. */ 712 nrbufs = pipe->nrbufs; 713 mask = 0; 714 if (filp->f_mode & FMODE_READ) { 715 mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0; 716 if (!pipe->writers && filp->f_version != pipe->w_counter) 717 mask |= POLLHUP; 718 } 719 720 if (filp->f_mode & FMODE_WRITE) { 721 mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0; 722 /* 723 * Most Unices do not set POLLERR for FIFOs but on Linux they 724 * behave exactly like pipes for poll(). 
725 */ 726 if (!pipe->readers) 727 mask |= POLLERR; 728 } 729 730 return mask; 731 } 732 733 static int 734 pipe_release(struct inode *inode, int decr, int decw) 735 { 736 struct pipe_inode_info *pipe; 737 738 mutex_lock(&inode->i_mutex); 739 pipe = inode->i_pipe; 740 pipe->readers -= decr; 741 pipe->writers -= decw; 742 743 if (!pipe->readers && !pipe->writers) { 744 free_pipe_info(inode); 745 } else { 746 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP); 747 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); 748 kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); 749 } 750 mutex_unlock(&inode->i_mutex); 751 752 return 0; 753 } 754 755 static int 756 pipe_read_fasync(int fd, struct file *filp, int on) 757 { 758 struct inode *inode = filp->f_path.dentry->d_inode; 759 int retval; 760 761 mutex_lock(&inode->i_mutex); 762 retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); 763 mutex_unlock(&inode->i_mutex); 764 765 return retval; 766 } 767 768 769 static int 770 pipe_write_fasync(int fd, struct file *filp, int on) 771 { 772 struct inode *inode = filp->f_path.dentry->d_inode; 773 int retval; 774 775 mutex_lock(&inode->i_mutex); 776 retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); 777 mutex_unlock(&inode->i_mutex); 778 779 return retval; 780 } 781 782 783 static int 784 pipe_rdwr_fasync(int fd, struct file *filp, int on) 785 { 786 struct inode *inode = filp->f_path.dentry->d_inode; 787 struct pipe_inode_info *pipe = inode->i_pipe; 788 int retval; 789 790 mutex_lock(&inode->i_mutex); 791 retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); 792 if (retval >= 0) { 793 retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); 794 if (retval < 0) /* this can happen only if on == T */ 795 fasync_helper(-1, filp, 0, &pipe->fasync_readers); 796 } 797 mutex_unlock(&inode->i_mutex); 798 return retval; 799 } 800 801 802 static int 803 pipe_read_release(struct inode 
*inode, struct file *filp) 804 { 805 return pipe_release(inode, 1, 0); 806 } 807 808 static int 809 pipe_write_release(struct inode *inode, struct file *filp) 810 { 811 return pipe_release(inode, 0, 1); 812 } 813 814 static int 815 pipe_rdwr_release(struct inode *inode, struct file *filp) 816 { 817 int decr, decw; 818 819 decr = (filp->f_mode & FMODE_READ) != 0; 820 decw = (filp->f_mode & FMODE_WRITE) != 0; 821 return pipe_release(inode, decr, decw); 822 } 823 824 static int 825 pipe_read_open(struct inode *inode, struct file *filp) 826 { 827 int ret = -ENOENT; 828 829 mutex_lock(&inode->i_mutex); 830 831 if (inode->i_pipe) { 832 ret = 0; 833 inode->i_pipe->readers++; 834 } 835 836 mutex_unlock(&inode->i_mutex); 837 838 return ret; 839 } 840 841 static int 842 pipe_write_open(struct inode *inode, struct file *filp) 843 { 844 int ret = -ENOENT; 845 846 mutex_lock(&inode->i_mutex); 847 848 if (inode->i_pipe) { 849 ret = 0; 850 inode->i_pipe->writers++; 851 } 852 853 mutex_unlock(&inode->i_mutex); 854 855 return ret; 856 } 857 858 static int 859 pipe_rdwr_open(struct inode *inode, struct file *filp) 860 { 861 int ret = -ENOENT; 862 863 mutex_lock(&inode->i_mutex); 864 865 if (inode->i_pipe) { 866 ret = 0; 867 if (filp->f_mode & FMODE_READ) 868 inode->i_pipe->readers++; 869 if (filp->f_mode & FMODE_WRITE) 870 inode->i_pipe->writers++; 871 } 872 873 mutex_unlock(&inode->i_mutex); 874 875 return ret; 876 } 877 878 /* 879 * The file_operations structs are not static because they 880 * are also used in linux/fs/fifo.c to do operations on FIFOs. 881 * 882 * Pipes reuse fifos' file_operations structs. 
883 */ 884 const struct file_operations read_pipefifo_fops = { 885 .llseek = no_llseek, 886 .read = do_sync_read, 887 .aio_read = pipe_read, 888 .write = bad_pipe_w, 889 .poll = pipe_poll, 890 .unlocked_ioctl = pipe_ioctl, 891 .open = pipe_read_open, 892 .release = pipe_read_release, 893 .fasync = pipe_read_fasync, 894 }; 895 896 const struct file_operations write_pipefifo_fops = { 897 .llseek = no_llseek, 898 .read = bad_pipe_r, 899 .write = do_sync_write, 900 .aio_write = pipe_write, 901 .poll = pipe_poll, 902 .unlocked_ioctl = pipe_ioctl, 903 .open = pipe_write_open, 904 .release = pipe_write_release, 905 .fasync = pipe_write_fasync, 906 }; 907 908 const struct file_operations rdwr_pipefifo_fops = { 909 .llseek = no_llseek, 910 .read = do_sync_read, 911 .aio_read = pipe_read, 912 .write = do_sync_write, 913 .aio_write = pipe_write, 914 .poll = pipe_poll, 915 .unlocked_ioctl = pipe_ioctl, 916 .open = pipe_rdwr_open, 917 .release = pipe_rdwr_release, 918 .fasync = pipe_rdwr_fasync, 919 }; 920 921 struct pipe_inode_info * alloc_pipe_info(struct inode *inode) 922 { 923 struct pipe_inode_info *pipe; 924 925 pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); 926 if (pipe) { 927 pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL); 928 if (pipe->bufs) { 929 init_waitqueue_head(&pipe->wait); 930 pipe->r_counter = pipe->w_counter = 1; 931 pipe->inode = inode; 932 pipe->buffers = PIPE_DEF_BUFFERS; 933 return pipe; 934 } 935 kfree(pipe); 936 } 937 938 return NULL; 939 } 940 941 void __free_pipe_info(struct pipe_inode_info *pipe) 942 { 943 int i; 944 945 for (i = 0; i < pipe->buffers; i++) { 946 struct pipe_buffer *buf = pipe->bufs + i; 947 if (buf->ops) 948 buf->ops->release(pipe, buf); 949 } 950 if (pipe->tmp_page) 951 __free_page(pipe->tmp_page); 952 kfree(pipe->bufs); 953 kfree(pipe); 954 } 955 956 void free_pipe_info(struct inode *inode) 957 { 958 __free_pipe_info(inode->i_pipe); 959 inode->i_pipe = NULL; 960 } 961 962 static struct 
vfsmount *pipe_mnt __read_mostly; 963 964 /* 965 * pipefs_dname() is called from d_path(). 966 */ 967 static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) 968 { 969 return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", 970 dentry->d_inode->i_ino); 971 } 972 973 static const struct dentry_operations pipefs_dentry_operations = { 974 .d_dname = pipefs_dname, 975 }; 976 977 static struct inode * get_pipe_inode(void) 978 { 979 struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb); 980 struct pipe_inode_info *pipe; 981 982 if (!inode) 983 goto fail_inode; 984 985 inode->i_ino = get_next_ino(); 986 987 pipe = alloc_pipe_info(inode); 988 if (!pipe) 989 goto fail_iput; 990 inode->i_pipe = pipe; 991 992 pipe->readers = pipe->writers = 1; 993 inode->i_fop = &rdwr_pipefifo_fops; 994 995 /* 996 * Mark the inode dirty from the very beginning, 997 * that way it will never be moved to the dirty 998 * list because "mark_inode_dirty()" will think 999 * that it already _is_ on the dirty list. 
1000 */ 1001 inode->i_state = I_DIRTY; 1002 inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; 1003 inode->i_uid = current_fsuid(); 1004 inode->i_gid = current_fsgid(); 1005 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1006 1007 return inode; 1008 1009 fail_iput: 1010 iput(inode); 1011 1012 fail_inode: 1013 return NULL; 1014 } 1015 1016 struct file *create_write_pipe(int flags) 1017 { 1018 int err; 1019 struct inode *inode; 1020 struct file *f; 1021 struct path path; 1022 struct qstr name = { .name = "" }; 1023 1024 err = -ENFILE; 1025 inode = get_pipe_inode(); 1026 if (!inode) 1027 goto err; 1028 1029 err = -ENOMEM; 1030 path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name); 1031 if (!path.dentry) 1032 goto err_inode; 1033 path.mnt = mntget(pipe_mnt); 1034 1035 d_instantiate(path.dentry, inode); 1036 1037 err = -ENFILE; 1038 f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops); 1039 if (!f) 1040 goto err_dentry; 1041 f->f_mapping = inode->i_mapping; 1042 1043 f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)); 1044 f->f_version = 0; 1045 1046 return f; 1047 1048 err_dentry: 1049 free_pipe_info(inode); 1050 path_put(&path); 1051 return ERR_PTR(err); 1052 1053 err_inode: 1054 free_pipe_info(inode); 1055 iput(inode); 1056 err: 1057 return ERR_PTR(err); 1058 } 1059 1060 void free_write_pipe(struct file *f) 1061 { 1062 free_pipe_info(f->f_dentry->d_inode); 1063 path_put(&f->f_path); 1064 put_filp(f); 1065 } 1066 1067 struct file *create_read_pipe(struct file *wrf, int flags) 1068 { 1069 /* Grab pipe from the writer */ 1070 struct file *f = alloc_file(&wrf->f_path, FMODE_READ, 1071 &read_pipefifo_fops); 1072 if (!f) 1073 return ERR_PTR(-ENFILE); 1074 1075 path_get(&wrf->f_path); 1076 f->f_flags = O_RDONLY | (flags & O_NONBLOCK); 1077 1078 return f; 1079 } 1080 1081 int do_pipe_flags(int *fd, int flags) 1082 { 1083 struct file *fw, *fr; 1084 int error; 1085 int fdw, fdr; 1086 1087 if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT)) 1088 return 
-EINVAL; 1089 1090 fw = create_write_pipe(flags); 1091 if (IS_ERR(fw)) 1092 return PTR_ERR(fw); 1093 fr = create_read_pipe(fw, flags); 1094 error = PTR_ERR(fr); 1095 if (IS_ERR(fr)) 1096 goto err_write_pipe; 1097 1098 error = get_unused_fd_flags(flags); 1099 if (error < 0) 1100 goto err_read_pipe; 1101 fdr = error; 1102 1103 error = get_unused_fd_flags(flags); 1104 if (error < 0) 1105 goto err_fdr; 1106 fdw = error; 1107 1108 audit_fd_pair(fdr, fdw); 1109 fd_install(fdr, fr); 1110 fd_install(fdw, fw); 1111 fd[0] = fdr; 1112 fd[1] = fdw; 1113 1114 return 0; 1115 1116 err_fdr: 1117 put_unused_fd(fdr); 1118 err_read_pipe: 1119 path_put(&fr->f_path); 1120 put_filp(fr); 1121 err_write_pipe: 1122 free_write_pipe(fw); 1123 return error; 1124 } 1125 1126 /* 1127 * sys_pipe() is the normal C calling standard for creating 1128 * a pipe. It's not the way Unix traditionally does this, though. 1129 */ 1130 SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags) 1131 { 1132 int fd[2]; 1133 int error; 1134 1135 error = do_pipe_flags(fd, flags); 1136 if (!error) { 1137 if (copy_to_user(fildes, fd, sizeof(fd))) { 1138 sys_close(fd[0]); 1139 sys_close(fd[1]); 1140 error = -EFAULT; 1141 } 1142 } 1143 return error; 1144 } 1145 1146 SYSCALL_DEFINE1(pipe, int __user *, fildes) 1147 { 1148 return sys_pipe2(fildes, 0); 1149 } 1150 1151 /* 1152 * Allocate a new array of pipe buffers and copy the info over. Returns the 1153 * pipe size if successful, or return -ERROR on error. 1154 */ 1155 static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) 1156 { 1157 struct pipe_buffer *bufs; 1158 1159 /* 1160 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't 1161 * expect a lot of shrink+grow operations, just free and allocate 1162 * again like we would do for growing. If the pipe currently 1163 * contains more buffers than arg, then return busy. 
1164 */ 1165 if (nr_pages < pipe->nrbufs) 1166 return -EBUSY; 1167 1168 bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN); 1169 if (unlikely(!bufs)) 1170 return -ENOMEM; 1171 1172 /* 1173 * The pipe array wraps around, so just start the new one at zero 1174 * and adjust the indexes. 1175 */ 1176 if (pipe->nrbufs) { 1177 unsigned int tail; 1178 unsigned int head; 1179 1180 tail = pipe->curbuf + pipe->nrbufs; 1181 if (tail < pipe->buffers) 1182 tail = 0; 1183 else 1184 tail &= (pipe->buffers - 1); 1185 1186 head = pipe->nrbufs - tail; 1187 if (head) 1188 memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer)); 1189 if (tail) 1190 memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer)); 1191 } 1192 1193 pipe->curbuf = 0; 1194 kfree(pipe->bufs); 1195 pipe->bufs = bufs; 1196 pipe->buffers = nr_pages; 1197 return nr_pages * PAGE_SIZE; 1198 } 1199 1200 /* 1201 * Currently we rely on the pipe array holding a power-of-2 number 1202 * of pages. 1203 */ 1204 static inline unsigned int round_pipe_size(unsigned int size) 1205 { 1206 unsigned long nr_pages; 1207 1208 nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; 1209 return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; 1210 } 1211 1212 /* 1213 * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax 1214 * will return an error. 1215 */ 1216 int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, 1217 size_t *lenp, loff_t *ppos) 1218 { 1219 int ret; 1220 1221 ret = proc_dointvec_minmax(table, write, buf, lenp, ppos); 1222 if (ret < 0 || !write) 1223 return ret; 1224 1225 pipe_max_size = round_pipe_size(pipe_max_size); 1226 return ret; 1227 } 1228 1229 /* 1230 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same 1231 * location, so checking ->i_pipe is not enough to verify that this is a 1232 * pipe. 
1233 */ 1234 struct pipe_inode_info *get_pipe_info(struct file *file) 1235 { 1236 struct inode *i = file->f_path.dentry->d_inode; 1237 1238 return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL; 1239 } 1240 1241 long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) 1242 { 1243 struct pipe_inode_info *pipe; 1244 long ret; 1245 1246 pipe = get_pipe_info(file); 1247 if (!pipe) 1248 return -EBADF; 1249 1250 mutex_lock(&pipe->inode->i_mutex); 1251 1252 switch (cmd) { 1253 case F_SETPIPE_SZ: { 1254 unsigned int size, nr_pages; 1255 1256 size = round_pipe_size(arg); 1257 nr_pages = size >> PAGE_SHIFT; 1258 1259 ret = -EINVAL; 1260 if (!nr_pages) 1261 goto out; 1262 1263 if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) { 1264 ret = -EPERM; 1265 goto out; 1266 } 1267 ret = pipe_set_size(pipe, nr_pages); 1268 break; 1269 } 1270 case F_GETPIPE_SZ: 1271 ret = pipe->buffers * PAGE_SIZE; 1272 break; 1273 default: 1274 ret = -EINVAL; 1275 break; 1276 } 1277 1278 out: 1279 mutex_unlock(&pipe->inode->i_mutex); 1280 return ret; 1281 } 1282 1283 static const struct super_operations pipefs_ops = { 1284 .destroy_inode = free_inode_nonrcu, 1285 .statfs = simple_statfs, 1286 }; 1287 1288 /* 1289 * pipefs should _never_ be mounted by userland - too much of security hassle, 1290 * no real gain from having the whole whorehouse mounted. So we don't need 1291 * any operations on the root directory. However, we need a non-trivial 1292 * d_name - pipe: will go nicely and kill the special-casing in procfs. 
1293 */ 1294 static struct dentry *pipefs_mount(struct file_system_type *fs_type, 1295 int flags, const char *dev_name, void *data) 1296 { 1297 return mount_pseudo(fs_type, "pipe:", &pipefs_ops, 1298 &pipefs_dentry_operations, PIPEFS_MAGIC); 1299 } 1300 1301 static struct file_system_type pipe_fs_type = { 1302 .name = "pipefs", 1303 .mount = pipefs_mount, 1304 .kill_sb = kill_anon_super, 1305 }; 1306 1307 static int __init init_pipe_fs(void) 1308 { 1309 int err = register_filesystem(&pipe_fs_type); 1310 1311 if (!err) { 1312 pipe_mnt = kern_mount(&pipe_fs_type); 1313 if (IS_ERR(pipe_mnt)) { 1314 err = PTR_ERR(pipe_mnt); 1315 unregister_filesystem(&pipe_fs_type); 1316 } 1317 } 1318 return err; 1319 } 1320 1321 fs_initcall(init_pipe_fs); 1322