/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>

/*
 * The max size that a non-root user is allowed to grow the pipe. Can
 * be set by root in /proc/sys/fs/pipe-max-size
 */
unsigned int pipe_max_size = 1048576;

/*
 * Minimum pipe size, as required by POSIX
 */
unsigned int pipe_min_size = PAGE_SIZE;

/*
 * We use a start+len construction, which provides full use of the
 * allocated memory.
 * -- Florian Coosmann (FGC)
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
	if (pipe->inode)
		mutex_lock_nested(&pipe->inode->i_mutex, subclass);
}

void pipe_lock(struct pipe_inode_info *pipe)
{
	/*
	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
	 */
	pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);

void pipe_unlock(struct pipe_inode_info *pipe)
{
	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);
}
EXPORT_SYMBOL(pipe_unlock);

void pipe_double_lock(struct pipe_inode_info *pipe1,
		      struct pipe_inode_info *pipe2)
{
	BUG_ON(pipe1 == pipe2);

	if (pipe1 < pipe2) {
		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
	} else {
		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
	}
}

/* Drop the inode semaphore and wait for a pipe event, atomically */
void pipe_wait(struct pipe_inode_info *pipe)
{
	DEFINE_WAIT(wait);

	/*
	 * Pipes are system-local resources, so sleeping on them
	 * is considered a noninteractive wait:
	 */
	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
	pipe_unlock(pipe);
	schedule();
	finish_wait(&pipe->wait, &wait);
	pipe_lock(pipe);
}
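
/*
 * Reader's note (illustrative only): callers such as pipe_read() and
 * pipe_write() below hold the backing inode's i_mutex and use
 * pipe_wait() in the usual "check, sleep, recheck" shape, roughly:
 *
 *	for (;;) {
 *		if (<data or space is available>)
 *			break;
 *		pipe_wait(pipe);	-- drops and retakes the lock
 *	}
 *
 * Because pipe_wait() drops the lock while sleeping, the condition
 * must be re-evaluated after every wakeup.
 */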

static int
pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len,
			int atomic)
{
	unsigned long copy;

	while (len > 0) {
		while (!iov->iov_len)
			iov++;
		copy = min_t(unsigned long, len, iov->iov_len);

		if (atomic) {
			if (__copy_from_user_inatomic(to, iov->iov_base, copy))
				return -EFAULT;
		} else {
			if (copy_from_user(to, iov->iov_base, copy))
				return -EFAULT;
		}
		to += copy;
		len -= copy;
		iov->iov_base += copy;
		iov->iov_len -= copy;
	}
	return 0;
}

static int
pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len,
		      int atomic)
{
	unsigned long copy;

	while (len > 0) {
		while (!iov->iov_len)
			iov++;
		copy = min_t(unsigned long, len, iov->iov_len);

		if (atomic) {
			if (__copy_to_user_inatomic(iov->iov_base, from, copy))
				return -EFAULT;
		} else {
			if (copy_to_user(iov->iov_base, from, copy))
				return -EFAULT;
		}
		from += copy;
		len -= copy;
		iov->iov_base += copy;
		iov->iov_len -= copy;
	}
	return 0;
}

/*
 * Attempt to pre-fault in the user memory, so we can use atomic copies.
 * Returns the number of bytes not faulted in.
 */
static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len)
{
	while (!iov->iov_len)
		iov++;

	while (len > 0) {
		unsigned long this_len;

		this_len = min_t(unsigned long, len, iov->iov_len);
		if (fault_in_pages_writeable(iov->iov_base, this_len))
			break;

		len -= this_len;
		iov++;
	}

	return len;
}

/*
 * Pre-fault in the user memory, so we can use atomic copies.
 */
static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len)
{
	while (!iov->iov_len)
		iov++;

	while (len > 0) {
		unsigned long this_len;

		this_len = min_t(unsigned long, len, iov->iov_len);
		fault_in_pages_readable(iov->iov_base, this_len);
		len -= this_len;
		iov++;
	}
}
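
/*
 * Reader's note (illustrative only): the helpers above form a two-step
 * copy protocol used by pipe_read() and pipe_write() below.  Roughly:
 *
 *	atomic = !iov_fault_in_pages_write(iov, chars);
 * redo:
 *	addr = ops->map(pipe, buf, atomic);
 *	error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
 *	ops->unmap(pipe, buf, addr);
 *	if (error && atomic) {
 *		atomic = 0;		-- fall back to the sleeping copy
 *		goto redo;
 *	}
 *
 * The atomic copy variants must not fault, so the user pages are
 * pre-faulted first; if they get evicted again before the copy runs,
 * the non-atomic slow path (which may sleep) is used instead.
 */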

static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * If nobody else uses this page, and we don't already have a
	 * temporary page, let's keep track of it as a one-deep
	 * allocation cache. (Otherwise just release our reference to it)
	 */
	if (page_count(page) == 1 && !pipe->tmp_page)
		pipe->tmp_page = page;
	else
		page_cache_release(page);
}

/**
 * generic_pipe_buf_map - virtually map a pipe buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer that should be mapped
 * @atomic:	whether to use an atomic map
 *
 * Description:
 *	This function returns a kernel virtual address mapping for the
 *	pipe_buffer passed in @buf. If @atomic is set, an atomic map is provided
 *	and the caller has to be careful not to fault before calling
 *	the unmap function.
 *
 *	Note that this function occupies KM_USER0 if @atomic != 0.
 */
void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf, int atomic)
{
	if (atomic) {
		buf->flags |= PIPE_BUF_FLAG_ATOMIC;
		return kmap_atomic(buf->page, KM_USER0);
	}

	return kmap(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_map);

/**
 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer that should be unmapped
 * @map_data:	the data that the mapping function returned
 *
 * Description:
 *	This function undoes the mapping that ->map() provided.
 */
void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, void *map_data)
{
	if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
		buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
		kunmap_atomic(map_data, KM_USER0);
	} else
		kunmap(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_unmap);

/**
 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to attempt to steal
 *
 * Description:
 *	This function attempts to steal the &struct page attached to
 *	@buf. If successful, this function returns 0 and returns with
 *	the page locked. The caller may then reuse the page for whatever
 *	he wishes; the typical use is insertion into a different file's
 *	page cache.
 */
int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * A reference of one is golden, that means that the owner of this
	 * page is the only one holding a reference to it. lock the page
	 * and return OK.
	 */
	if (page_count(page) == 1) {
		lock_page(page);
		return 0;
	}

	return 1;
}
EXPORT_SYMBOL(generic_pipe_buf_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to get a reference to
 *
 * Description:
 *	This function grabs an extra reference to @buf. It's used in
 *	the tee() system call, when we duplicate the buffers in one
 *	pipe into another.
 */
void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	page_cache_get(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_confirm - verify contents of the pipe buffer
 * @info:	the pipe that the buffer belongs to
 * @buf:	the buffer to confirm
 *
 * Description:
 *	This function does nothing, because the generic pipe code uses
 *	pages that are always good when inserted into the pipe.
 */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
			     struct pipe_buffer *buf)
{
	return 0;
}
EXPORT_SYMBOL(generic_pipe_buf_confirm);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to put a reference to
 *
 * Description:
 *	This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.can_merge = 1,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = generic_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};
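
/*
 * Purely illustrative (no such user exists in this file): a pipe buffer
 * provider that cannot have data appended to its pages and otherwise
 * relies on the generic helpers above would be wired up roughly like:
 *
 *	static const struct pipe_buf_operations example_pipe_buf_ops = {
 *		.can_merge	= 0,
 *		.map		= generic_pipe_buf_map,
 *		.unmap		= generic_pipe_buf_unmap,
 *		.confirm	= generic_pipe_buf_confirm,
 *		.release	= generic_pipe_buf_release,
 *		.steal		= generic_pipe_buf_steal,
 *		.get		= generic_pipe_buf_get,
 *	};
 *
 * anon_pipe_buf_ops above differs only in allowing merged writes
 * (can_merge = 1) and in caching the page in its ->release().
 */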

static ssize_t
pipe_read(struct kiocb *iocb, const struct iovec *_iov,
	  unsigned long nr_segs, loff_t pos)
{
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	int do_wakeup;
	ssize_t ret;
	struct iovec *iov = (struct iovec *)_iov;
	size_t total_len;

	total_len = iov_length(iov, nr_segs);
	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;
	for (;;) {
		int bufs = pipe->nrbufs;
		if (bufs) {
			int curbuf = pipe->curbuf;
			struct pipe_buffer *buf = pipe->bufs + curbuf;
			const struct pipe_buf_operations *ops = buf->ops;
			void *addr;
			size_t chars = buf->len;
			int error, atomic;

			if (chars > total_len)
				chars = total_len;

			error = ops->confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			atomic = !iov_fault_in_pages_write(iov, chars);
redo:
			addr = ops->map(pipe, buf, atomic);
			error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
			ops->unmap(pipe, buf, addr);
			if (unlikely(error)) {
				/*
				 * Just retry with the slow path if we failed.
				 */
				if (atomic) {
					atomic = 0;
					goto redo;
				}
				if (!ret)
					ret = error;
				break;
			}
			ret += chars;
			buf->offset += chars;
			buf->len -= chars;
			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				curbuf = (curbuf + 1) & (pipe->buffers - 1);
				pipe->curbuf = curbuf;
				pipe->nrbufs = --bufs;
				do_wakeup = 1;
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
		}
		if (bufs)	/* More to do? */
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			/* syscall merging: Usually we must not sleep
			 * if O_NONBLOCK is set, or if we got some data.
			 * But if a writer sleeps in kernel space, then
			 * we can wait for that data without violating POSIX.
			 */
			if (ret)
				break;
			if (filp->f_flags & O_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
		}
		pipe_wait(pipe);
	}
	mutex_unlock(&inode->i_mutex);

	/* Signal writers asynchronously that there is more room. */
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	if (ret > 0)
		file_accessed(filp);
	return ret;
}
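
/*
 * A short worked example of the ring arithmetic used above and in
 * pipe_write() (added as a reader's note): pipe->buffers is always a
 * power of two, so the next slot can be computed with a mask instead
 * of a modulo.  With 16 buffers, curbuf = 15 gives
 *
 *	(15 + 1) & (16 - 1) == 0
 *
 * i.e. the index wraps back to the start of the pipe->bufs array.
 */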

static ssize_t
pipe_write(struct kiocb *iocb, const struct iovec *_iov,
	   unsigned long nr_segs, loff_t ppos)
{
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	ssize_t ret;
	int do_wakeup;
	struct iovec *iov = (struct iovec *)_iov;
	size_t total_len;
	ssize_t chars;

	total_len = iov_length(iov, nr_segs);
	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;

	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	/* We try to merge small writes */
	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
	if (pipe->nrbufs && chars != 0) {
		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
							(pipe->buffers - 1);
		struct pipe_buffer *buf = pipe->bufs + lastbuf;
		const struct pipe_buf_operations *ops = buf->ops;
		int offset = buf->offset + buf->len;

		if (ops->can_merge && offset + chars <= PAGE_SIZE) {
			int error, atomic = 1;
			void *addr;

			error = ops->confirm(pipe, buf);
			if (error)
				goto out;

			iov_fault_in_pages_read(iov, chars);
redo1:
			addr = ops->map(pipe, buf, atomic);
			error = pipe_iov_copy_from_user(offset + addr, iov,
							chars, atomic);
			ops->unmap(pipe, buf, addr);
			ret = error;
			do_wakeup = 1;
			if (error) {
				if (atomic) {
					atomic = 0;
					goto redo1;
				}
				goto out;
			}
			buf->len += chars;
			total_len -= chars;
			ret = chars;
			if (!total_len)
				goto out;
		}
	}

	for (;;) {
		int bufs;

		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}
		bufs = pipe->nrbufs;
		if (bufs < pipe->buffers) {
			int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;
			struct page *page = pipe->tmp_page;
			char *src;
			int error, atomic = 1;

			if (!page) {
				page = alloc_page(GFP_HIGHUSER);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}
			/* Always wake up, even if the copy fails. Otherwise
			 * we lock up (O_NONBLOCK-)readers that sleep due to
			 * syscall merging.
			 * FIXME! Is this really true?
			 */
			do_wakeup = 1;
			chars = PAGE_SIZE;
			if (chars > total_len)
				chars = total_len;

			iov_fault_in_pages_read(iov, chars);
redo2:
			if (atomic)
				src = kmap_atomic(page, KM_USER0);
			else
				src = kmap(page);

			error = pipe_iov_copy_from_user(src, iov, chars,
							atomic);
			if (atomic)
				kunmap_atomic(src, KM_USER0);
			else
				kunmap(page);

			if (unlikely(error)) {
				if (atomic) {
					atomic = 0;
					goto redo2;
				}
				if (!ret)
					ret = error;
				break;
			}
			ret += chars;

			/* Insert it into the buffer array */
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = chars;
			pipe->nrbufs = ++bufs;
			pipe->tmp_page = NULL;

			total_len -= chars;
			if (!total_len)
				break;
		}
		if (bufs < pipe->buffers)
			continue;
		if (filp->f_flags & O_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}
out:
	mutex_unlock(&inode->i_mutex);
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}
	if (ret > 0)
		file_update_time(filp);
	return ret;
}
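
/*
 * Summary of the return semantics implemented above (added as a
 * reader's note): writing to a pipe with no readers raises SIGPIPE and
 * fails with -EPIPE; with O_NONBLOCK set, -EAGAIN is returned only if
 * nothing at all could be queued; and once some bytes have been
 * queued, a pending signal or a reader that goes away ends the write
 * early with the short count rather than an error.
 */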

static ssize_t
bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
{
	return -EBADF;
}

static ssize_t
bad_pipe_w(struct file *filp, const char __user *buf, size_t count,
	   loff_t *ppos)
{
	return -EBADF;
}

static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	int count, buf, nrbufs;

	switch (cmd) {
	case FIONREAD:
		mutex_lock(&inode->i_mutex);
		pipe = inode->i_pipe;
		count = 0;
		buf = pipe->curbuf;
		nrbufs = pipe->nrbufs;
		while (--nrbufs >= 0) {
			count += pipe->bufs[buf].len;
			buf = (buf+1) & (pipe->buffers - 1);
		}
		mutex_unlock(&inode->i_mutex);

		return put_user(count, (int __user *)arg);
	default:
		return -EINVAL;
	}
}
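
/*
 * Illustrative userspace counterpart of the FIONREAD handling above
 * (not kernel code; assumes `fd` is a pipe end): query how many bytes
 * are currently queued without consuming them.
 *
 *	int queued;
 *	if (ioctl(fd, FIONREAD, &queued) == 0)
 *		printf("%d bytes buffered in the pipe\n", queued);
 */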

/* No kernel lock held - fine */
static unsigned int
pipe_poll(struct file *filp, poll_table *wait)
{
	unsigned int mask;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe = inode->i_pipe;
	int nrbufs;

	poll_wait(filp, &pipe->wait, wait);

	/* Reading only -- no need for acquiring the semaphore.  */
	nrbufs = pipe->nrbufs;
	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= POLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
		/*
		 * Most Unices do not set POLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= POLLERR;
	}

	return mask;
}

static int
pipe_release(struct inode *inode, int decr, int decw)
{
	struct pipe_inode_info *pipe;

	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;
	pipe->readers -= decr;
	pipe->writers -= decw;

	if (!pipe->readers && !pipe->writers) {
		free_pipe_info(inode);
	} else {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	mutex_unlock(&inode->i_mutex);

	return 0;
}

static int
pipe_read_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	int retval;

	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers);
	mutex_unlock(&inode->i_mutex);

	return retval;
}

static int
pipe_write_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	int retval;

	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers);
	mutex_unlock(&inode->i_mutex);

	return retval;
}

static int
pipe_rdwr_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe = inode->i_pipe;
	int retval;

	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
	if (retval >= 0) {
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
		if (retval < 0) /* this can happen only if on == T */
			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
	}
	mutex_unlock(&inode->i_mutex);
	return retval;
}

static int
pipe_read_release(struct inode *inode, struct file *filp)
{
	return pipe_release(inode, 1, 0);
}

static int
pipe_write_release(struct inode *inode, struct file *filp)
{
	return pipe_release(inode, 0, 1);
}

static int
pipe_rdwr_release(struct inode *inode, struct file *filp)
{
	int decr, decw;

	decr = (filp->f_mode & FMODE_READ) != 0;
	decw = (filp->f_mode & FMODE_WRITE) != 0;
	return pipe_release(inode, decr, decw);
}

static int
pipe_read_open(struct inode *inode, struct file *filp)
{
	int ret = -ENOENT;

	mutex_lock(&inode->i_mutex);

	if (inode->i_pipe) {
		ret = 0;
		inode->i_pipe->readers++;
	}

	mutex_unlock(&inode->i_mutex);

	return ret;
}

static int
pipe_write_open(struct inode *inode, struct file *filp)
{
	int ret = -ENOENT;

	mutex_lock(&inode->i_mutex);

	if (inode->i_pipe) {
		ret = 0;
		inode->i_pipe->writers++;
	}

	mutex_unlock(&inode->i_mutex);

	return ret;
}

static int
pipe_rdwr_open(struct inode *inode, struct file *filp)
{
	int ret = -ENOENT;

	mutex_lock(&inode->i_mutex);

	if (inode->i_pipe) {
		ret = 0;
		if (filp->f_mode & FMODE_READ)
			inode->i_pipe->readers++;
		if (filp->f_mode & FMODE_WRITE)
			inode->i_pipe->writers++;
	}

	mutex_unlock(&inode->i_mutex);

	return ret;
}
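
/*
 * Reader's note on the open/release pairing above: every successful
 * ->open bumps pipe->readers and/or pipe->writers, and the matching
 * ->release decrements them through pipe_release().  Only when both
 * counts drop to zero is the pipe_inode_info torn down via
 * free_pipe_info(); otherwise the remaining side is woken so it can
 * observe EOF (readers) or SIGPIPE/-EPIPE (writers).
 */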

/*
 * The file_operations structs are not static because they
 * are also used in linux/fs/fifo.c to do operations on FIFOs.
 *
 * Pipes reuse fifos' file_operations structs.
 */
const struct file_operations read_pipefifo_fops = {
	.llseek		= no_llseek,
	.read		= do_sync_read,
	.aio_read	= pipe_read,
	.write		= bad_pipe_w,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.open		= pipe_read_open,
	.release	= pipe_read_release,
	.fasync		= pipe_read_fasync,
};

const struct file_operations write_pipefifo_fops = {
	.llseek		= no_llseek,
	.read		= bad_pipe_r,
	.write		= do_sync_write,
	.aio_write	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.open		= pipe_write_open,
	.release	= pipe_write_release,
	.fasync		= pipe_write_fasync,
};

const struct file_operations rdwr_pipefifo_fops = {
	.llseek		= no_llseek,
	.read		= do_sync_read,
	.aio_read	= pipe_read,
	.write		= do_sync_write,
	.aio_write	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.open		= pipe_rdwr_open,
	.release	= pipe_rdwr_release,
	.fasync		= pipe_rdwr_fasync,
};

struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
{
	struct pipe_inode_info *pipe;

	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
	if (pipe) {
		pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
		if (pipe->bufs) {
			init_waitqueue_head(&pipe->wait);
			pipe->r_counter = pipe->w_counter = 1;
			pipe->inode = inode;
			pipe->buffers = PIPE_DEF_BUFFERS;
			return pipe;
		}
		kfree(pipe);
	}

	return NULL;
}

void __free_pipe_info(struct pipe_inode_info *pipe)
{
	int i;

	for (i = 0; i < pipe->buffers; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;
		if (buf->ops)
			buf->ops->release(pipe, buf);
	}
	if (pipe->tmp_page)
		__free_page(pipe->tmp_page);
	kfree(pipe->bufs);
	kfree(pipe);
}

void free_pipe_info(struct inode *inode)
{
	__free_pipe_info(inode->i_pipe);
	inode->i_pipe = NULL;
}

static struct vfsmount *pipe_mnt __read_mostly;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
				dentry->d_inode->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname	= pipefs_dname,
};

static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	inode->i_ino = get_next_ino();

	pipe = alloc_pipe_info(inode);
	if (!pipe)
		goto fail_iput;
	inode->i_pipe = pipe;

	pipe->readers = pipe->writers = 1;
	inode->i_fop = &rdwr_pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}
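
/*
 * Reader's note: readers and writers both start at 1 here because an
 * anonymous pipe always comes into existence with one read end and one
 * write end (created by create_write_pipe()/create_read_pipe() below).
 * The pipe_*_open() counting earlier in this file applies when an
 * existing pipe inode is opened again, e.g. a FIFO opened through the
 * filesystem.
 */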

struct file *create_write_pipe(int flags)
{
	int err;
	struct inode *inode;
	struct file *f;
	struct path path;
	struct qstr name = { .name = "" };

	err = -ENFILE;
	inode = get_pipe_inode();
	if (!inode)
		goto err;

	err = -ENOMEM;
	path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
	if (!path.dentry)
		goto err_inode;
	path.mnt = mntget(pipe_mnt);

	d_instantiate(path.dentry, inode);

	err = -ENFILE;
	f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops);
	if (!f)
		goto err_dentry;
	f->f_mapping = inode->i_mapping;

	f->f_flags = O_WRONLY | (flags & O_NONBLOCK);
	f->f_version = 0;

	return f;

 err_dentry:
	free_pipe_info(inode);
	path_put(&path);
	return ERR_PTR(err);

 err_inode:
	free_pipe_info(inode);
	iput(inode);
 err:
	return ERR_PTR(err);
}

void free_write_pipe(struct file *f)
{
	free_pipe_info(f->f_dentry->d_inode);
	path_put(&f->f_path);
	put_filp(f);
}

struct file *create_read_pipe(struct file *wrf, int flags)
{
	/* Grab pipe from the writer */
	struct file *f = alloc_file(&wrf->f_path, FMODE_READ,
				    &read_pipefifo_fops);
	if (!f)
		return ERR_PTR(-ENFILE);

	path_get(&wrf->f_path);
	f->f_flags = O_RDONLY | (flags & O_NONBLOCK);

	return f;
}

int do_pipe_flags(int *fd, int flags)
{
	struct file *fw, *fr;
	int error;
	int fdw, fdr;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK))
		return -EINVAL;

	fw = create_write_pipe(flags);
	if (IS_ERR(fw))
		return PTR_ERR(fw);
	fr = create_read_pipe(fw, flags);
	error = PTR_ERR(fr);
	if (IS_ERR(fr))
		goto err_write_pipe;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	fd_install(fdr, fr);
	fd_install(fdw, fw);
	fd[0] = fdr;
	fd[1] = fdw;

	return 0;

 err_fdr:
	put_unused_fd(fdr);
 err_read_pipe:
	path_put(&fr->f_path);
	put_filp(fr);
 err_write_pipe:
	free_write_pipe(fw);
	return error;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	int fd[2];
	int error;

	error = do_pipe_flags(fd, flags);
	if (!error) {
		if (copy_to_user(fildes, fd, sizeof(fd))) {
			sys_close(fd[0]);
			sys_close(fd[1]);
			error = -EFAULT;
		}
	}
	return error;
}

SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return sys_pipe2(fildes, 0);
}
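
/*
 * Illustrative userspace use of the two syscalls above (not kernel
 * code): pipe2() accepts the same O_CLOEXEC/O_NONBLOCK flags that
 * do_pipe_flags() validates.
 *
 *	int fd[2];
 *	char c;
 *	if (pipe2(fd, O_CLOEXEC | O_NONBLOCK) == 0) {
 *		write(fd[1], "x", 1);	-- fd[1] is the write end
 *		read(fd[0], &c, 1);	-- fd[0] is the read end
 *	}
 */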

/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or -ERROR on error.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
{
	struct pipe_buffer *bufs;

	/*
	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
	 * expect a lot of shrink+grow operations, just free and allocate
	 * again like we would do for growing. If the pipe currently
	 * contains more buffers than arg, then return busy.
	 */
	if (nr_pages < pipe->nrbufs)
		return -EBUSY;

	bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL);
	if (unlikely(!bufs))
		return -ENOMEM;

	/*
	 * The pipe array wraps around, so just start the new one at zero
	 * and adjust the indexes.
	 */
	if (pipe->nrbufs) {
		unsigned int tail;
		unsigned int head;

		tail = pipe->curbuf + pipe->nrbufs;
		if (tail < pipe->buffers)
			tail = 0;
		else
			tail &= (pipe->buffers - 1);

		head = pipe->nrbufs - tail;
		if (head)
			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
		if (tail)
			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
	}

	pipe->curbuf = 0;
	kfree(pipe->bufs);
	pipe->bufs = bufs;
	pipe->buffers = nr_pages;
	return nr_pages * PAGE_SIZE;
}

/*
 * Currently we rely on the pipe array holding a power-of-2 number
 * of pages.
 */
static inline unsigned int round_pipe_size(unsigned int size)
{
	unsigned long nr_pages;

	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
}
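
/*
 * Worked example (added note): on a 4K-page system a request for
 * 100000 bytes needs ceil(100000 / 4096) = 25 pages, which
 * roundup_pow_of_two() turns into 32, so the pipe ends up with
 * 32 * 4096 = 131072 bytes of capacity.
 */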

/*
 * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
 * will return an error.
 */
int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
		 size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
	if (ret < 0 || !write)
		return ret;

	pipe_max_size = round_pipe_size(pipe_max_size);
	return ret;
}

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
struct pipe_inode_info *get_pipe_info(struct file *file)
{
	struct inode *i = file->f_path.dentry->d_inode;

	return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL;
}

long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe;
	long ret;

	pipe = get_pipe_info(file);
	if (!pipe)
		return -EBADF;

	mutex_lock(&pipe->inode->i_mutex);

	switch (cmd) {
	case F_SETPIPE_SZ: {
		unsigned int size, nr_pages;

		size = round_pipe_size(arg);
		nr_pages = size >> PAGE_SHIFT;

		ret = -EINVAL;
		if (!nr_pages)
			goto out;

		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
			ret = -EPERM;
			goto out;
		}
		ret = pipe_set_size(pipe, nr_pages);
		break;
	}
	case F_GETPIPE_SZ:
		ret = pipe->buffers * PAGE_SIZE;
		break;
	default:
		ret = -EINVAL;
		break;
	}

out:
	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

static const struct super_operations pipefs_ops = {
	.destroy_inode = free_inode_nonrcu,
};

/*
 * pipefs should _never_ be mounted by userland - too much of security hassle,
 * no real gain from having the whole whorehouse mounted. So we don't need
 * any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */
static struct dentry *pipefs_mount(struct file_system_type *fs_type,
			 int flags, const char *dev_name, void *data)
{
	return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
			&pipefs_dentry_operations, PIPEFS_MAGIC);
}

static struct file_system_type pipe_fs_type = {
	.name		= "pipefs",
	.mount		= pipefs_mount,
	.kill_sb	= kill_anon_super,
};

static int __init init_pipe_fs(void)
{
	int err = register_filesystem(&pipe_fs_type);

	if (!err) {
		pipe_mnt = kern_mount(&pipe_fs_type);
		if (IS_ERR(pipe_mnt)) {
			err = PTR_ERR(pipe_mnt);
			unregister_filesystem(&pipe_fs_type);
		}
	}
	return err;
}

static void __exit exit_pipe_fs(void)
{
	kern_unmount(pipe_mnt);
	unregister_filesystem(&pipe_fs_type);
}

fs_initcall(init_pipe_fs);
module_exit(exit_pipe_fs);
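
/*
 * Illustrative userspace use of the pipe_fcntl() commands above (not
 * kernel code): F_SETPIPE_SZ takes a size in bytes, which is rounded
 * up to a power-of-two number of pages, and F_GETPIPE_SZ reports the
 * resulting capacity in bytes.
 *
 *	if (fcntl(fd, F_SETPIPE_SZ, 1 << 20) < 0)
 *		perror("F_SETPIPE_SZ");
 *	printf("pipe capacity: %ld bytes\n", (long)fcntl(fd, F_GETPIPE_SZ));
 */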