/*
 * linux/fs/pipe.c
 *
 * Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>

/*
 * The max size that a non-root user is allowed to grow the pipe. Can
 * be set by root in /proc/sys/fs/pipe-max-size
 */
unsigned int pipe_max_size = 1048576;

/*
 * Minimum pipe size, as required by POSIX
 */
unsigned int pipe_min_size = PAGE_SIZE;

/*
 * We use a start+len construction, which provides full use of the
 * allocated memory.
 * -- Florian Coosmann (FGC)
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
	if (pipe->inode)
		mutex_lock_nested(&pipe->inode->i_mutex, subclass);
}

void pipe_lock(struct pipe_inode_info *pipe)
{
	/*
	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
	 */
	pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);

void pipe_unlock(struct pipe_inode_info *pipe)
{
	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);
}
EXPORT_SYMBOL(pipe_unlock);

void pipe_double_lock(struct pipe_inode_info *pipe1,
		      struct pipe_inode_info *pipe2)
{
	BUG_ON(pipe1 == pipe2);

	if (pipe1 < pipe2) {
		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
	} else {
		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
	}
}

/* Drop the inode semaphore and wait for a pipe event, atomically */
void pipe_wait(struct pipe_inode_info *pipe)
{
	DEFINE_WAIT(wait);

	/*
	 * Pipes are system-local resources, so sleeping on them
	 * is considered a noninteractive wait:
	 */
	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
	pipe_unlock(pipe);
	schedule();
	finish_wait(&pipe->wait, &wait);
	pipe_lock(pipe);
}
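/*
 * Typical calling pattern for pipe_wait() (illustrative sketch only;
 * resource_available() is a stand-in for whatever condition the caller
 * re-checks, not a real helper). The lock is dropped while sleeping, so
 * the condition must be re-evaluated after every wakeup, exactly as the
 * read/write loops below do:
 *
 *	pipe_lock(pipe);
 *	for (;;) {
 *		if (resource_available(pipe))
 *			break;
 *		pipe_wait(pipe);	// drops and re-takes the pipe lock
 *	}
 *	pipe_unlock(pipe);
 */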
static int
pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len,
			int atomic)
{
	unsigned long copy;

	while (len > 0) {
		while (!iov->iov_len)
			iov++;
		copy = min_t(unsigned long, len, iov->iov_len);

		if (atomic) {
			if (__copy_from_user_inatomic(to, iov->iov_base, copy))
				return -EFAULT;
		} else {
			if (copy_from_user(to, iov->iov_base, copy))
				return -EFAULT;
		}
		to += copy;
		len -= copy;
		iov->iov_base += copy;
		iov->iov_len -= copy;
	}
	return 0;
}

static int
pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len,
		      int atomic)
{
	unsigned long copy;

	while (len > 0) {
		while (!iov->iov_len)
			iov++;
		copy = min_t(unsigned long, len, iov->iov_len);

		if (atomic) {
			if (__copy_to_user_inatomic(iov->iov_base, from, copy))
				return -EFAULT;
		} else {
			if (copy_to_user(iov->iov_base, from, copy))
				return -EFAULT;
		}
		from += copy;
		len -= copy;
		iov->iov_base += copy;
		iov->iov_len -= copy;
	}
	return 0;
}

/*
 * Attempt to pre-fault in the user memory, so we can use atomic copies.
 * Returns the number of bytes not faulted in.
 */
static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len)
{
	while (!iov->iov_len)
		iov++;

	while (len > 0) {
		unsigned long this_len;

		this_len = min_t(unsigned long, len, iov->iov_len);
		if (fault_in_pages_writeable(iov->iov_base, this_len))
			break;

		len -= this_len;
		iov++;
	}

	return len;
}

/*
 * Pre-fault in the user memory, so we can use atomic copies.
 */
static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len)
{
	while (!iov->iov_len)
		iov++;

	while (len > 0) {
		unsigned long this_len;

		this_len = min_t(unsigned long, len, iov->iov_len);
		fault_in_pages_readable(iov->iov_base, this_len);
		len -= this_len;
		iov++;
	}
}

static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * If nobody else uses this page, and we don't already have a
	 * temporary page, let's keep track of it as a one-deep
	 * allocation cache. (Otherwise just release our reference to it)
	 */
	if (page_count(page) == 1 && !pipe->tmp_page)
		pipe->tmp_page = page;
	else
		page_cache_release(page);
}

/**
 * generic_pipe_buf_map - virtually map a pipe buffer
 * @pipe: the pipe that the buffer belongs to
 * @buf: the buffer that should be mapped
 * @atomic: whether to use an atomic map
 *
 * Description:
 *	This function returns a kernel virtual address mapping for the
 *	pipe_buffer passed in @buf. If @atomic is set, an atomic map is
 *	provided and the caller has to be careful not to fault before calling
 *	the unmap function.
 *
 *	Note that this function occupies KM_USER0 if @atomic != 0.
 */
void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf, int atomic)
{
	if (atomic) {
		buf->flags |= PIPE_BUF_FLAG_ATOMIC;
		return kmap_atomic(buf->page);
	}

	return kmap(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_map);

/**
 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
 * @pipe: the pipe that the buffer belongs to
 * @buf: the buffer that should be unmapped
 * @map_data: the data that the mapping function returned
 *
 * Description:
 *	This function undoes the mapping that ->map() provided.
 */
void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, void *map_data)
{
	if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
		buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
		kunmap_atomic(map_data);
	} else
		kunmap(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_unmap);
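/*
 * Sketch of the typical consumer pattern for these helpers (illustrative
 * only; pipe_read() below is the in-tree example): confirm the buffer,
 * map it, copy at buf->offset, then unmap, falling back to a non-atomic
 * mapping if the atomic copy faults.
 *
 *	err = buf->ops->confirm(pipe, buf);
 *	addr = buf->ops->map(pipe, buf, atomic);
 *	err = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
 *	buf->ops->unmap(pipe, buf, addr);
 */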
/**
 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
 * @pipe: the pipe that the buffer belongs to
 * @buf: the buffer to attempt to steal
 *
 * Description:
 *	This function attempts to steal the &struct page attached to
 *	@buf. If successful, this function returns 0 and returns with
 *	the page locked. The caller may then reuse the page for whatever
 *	he wishes; the typical use is insertion into a different file
 *	page cache.
 */
int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * A reference of one is golden, that means that the owner of this
	 * page is the only one holding a reference to it. lock the page
	 * and return OK.
	 */
	if (page_count(page) == 1) {
		lock_page(page);
		return 0;
	}

	return 1;
}
EXPORT_SYMBOL(generic_pipe_buf_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe: the pipe that the buffer belongs to
 * @buf: the buffer to get a reference to
 *
 * Description:
 *	This function grabs an extra reference to @buf. It's used in
 *	the tee() system call, when we duplicate the buffers in one
 *	pipe into another.
 */
void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	page_cache_get(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_confirm - verify contents of the pipe buffer
 * @info: the pipe that the buffer belongs to
 * @buf: the buffer to confirm
 *
 * Description:
 *	This function does nothing, because the generic pipe code uses
 *	pages that are always good when inserted into the pipe.
 */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
			     struct pipe_buffer *buf)
{
	return 0;
}
EXPORT_SYMBOL(generic_pipe_buf_confirm);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe: the pipe that the buffer belongs to
 * @buf: the buffer to put a reference to
 *
 * Description:
 *	This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.can_merge	= 1,
	.map		= generic_pipe_buf_map,
	.unmap		= generic_pipe_buf_unmap,
	.confirm	= generic_pipe_buf_confirm,
	.release	= anon_pipe_buf_release,
	.steal		= generic_pipe_buf_steal,
	.get		= generic_pipe_buf_get,
};

static const struct pipe_buf_operations packet_pipe_buf_ops = {
	.can_merge	= 0,
	.map		= generic_pipe_buf_map,
	.unmap		= generic_pipe_buf_unmap,
	.confirm	= generic_pipe_buf_confirm,
	.release	= anon_pipe_buf_release,
	.steal		= generic_pipe_buf_steal,
	.get		= generic_pipe_buf_get,
};
static ssize_t
pipe_read(struct kiocb *iocb, const struct iovec *_iov,
	  unsigned long nr_segs, loff_t pos)
{
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	int do_wakeup;
	ssize_t ret;
	struct iovec *iov = (struct iovec *)_iov;
	size_t total_len;

	total_len = iov_length(iov, nr_segs);
	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;
	for (;;) {
		int bufs = pipe->nrbufs;
		if (bufs) {
			int curbuf = pipe->curbuf;
			struct pipe_buffer *buf = pipe->bufs + curbuf;
			const struct pipe_buf_operations *ops = buf->ops;
			void *addr;
			size_t chars = buf->len;
			int error, atomic;

			if (chars > total_len)
				chars = total_len;

			error = ops->confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			atomic = !iov_fault_in_pages_write(iov, chars);
redo:
			addr = ops->map(pipe, buf, atomic);
			error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
			ops->unmap(pipe, buf, addr);
			if (unlikely(error)) {
				/*
				 * Just retry with the slow path if we failed.
				 */
				if (atomic) {
					atomic = 0;
					goto redo;
				}
				if (!ret)
					ret = error;
				break;
			}
			ret += chars;
			buf->offset += chars;
			buf->len -= chars;

			/* Was it a packet buffer? Clean up and exit */
			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
				total_len = chars;
				buf->len = 0;
			}

			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				curbuf = (curbuf + 1) & (pipe->buffers - 1);
				pipe->curbuf = curbuf;
				pipe->nrbufs = --bufs;
				do_wakeup = 1;
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
		}
		if (bufs)	/* More to do? */
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			/* syscall merging: Usually we must not sleep
			 * if O_NONBLOCK is set, or if we got some data.
			 * But if a writer sleeps in kernel space, then
			 * we can wait for that data without violating POSIX.
			 */
			if (ret)
				break;
			if (filp->f_flags & O_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
		}
		pipe_wait(pipe);
	}
	mutex_unlock(&inode->i_mutex);

	/* Signal writers asynchronously that there is more room. */
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	if (ret > 0)
		file_accessed(filp);
	return ret;
}

static inline int is_packetized(struct file *file)
{
	return (file->f_flags & O_DIRECT) != 0;
}
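/*
 * Userspace view of the packet mode selected by is_packetized() (rough
 * sketch, error handling omitted): opening a pipe with O_DIRECT makes each
 * write() produce one packet buffer, and a read() consumes at most one
 * packet, discarding any unread remainder of it (see the PIPE_BUF_FLAG_PACKET
 * handling in pipe_read() above).
 *
 *	int fd[2];
 *	char buf[8];
 *
 *	pipe2(fd, O_DIRECT);
 *	write(fd[1], "abcdef", 6);	// one 6-byte packet
 *	read(fd[0], buf, 4);		// returns 4; the trailing "ef" is dropped
 */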
static ssize_t
pipe_write(struct kiocb *iocb, const struct iovec *_iov,
	   unsigned long nr_segs, loff_t ppos)
{
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	ssize_t ret;
	int do_wakeup;
	struct iovec *iov = (struct iovec *)_iov;
	size_t total_len;
	ssize_t chars;

	total_len = iov_length(iov, nr_segs);
	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;

	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	/* We try to merge small writes */
	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
	if (pipe->nrbufs && chars != 0) {
		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
							(pipe->buffers - 1);
		struct pipe_buffer *buf = pipe->bufs + lastbuf;
		const struct pipe_buf_operations *ops = buf->ops;
		int offset = buf->offset + buf->len;

		if (ops->can_merge && offset + chars <= PAGE_SIZE) {
			int error, atomic = 1;
			void *addr;

			error = ops->confirm(pipe, buf);
			if (error)
				goto out;

			iov_fault_in_pages_read(iov, chars);
redo1:
			addr = ops->map(pipe, buf, atomic);
			error = pipe_iov_copy_from_user(offset + addr, iov,
							chars, atomic);
			ops->unmap(pipe, buf, addr);
			ret = error;
			do_wakeup = 1;
			if (error) {
				if (atomic) {
					atomic = 0;
					goto redo1;
				}
				goto out;
			}
			buf->len += chars;
			total_len -= chars;
			ret = chars;
			if (!total_len)
				goto out;
		}
	}

	for (;;) {
		int bufs;

		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}
		bufs = pipe->nrbufs;
		if (bufs < pipe->buffers) {
			int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;
			struct page *page = pipe->tmp_page;
			char *src;
			int error, atomic = 1;

			if (!page) {
				page = alloc_page(GFP_HIGHUSER);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}
			/* Always wake up, even if the copy fails. Otherwise
			 * we lock up (O_NONBLOCK-)readers that sleep due to
			 * syscall merging.
			 * FIXME! Is this really true?
			 */
			do_wakeup = 1;
			chars = PAGE_SIZE;
			if (chars > total_len)
				chars = total_len;

			iov_fault_in_pages_read(iov, chars);
redo2:
			if (atomic)
				src = kmap_atomic(page);
			else
				src = kmap(page);

			error = pipe_iov_copy_from_user(src, iov, chars,
							atomic);
			if (atomic)
				kunmap_atomic(src);
			else
				kunmap(page);

			if (unlikely(error)) {
				if (atomic) {
					atomic = 0;
					goto redo2;
				}
				if (!ret)
					ret = error;
				break;
			}
			ret += chars;

			/* Insert it into the buffer array */
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = chars;
			buf->flags = 0;
			if (is_packetized(filp)) {
				buf->ops = &packet_pipe_buf_ops;
				buf->flags = PIPE_BUF_FLAG_PACKET;
			}
			pipe->nrbufs = ++bufs;
			pipe->tmp_page = NULL;

			total_len -= chars;
			if (!total_len)
				break;
		}
		if (bufs < pipe->buffers)
			continue;
		if (filp->f_flags & O_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}
out:
	mutex_unlock(&inode->i_mutex);
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}
	if (ret > 0) {
		int err = file_update_time(filp);
		if (err)
			ret = err;
	}
	return ret;
}

static ssize_t
bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
{
	return -EBADF;
}

static ssize_t
bad_pipe_w(struct file *filp, const char __user *buf, size_t count,
	   loff_t *ppos)
{
	return -EBADF;
}

static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	int count, buf, nrbufs;

	switch (cmd) {
	case FIONREAD:
		mutex_lock(&inode->i_mutex);
		pipe = inode->i_pipe;
		count = 0;
		buf = pipe->curbuf;
		nrbufs = pipe->nrbufs;
		while (--nrbufs >= 0) {
			count += pipe->bufs[buf].len;
			buf = (buf+1) & (pipe->buffers - 1);
		}
		mutex_unlock(&inode->i_mutex);

		return put_user(count, (int __user *)arg);
	default:
		return -ENOIOCTLCMD;
	}
}
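/*
 * From userspace, FIONREAD reports how many unread bytes are currently
 * queued in the pipe (rough sketch, error handling omitted):
 *
 *	int avail;
 *
 *	ioctl(fd[0], FIONREAD, &avail);	// avail == bytes waiting to be read
 */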
/* No kernel lock held - fine */
static unsigned int
pipe_poll(struct file *filp, poll_table *wait)
{
	unsigned int mask;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe = inode->i_pipe;
	int nrbufs;

	poll_wait(filp, &pipe->wait, wait);

	/* Reading only -- no need for acquiring the semaphore. */
	nrbufs = pipe->nrbufs;
	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= POLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
		/*
		 * Most Unices do not set POLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= POLLERR;
	}

	return mask;
}

static int
pipe_release(struct inode *inode, int decr, int decw)
{
	struct pipe_inode_info *pipe;

	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;
	pipe->readers -= decr;
	pipe->writers -= decw;

	if (!pipe->readers && !pipe->writers) {
		free_pipe_info(inode);
	} else {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	mutex_unlock(&inode->i_mutex);

	return 0;
}

static int
pipe_read_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	int retval;

	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers);
	mutex_unlock(&inode->i_mutex);

	return retval;
}


static int
pipe_write_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	int retval;

	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers);
	mutex_unlock(&inode->i_mutex);

	return retval;
}


static int
pipe_rdwr_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe = inode->i_pipe;
	int retval;

	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
	if (retval >= 0) {
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
		if (retval < 0) /* this can happen only if on == T */
			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
	}
	mutex_unlock(&inode->i_mutex);
	return retval;
}
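/*
 * The fasync hooks above are what deliver SIGIO when userspace requests
 * asynchronous notification on a pipe end (rough sketch, error handling
 * omitted):
 *
 *	fcntl(fd[0], F_SETOWN, getpid());			// signal recipient
 *	fcntl(fd[0], F_SETFL, fcntl(fd[0], F_GETFL) | O_ASYNC);	// enable SIGIO
 */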
static int
pipe_read_release(struct inode *inode, struct file *filp)
{
	return pipe_release(inode, 1, 0);
}

static int
pipe_write_release(struct inode *inode, struct file *filp)
{
	return pipe_release(inode, 0, 1);
}

static int
pipe_rdwr_release(struct inode *inode, struct file *filp)
{
	int decr, decw;

	decr = (filp->f_mode & FMODE_READ) != 0;
	decw = (filp->f_mode & FMODE_WRITE) != 0;
	return pipe_release(inode, decr, decw);
}

static int
pipe_read_open(struct inode *inode, struct file *filp)
{
	int ret = -ENOENT;

	mutex_lock(&inode->i_mutex);

	if (inode->i_pipe) {
		ret = 0;
		inode->i_pipe->readers++;
	}

	mutex_unlock(&inode->i_mutex);

	return ret;
}

static int
pipe_write_open(struct inode *inode, struct file *filp)
{
	int ret = -ENOENT;

	mutex_lock(&inode->i_mutex);

	if (inode->i_pipe) {
		ret = 0;
		inode->i_pipe->writers++;
	}

	mutex_unlock(&inode->i_mutex);

	return ret;
}

static int
pipe_rdwr_open(struct inode *inode, struct file *filp)
{
	int ret = -ENOENT;

	mutex_lock(&inode->i_mutex);

	if (inode->i_pipe) {
		ret = 0;
		if (filp->f_mode & FMODE_READ)
			inode->i_pipe->readers++;
		if (filp->f_mode & FMODE_WRITE)
			inode->i_pipe->writers++;
	}

	mutex_unlock(&inode->i_mutex);

	return ret;
}

/*
 * The file_operations structs are not static because they
 * are also used in linux/fs/fifo.c to do operations on FIFOs.
 *
 * Pipes reuse fifos' file_operations structs.
 */
const struct file_operations read_pipefifo_fops = {
	.llseek		= no_llseek,
	.read		= do_sync_read,
	.aio_read	= pipe_read,
	.write		= bad_pipe_w,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.open		= pipe_read_open,
	.release	= pipe_read_release,
	.fasync		= pipe_read_fasync,
};

const struct file_operations write_pipefifo_fops = {
	.llseek		= no_llseek,
	.read		= bad_pipe_r,
	.write		= do_sync_write,
	.aio_write	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.open		= pipe_write_open,
	.release	= pipe_write_release,
	.fasync		= pipe_write_fasync,
};

const struct file_operations rdwr_pipefifo_fops = {
	.llseek		= no_llseek,
	.read		= do_sync_read,
	.aio_read	= pipe_read,
	.write		= do_sync_write,
	.aio_write	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.open		= pipe_rdwr_open,
	.release	= pipe_rdwr_release,
	.fasync		= pipe_rdwr_fasync,
};

struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
{
	struct pipe_inode_info *pipe;

	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
	if (pipe) {
		pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
		if (pipe->bufs) {
			init_waitqueue_head(&pipe->wait);
			pipe->r_counter = pipe->w_counter = 1;
			pipe->inode = inode;
			pipe->buffers = PIPE_DEF_BUFFERS;
			return pipe;
		}
		kfree(pipe);
	}

	return NULL;
}

void __free_pipe_info(struct pipe_inode_info *pipe)
{
	int i;

	for (i = 0; i < pipe->buffers; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;
		if (buf->ops)
			buf->ops->release(pipe, buf);
	}
	if (pipe->tmp_page)
		__free_page(pipe->tmp_page);
	kfree(pipe->bufs);
	kfree(pipe);
}

void free_pipe_info(struct inode *inode)
{
	__free_pipe_info(inode->i_pipe);
	inode->i_pipe = NULL;
}

static struct vfsmount *pipe_mnt __read_mostly;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
				dentry->d_inode->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname	= pipefs_dname,
};
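/*
 * This is the name userspace sees for an anonymous pipe, for example when
 * resolving a pipe file descriptor through /proc (illustrative; the
 * descriptor and inode number will differ):
 *
 *	$ readlink /proc/self/fd/3
 *	pipe:[18321]
 */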
static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	inode->i_ino = get_next_ino();

	pipe = alloc_pipe_info(inode);
	if (!pipe)
		goto fail_iput;
	inode->i_pipe = pipe;

	pipe->readers = pipe->writers = 1;
	inode->i_fop = &rdwr_pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}

struct file *create_write_pipe(int flags)
{
	int err;
	struct inode *inode;
	struct file *f;
	struct path path;
	struct qstr name = { .name = "" };

	err = -ENFILE;
	inode = get_pipe_inode();
	if (!inode)
		goto err;

	err = -ENOMEM;
	path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
	if (!path.dentry)
		goto err_inode;
	path.mnt = mntget(pipe_mnt);

	d_instantiate(path.dentry, inode);

	err = -ENFILE;
	f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops);
	if (!f)
		goto err_dentry;
	f->f_mapping = inode->i_mapping;

	f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
	f->f_version = 0;

	return f;

err_dentry:
	free_pipe_info(inode);
	path_put(&path);
	return ERR_PTR(err);

err_inode:
	free_pipe_info(inode);
	iput(inode);
err:
	return ERR_PTR(err);
}

void free_write_pipe(struct file *f)
{
	free_pipe_info(f->f_dentry->d_inode);
	path_put(&f->f_path);
	put_filp(f);
}

struct file *create_read_pipe(struct file *wrf, int flags)
{
	/* Grab pipe from the writer */
	struct file *f = alloc_file(&wrf->f_path, FMODE_READ,
				    &read_pipefifo_fops);
	if (!f)
		return ERR_PTR(-ENFILE);

	path_get(&wrf->f_path);
	f->f_flags = O_RDONLY | (flags & O_NONBLOCK);

	return f;
}

int do_pipe_flags(int *fd, int flags)
{
	struct file *fw, *fr;
	int error;
	int fdw, fdr;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
		return -EINVAL;

	fw = create_write_pipe(flags);
	if (IS_ERR(fw))
		return PTR_ERR(fw);
	fr = create_read_pipe(fw, flags);
	error = PTR_ERR(fr);
	if (IS_ERR(fr))
		goto err_write_pipe;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	fd_install(fdr, fr);
	fd_install(fdw, fw);
	fd[0] = fdr;
	fd[1] = fdw;

	return 0;

err_fdr:
	put_unused_fd(fdr);
err_read_pipe:
	path_put(&fr->f_path);
	put_filp(fr);
err_write_pipe:
	free_write_pipe(fw);
	return error;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	int fd[2];
	int error;

	error = do_pipe_flags(fd, flags);
	if (!error) {
		if (copy_to_user(fildes, fd, sizeof(fd))) {
			sys_close(fd[0]);
			sys_close(fd[1]);
			error = -EFAULT;
		}
	}
	return error;
}

SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return sys_pipe2(fildes, 0);
}
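/*
 * Userspace counterpart of the two syscalls above (rough sketch, error
 * handling omitted): fd[0] is the read end and fd[1] is the write end,
 * exactly as filled in by do_pipe_flags().
 *
 *	int fd[2];
 *	char buf[6];
 *
 *	pipe2(fd, O_CLOEXEC);
 *	write(fd[1], "hello", 6);
 *	read(fd[0], buf, sizeof(buf));	// buf now holds "hello"
 */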
/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or -ERROR on error.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
{
	struct pipe_buffer *bufs;

	/*
	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
	 * expect a lot of shrink+grow operations, just free and allocate
	 * again like we would do for growing. If the pipe currently
	 * contains more buffers than arg, then return busy.
	 */
	if (nr_pages < pipe->nrbufs)
		return -EBUSY;

	bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN);
	if (unlikely(!bufs))
		return -ENOMEM;

	/*
	 * The pipe array wraps around, so just start the new one at zero
	 * and adjust the indexes.
	 */
	if (pipe->nrbufs) {
		unsigned int tail;
		unsigned int head;

		tail = pipe->curbuf + pipe->nrbufs;
		if (tail < pipe->buffers)
			tail = 0;
		else
			tail &= (pipe->buffers - 1);

		head = pipe->nrbufs - tail;
		if (head)
			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
		if (tail)
			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
	}

	pipe->curbuf = 0;
	kfree(pipe->bufs);
	pipe->bufs = bufs;
	pipe->buffers = nr_pages;
	return nr_pages * PAGE_SIZE;
}

/*
 * Currently we rely on the pipe array holding a power-of-2 number
 * of pages.
 */
static inline unsigned int round_pipe_size(unsigned int size)
{
	unsigned long nr_pages;

	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
}
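/*
 * Worked example, assuming 4 KiB pages: a request for 100000 bytes needs
 * 25 pages, which rounds up to the next power of two, 32 pages, so the
 * resulting pipe capacity is 32 * 4096 = 131072 bytes.
 */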
/*
 * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
 * will return an error.
 */
int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
		 size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
	if (ret < 0 || !write)
		return ret;

	pipe_max_size = round_pipe_size(pipe_max_size);
	return ret;
}

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
struct pipe_inode_info *get_pipe_info(struct file *file)
{
	struct inode *i = file->f_path.dentry->d_inode;

	return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL;
}

long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe;
	long ret;

	pipe = get_pipe_info(file);
	if (!pipe)
		return -EBADF;

	mutex_lock(&pipe->inode->i_mutex);

	switch (cmd) {
	case F_SETPIPE_SZ: {
		unsigned int size, nr_pages;

		size = round_pipe_size(arg);
		nr_pages = size >> PAGE_SHIFT;

		ret = -EINVAL;
		if (!nr_pages)
			goto out;

		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
			ret = -EPERM;
			goto out;
		}
		ret = pipe_set_size(pipe, nr_pages);
		break;
	}
	case F_GETPIPE_SZ:
		ret = pipe->buffers * PAGE_SIZE;
		break;
	default:
		ret = -EINVAL;
		break;
	}

out:
	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

static const struct super_operations pipefs_ops = {
	.destroy_inode = free_inode_nonrcu,
	.statfs = simple_statfs,
};

/*
 * pipefs should _never_ be mounted by userland - too much of a security
 * hassle, no real gain from having the whole whorehouse mounted. So we don't
 * need any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */
static struct dentry *pipefs_mount(struct file_system_type *fs_type,
			 int flags, const char *dev_name, void *data)
{
	return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
			&pipefs_dentry_operations, PIPEFS_MAGIC);
}

static struct file_system_type pipe_fs_type = {
	.name		= "pipefs",
	.mount		= pipefs_mount,
	.kill_sb	= kill_anon_super,
};

static int __init init_pipe_fs(void)
{
	int err = register_filesystem(&pipe_fs_type);

	if (!err) {
		pipe_mnt = kern_mount(&pipe_fs_type);
		if (IS_ERR(pipe_mnt)) {
			err = PTR_ERR(pipe_mnt);
			unregister_filesystem(&pipe_fs_type);
		}
	}
	return err;
}

fs_initcall(init_pipe_fs);
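/*
 * Userspace view of pipe_fcntl() above (rough sketch, error handling
 * omitted): F_SETPIPE_SZ asks for a capacity in bytes, which is rounded up
 * to a power-of-two number of pages, and F_GETPIPE_SZ reports the resulting
 * capacity.
 *
 *	fcntl(fd[1], F_SETPIPE_SZ, 1048576);
 *	long size = fcntl(fd[1], F_GETPIPE_SZ);	// 1048576 on success
 */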