/*
 * linux/fs/pipe.c
 *
 * Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>

/*
 * The max size that a non-root user is allowed to grow the pipe. Can
 * be set by root in /proc/sys/fs/pipe-max-size
 */
unsigned int pipe_max_size = 1048576;

/*
 * Minimum pipe size, as required by POSIX
 */
unsigned int pipe_min_size = PAGE_SIZE;

/*
 * We use a start+len construction, which provides full use of the
 * allocated memory.
 * -- Florian Coosmann (FGC)
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
	if (pipe->inode)
		mutex_lock_nested(&pipe->inode->i_mutex, subclass);
}

void pipe_lock(struct pipe_inode_info *pipe)
{
	/*
	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
	 */
	pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);

void pipe_unlock(struct pipe_inode_info *pipe)
{
	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);
}
EXPORT_SYMBOL(pipe_unlock);

void pipe_double_lock(struct pipe_inode_info *pipe1,
		      struct pipe_inode_info *pipe2)
{
	BUG_ON(pipe1 == pipe2);

	if (pipe1 < pipe2) {
		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
	} else {
		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
	}
}

/* Drop the inode semaphore and wait for a pipe event, atomically */
void pipe_wait(struct pipe_inode_info *pipe)
{
	DEFINE_WAIT(wait);

	/*
	 * Pipes are system-local resources, so sleeping on them
	 * is considered a noninteractive wait:
	 */
	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
	pipe_unlock(pipe);
	schedule();
	finish_wait(&pipe->wait, &wait);
	pipe_lock(pipe);
}

static int
pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len,
			int atomic)
{
	unsigned long copy;

	while (len > 0) {
		while (!iov->iov_len)
			iov++;
		copy = min_t(unsigned long, len, iov->iov_len);

		if (atomic) {
			if (__copy_from_user_inatomic(to, iov->iov_base, copy))
				return -EFAULT;
		} else {
			if (copy_from_user(to, iov->iov_base, copy))
				return -EFAULT;
		}
		to += copy;
		len -= copy;
		iov->iov_base += copy;
		iov->iov_len -= copy;
	}
	return 0;
}

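/*
 * Added commentary (not part of the original source): pipe_read() and
 * pipe_write() below use these iovec copy helpers in a two-step pattern.
 * They first pre-fault the user pages, then attempt the copy under an
 * atomic kmap, and fall back to a sleeping copy only if the atomic copy
 * faults.  A minimal sketch of that calling convention, lifted from
 * pipe_read():
 *
 *	atomic = !iov_fault_in_pages_write(iov, chars);
 * redo:
 *	addr  = ops->map(pipe, buf, atomic);
 *	error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
 *	ops->unmap(pipe, buf, addr);
 *	if (error && atomic) {
 *		atomic = 0;	// retry with the slow, sleeping path
 *		goto redo;
 *	}
 *
 * Note that both helpers advance iov_base/iov_len in place, so the same
 * iovec array can be passed again on the fallback pass.
 */
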
static int
pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len,
		      int atomic)
{
	unsigned long copy;

	while (len > 0) {
		while (!iov->iov_len)
			iov++;
		copy = min_t(unsigned long, len, iov->iov_len);

		if (atomic) {
			if (__copy_to_user_inatomic(iov->iov_base, from, copy))
				return -EFAULT;
		} else {
			if (copy_to_user(iov->iov_base, from, copy))
				return -EFAULT;
		}
		from += copy;
		len -= copy;
		iov->iov_base += copy;
		iov->iov_len -= copy;
	}
	return 0;
}

/*
 * Attempt to pre-fault in the user memory, so we can use atomic copies.
 * Returns the number of bytes not faulted in.
 */
static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len)
{
	while (!iov->iov_len)
		iov++;

	while (len > 0) {
		unsigned long this_len;

		this_len = min_t(unsigned long, len, iov->iov_len);
		if (fault_in_pages_writeable(iov->iov_base, this_len))
			break;

		len -= this_len;
		iov++;
	}

	return len;
}

/*
 * Pre-fault in the user memory, so we can use atomic copies.
 */
static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len)
{
	while (!iov->iov_len)
		iov++;

	while (len > 0) {
		unsigned long this_len;

		this_len = min_t(unsigned long, len, iov->iov_len);
		fault_in_pages_readable(iov->iov_base, this_len);
		len -= this_len;
		iov++;
	}
}

static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * If nobody else uses this page, and we don't already have a
	 * temporary page, let's keep track of it as a one-deep
	 * allocation cache. (Otherwise just release our reference to it)
	 */
	if (page_count(page) == 1 && !pipe->tmp_page)
		pipe->tmp_page = page;
	else
		page_cache_release(page);
}

/**
 * generic_pipe_buf_map - virtually map a pipe buffer
 * @pipe: the pipe that the buffer belongs to
 * @buf: the buffer that should be mapped
 * @atomic: whether to use an atomic map
 *
 * Description:
 *	This function returns a kernel virtual address mapping for the
 *	pipe_buffer passed in @buf. If @atomic is set, an atomic map is
 *	provided and the caller has to be careful not to fault before calling
 *	the unmap function.
 *
 *	Note that this function calls kmap_atomic() if @atomic != 0.
 */
void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf, int atomic)
{
	if (atomic) {
		buf->flags |= PIPE_BUF_FLAG_ATOMIC;
		return kmap_atomic(buf->page);
	}

	return kmap(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_map);

/**
 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
 * @pipe: the pipe that the buffer belongs to
 * @buf: the buffer that should be unmapped
 * @map_data: the data that the mapping function returned
 *
 * Description:
 *	This function undoes the mapping that ->map() provided.
 */
void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
			    struct pipe_buffer *buf, void *map_data)
{
	if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
		buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
		kunmap_atomic(map_data);
	} else
		kunmap(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_unmap);

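/*
 * Added commentary (illustrative, not from the original file): a typical
 * consumer pairs ->confirm(), ->map() and ->unmap() like this, where
 * "dest" stands for some hypothetical kernel-side destination buffer:
 *
 *	error = buf->ops->confirm(pipe, buf);
 *	if (!error) {
 *		void *addr = buf->ops->map(pipe, buf, 1);	// atomic map
 *		memcpy(dest, addr + buf->offset, buf->len);
 *		buf->ops->unmap(pipe, buf, addr);
 *	}
 *
 * With @atomic set the caller must not sleep or take a fault between map
 * and unmap; pipe_read() below retries with a non-atomic map when the
 * atomic user copy fails.
 */
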
/**
 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
 * @pipe: the pipe that the buffer belongs to
 * @buf: the buffer to attempt to steal
 *
 * Description:
 *	This function attempts to steal the &struct page attached to
 *	@buf. If successful, this function returns 0 and returns with
 *	the page locked. The caller may then reuse the page for whatever
 *	he wishes; the typical use is insertion into a different file
 *	page cache.
 */
int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * A reference of one is golden, that means that the owner of this
	 * page is the only one holding a reference to it. lock the page
	 * and return OK.
	 */
	if (page_count(page) == 1) {
		lock_page(page);
		return 0;
	}

	return 1;
}
EXPORT_SYMBOL(generic_pipe_buf_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe: the pipe that the buffer belongs to
 * @buf: the buffer to get a reference to
 *
 * Description:
 *	This function grabs an extra reference to @buf. It's used in
 *	the tee() system call, when we duplicate the buffers in one
 *	pipe into another.
 */
void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	page_cache_get(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_confirm - verify contents of the pipe buffer
 * @info: the pipe that the buffer belongs to
 * @buf: the buffer to confirm
 *
 * Description:
 *	This function does nothing, because the generic pipe code uses
 *	pages that are always good when inserted into the pipe.
 */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
			     struct pipe_buffer *buf)
{
	return 0;
}
EXPORT_SYMBOL(generic_pipe_buf_confirm);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe: the pipe that the buffer belongs to
 * @buf: the buffer to put a reference to
 *
 * Description:
 *	This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	page_cache_release(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.can_merge = 1,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = generic_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

static const struct pipe_buf_operations packet_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = anon_pipe_buf_release,
	.steal = generic_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};

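/*
 * Added commentary: the two operation tables above differ only in
 * ->can_merge.  With can_merge == 1, pipe_write() may append a small
 * write to the tail buffer instead of starting a new page:
 *
 *	if (ops->can_merge && offset + chars <= PAGE_SIZE)
 *		// copy into the existing tail buffer
 *
 * Packetized pipes (O_DIRECT) use packet_pipe_buf_ops with can_merge == 0,
 * so every write() starts a fresh buffer that read() later consumes as a
 * single unit.
 */
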
static ssize_t
pipe_read(struct kiocb *iocb, const struct iovec *_iov,
	  unsigned long nr_segs, loff_t pos)
{
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	int do_wakeup;
	ssize_t ret;
	struct iovec *iov = (struct iovec *)_iov;
	size_t total_len;

	total_len = iov_length(iov, nr_segs);
	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;
	for (;;) {
		int bufs = pipe->nrbufs;
		if (bufs) {
			int curbuf = pipe->curbuf;
			struct pipe_buffer *buf = pipe->bufs + curbuf;
			const struct pipe_buf_operations *ops = buf->ops;
			void *addr;
			size_t chars = buf->len;
			int error, atomic;

			if (chars > total_len)
				chars = total_len;

			error = ops->confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			atomic = !iov_fault_in_pages_write(iov, chars);
redo:
			addr = ops->map(pipe, buf, atomic);
			error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
			ops->unmap(pipe, buf, addr);
			if (unlikely(error)) {
				/*
				 * Just retry with the slow path if we failed.
				 */
				if (atomic) {
					atomic = 0;
					goto redo;
				}
				if (!ret)
					ret = error;
				break;
			}
			ret += chars;
			buf->offset += chars;
			buf->len -= chars;

			/* Was it a packet buffer? Clean up and exit */
			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
				total_len = chars;
				buf->len = 0;
			}

			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				curbuf = (curbuf + 1) & (pipe->buffers - 1);
				pipe->curbuf = curbuf;
				pipe->nrbufs = --bufs;
				do_wakeup = 1;
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
		}
		if (bufs)	/* More to do? */
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			/* syscall merging: Usually we must not sleep
			 * if O_NONBLOCK is set, or if we got some data.
			 * But if a writer sleeps in kernel space, then
			 * we can wait for that data without violating POSIX.
			 */
			if (ret)
				break;
			if (filp->f_flags & O_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
		}
		pipe_wait(pipe);
	}
	mutex_unlock(&inode->i_mutex);

	/* Signal writers asynchronously that there is more room. */
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	if (ret > 0)
		file_accessed(filp);
	return ret;
}

static inline int is_packetized(struct file *file)
{
	return (file->f_flags & O_DIRECT) != 0;
}

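/*
 * Added usage note (illustrative, not part of the original source): a
 * packetized pipe is created from userspace with pipe2() and O_DIRECT:
 *
 *	int fds[2];
 *	if (pipe2(fds, O_DIRECT) == 0) {
 *		char buf[16];
 *		write(fds[1], "ab", 2);
 *		write(fds[1], "cd", 2);
 *		read(fds[0], buf, sizeof(buf));	// returns 2 ("ab"), not 4
 *	}
 *
 * Each write() becomes its own PIPE_BUF_FLAG_PACKET buffer, and a read()
 * never returns data from more than one packet; see the PACKET handling
 * in pipe_read() above and the packet_pipe_buf_ops selection in
 * pipe_write() below.
 */
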
static ssize_t
pipe_write(struct kiocb *iocb, const struct iovec *_iov,
	   unsigned long nr_segs, loff_t ppos)
{
	struct file *filp = iocb->ki_filp;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	ssize_t ret;
	int do_wakeup;
	struct iovec *iov = (struct iovec *)_iov;
	size_t total_len;
	ssize_t chars;

	total_len = iov_length(iov, nr_segs);
	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;

	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

	/* We try to merge small writes */
	chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
	if (pipe->nrbufs && chars != 0) {
		int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
							(pipe->buffers - 1);
		struct pipe_buffer *buf = pipe->bufs + lastbuf;
		const struct pipe_buf_operations *ops = buf->ops;
		int offset = buf->offset + buf->len;

		if (ops->can_merge && offset + chars <= PAGE_SIZE) {
			int error, atomic = 1;
			void *addr;

			error = ops->confirm(pipe, buf);
			if (error)
				goto out;

			iov_fault_in_pages_read(iov, chars);
redo1:
			addr = ops->map(pipe, buf, atomic);
			error = pipe_iov_copy_from_user(offset + addr, iov,
							chars, atomic);
			ops->unmap(pipe, buf, addr);
			ret = error;
			do_wakeup = 1;
			if (error) {
				if (atomic) {
					atomic = 0;
					goto redo1;
				}
				goto out;
			}
			buf->len += chars;
			total_len -= chars;
			ret = chars;
			if (!total_len)
				goto out;
		}
	}

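	/*
	 * Added commentary: the merge path above only ever touches the
	 * tail buffer.  For example (illustrative numbers, 4 KiB pages):
	 * after a 100-byte write the tail buffer has offset 0 and len 100;
	 * a following 50-byte write finds offset + len == 100 and
	 * 100 + 50 <= PAGE_SIZE, so it is copied to page offset 100 instead
	 * of consuming a second page.  Writes whose length is an exact
	 * multiple of PAGE_SIZE have chars == 0 and skip the merge path
	 * entirely.
	 */
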
	for (;;) {
		int bufs;

		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}
		bufs = pipe->nrbufs;
		if (bufs < pipe->buffers) {
			int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;
			struct page *page = pipe->tmp_page;
			char *src;
			int error, atomic = 1;

			if (!page) {
				page = alloc_page(GFP_HIGHUSER);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}
			/* Always wake up, even if the copy fails. Otherwise
			 * we lock up (O_NONBLOCK-)readers that sleep due to
			 * syscall merging.
			 * FIXME! Is this really true?
			 */
			do_wakeup = 1;
			chars = PAGE_SIZE;
			if (chars > total_len)
				chars = total_len;

			iov_fault_in_pages_read(iov, chars);
redo2:
			if (atomic)
				src = kmap_atomic(page);
			else
				src = kmap(page);

			error = pipe_iov_copy_from_user(src, iov, chars,
							atomic);
			if (atomic)
				kunmap_atomic(src);
			else
				kunmap(page);

			if (unlikely(error)) {
				if (atomic) {
					atomic = 0;
					goto redo2;
				}
				if (!ret)
					ret = error;
				break;
			}
			ret += chars;

			/* Insert it into the buffer array */
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = chars;
			buf->flags = 0;
			if (is_packetized(filp)) {
				buf->ops = &packet_pipe_buf_ops;
				buf->flags = PIPE_BUF_FLAG_PACKET;
			}
			pipe->nrbufs = ++bufs;
			pipe->tmp_page = NULL;

			total_len -= chars;
			if (!total_len)
				break;
		}
		if (bufs < pipe->buffers)
			continue;
		if (filp->f_flags & O_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}
out:
	mutex_unlock(&inode->i_mutex);
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	}
	if (ret > 0) {
		int err = file_update_time(filp);
		if (err)
			ret = err;
	}
	return ret;
}

static ssize_t
bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
{
	return -EBADF;
}

static ssize_t
bad_pipe_w(struct file *filp, const char __user *buf, size_t count,
	   loff_t *ppos)
{
	return -EBADF;
}

static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe;
	int count, buf, nrbufs;

	switch (cmd) {
	case FIONREAD:
		mutex_lock(&inode->i_mutex);
		pipe = inode->i_pipe;
		count = 0;
		buf = pipe->curbuf;
		nrbufs = pipe->nrbufs;
		while (--nrbufs >= 0) {
			count += pipe->bufs[buf].len;
			buf = (buf+1) & (pipe->buffers - 1);
		}
		mutex_unlock(&inode->i_mutex);

		return put_user(count, (int __user *)arg);
	default:
		return -ENOIOCTLCMD;
	}
}

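/*
 * Added usage note (illustrative, not from the original file): FIONREAD
 * reports how many bytes are currently queued in the pipe, e.g. from
 * userspace:
 *
 *	int avail;
 *	if (ioctl(pipefd[0], FIONREAD, &avail) == 0)
 *		printf("%d bytes buffered\n", avail);
 *
 * The value is the sum of ->len over all occupied pipe_buffers, computed
 * under i_mutex above.
 */
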
/* No kernel lock held - fine */
static unsigned int
pipe_poll(struct file *filp, poll_table *wait)
{
	unsigned int mask;
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe = inode->i_pipe;
	int nrbufs;

	poll_wait(filp, &pipe->wait, wait);

	/* Reading only -- no need for acquiring the semaphore. */
	nrbufs = pipe->nrbufs;
	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= POLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
		/*
		 * Most Unices do not set POLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= POLLERR;
	}

	return mask;
}

static int
pipe_release(struct inode *inode, int decr, int decw)
{
	struct pipe_inode_info *pipe;

	mutex_lock(&inode->i_mutex);
	pipe = inode->i_pipe;
	pipe->readers -= decr;
	pipe->writers -= decw;

	if (!pipe->readers && !pipe->writers) {
		free_pipe_info(inode);
	} else {
		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	mutex_unlock(&inode->i_mutex);

	return 0;
}

static int
pipe_read_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	int retval;

	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers);
	mutex_unlock(&inode->i_mutex);

	return retval;
}


static int
pipe_write_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	int retval;

	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers);
	mutex_unlock(&inode->i_mutex);

	return retval;
}


static int
pipe_rdwr_fasync(int fd, struct file *filp, int on)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	struct pipe_inode_info *pipe = inode->i_pipe;
	int retval;

	mutex_lock(&inode->i_mutex);
	retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
	if (retval >= 0) {
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
		if (retval < 0) /* this can happen only if on == T */
			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
	}
	mutex_unlock(&inode->i_mutex);
	return retval;
}


static int
pipe_read_release(struct inode *inode, struct file *filp)
{
	return pipe_release(inode, 1, 0);
}

static int
pipe_write_release(struct inode *inode, struct file *filp)
{
	return pipe_release(inode, 0, 1);
}

static int
pipe_rdwr_release(struct inode *inode, struct file *filp)
{
	int decr, decw;

	decr = (filp->f_mode & FMODE_READ) != 0;
	decw = (filp->f_mode & FMODE_WRITE) != 0;
	return pipe_release(inode, decr, decw);
}

static int
pipe_read_open(struct inode *inode, struct file *filp)
{
	int ret = -ENOENT;

	mutex_lock(&inode->i_mutex);

	if (inode->i_pipe) {
		ret = 0;
		inode->i_pipe->readers++;
	}

	mutex_unlock(&inode->i_mutex);

	return ret;
}

static int
pipe_write_open(struct inode *inode, struct file *filp)
{
	int ret = -ENOENT;

	mutex_lock(&inode->i_mutex);

	if (inode->i_pipe) {
		ret = 0;
		inode->i_pipe->writers++;
	}

	mutex_unlock(&inode->i_mutex);

	return ret;
}

static int
pipe_rdwr_open(struct inode *inode, struct file *filp)
{
	int ret = -ENOENT;

	mutex_lock(&inode->i_mutex);

	if (inode->i_pipe) {
		ret = 0;
		if (filp->f_mode & FMODE_READ)
			inode->i_pipe->readers++;
		if (filp->f_mode & FMODE_WRITE)
			inode->i_pipe->writers++;
	}

	mutex_unlock(&inode->i_mutex);

	return ret;
}

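/*
 * Added commentary: the three open routines above keep the reader and
 * writer counts in step with how the FIFO was opened, for example
 * (illustrative only):
 *
 *	open("fifo", O_RDONLY);	// pipe_read_open:  readers++
 *	open("fifo", O_WRONLY);	// pipe_write_open: writers++
 *	open("fifo", O_RDWR);	// pipe_rdwr_open:  readers++ and writers++
 *
 * The matching pipe_*_release() routines decrement the same counts, and
 * the pipe_inode_info is freed once both reach zero.
 */
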
/*
 * The file_operations structs are not static because they
 * are also used in linux/fs/fifo.c to do operations on FIFOs.
 *
 * Pipes reuse fifos' file_operations structs.
 */
const struct file_operations read_pipefifo_fops = {
	.llseek = no_llseek,
	.read = do_sync_read,
	.aio_read = pipe_read,
	.write = bad_pipe_w,
	.poll = pipe_poll,
	.unlocked_ioctl = pipe_ioctl,
	.open = pipe_read_open,
	.release = pipe_read_release,
	.fasync = pipe_read_fasync,
};

const struct file_operations write_pipefifo_fops = {
	.llseek = no_llseek,
	.read = bad_pipe_r,
	.write = do_sync_write,
	.aio_write = pipe_write,
	.poll = pipe_poll,
	.unlocked_ioctl = pipe_ioctl,
	.open = pipe_write_open,
	.release = pipe_write_release,
	.fasync = pipe_write_fasync,
};

const struct file_operations rdwr_pipefifo_fops = {
	.llseek = no_llseek,
	.read = do_sync_read,
	.aio_read = pipe_read,
	.write = do_sync_write,
	.aio_write = pipe_write,
	.poll = pipe_poll,
	.unlocked_ioctl = pipe_ioctl,
	.open = pipe_rdwr_open,
	.release = pipe_rdwr_release,
	.fasync = pipe_rdwr_fasync,
};

struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
{
	struct pipe_inode_info *pipe;

	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
	if (pipe) {
		pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
		if (pipe->bufs) {
			init_waitqueue_head(&pipe->wait);
			pipe->r_counter = pipe->w_counter = 1;
			pipe->inode = inode;
			pipe->buffers = PIPE_DEF_BUFFERS;
			return pipe;
		}
		kfree(pipe);
	}

	return NULL;
}

void __free_pipe_info(struct pipe_inode_info *pipe)
{
	int i;

	for (i = 0; i < pipe->buffers; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;
		if (buf->ops)
			buf->ops->release(pipe, buf);
	}
	if (pipe->tmp_page)
		__free_page(pipe->tmp_page);
	kfree(pipe->bufs);
	kfree(pipe);
}

void free_pipe_info(struct inode *inode)
{
	__free_pipe_info(inode->i_pipe);
	inode->i_pipe = NULL;
}

static struct vfsmount *pipe_mnt __read_mostly;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
				dentry->d_inode->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname = pipefs_dname,
};

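/*
 * Added usage note: pipefs_dname() is what makes anonymous pipes show up
 * with names like "pipe:[18043]" (the number is the pipe inode's i_ino;
 * the value here is only illustrative), for instance as the symlink
 * targets under /proc/<pid>/fd or in "ls -l /proc/self/fd" output.
 */
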
static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	inode->i_ino = get_next_ino();

	pipe = alloc_pipe_info(inode);
	if (!pipe)
		goto fail_iput;
	inode->i_pipe = pipe;

	pipe->readers = pipe->writers = 1;
	inode->i_fop = &rdwr_pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}

int create_pipe_files(struct file **res, int flags)
{
	int err;
	struct inode *inode = get_pipe_inode();
	struct file *f;
	struct path path;
	static struct qstr name = { .name = "" };

	if (!inode)
		return -ENFILE;

	err = -ENOMEM;
	path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &name);
	if (!path.dentry)
		goto err_inode;
	path.mnt = mntget(pipe_mnt);

	d_instantiate(path.dentry, inode);

	err = -ENFILE;
	f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops);
	if (!f)
		goto err_dentry;

	f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));

	res[0] = alloc_file(&path, FMODE_READ, &read_pipefifo_fops);
	if (!res[0])
		goto err_file;

	path_get(&path);
	res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK);
	res[1] = f;
	return 0;

err_file:
	put_filp(f);
err_dentry:
	free_pipe_info(inode);
	path_put(&path);
	return err;

err_inode:
	free_pipe_info(inode);
	iput(inode);
	return err;
}

static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
	int error;
	int fdw, fdr;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
		return -EINVAL;

	error = create_pipe_files(files, flags);
	if (error)
		return error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	fd[0] = fdr;
	fd[1] = fdw;
	return 0;

err_fdr:
	put_unused_fd(fdr);
err_read_pipe:
	fput(files[0]);
	fput(files[1]);
	return error;
}

int do_pipe_flags(int *fd, int flags)
{
	struct file *files[2];
	int error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		fd_install(fd[0], files[0]);
		fd_install(fd[1], files[1]);
	}
	return error;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	struct file *files[2];
	int fd[2];
	int error;

	error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
			fput(files[0]);
			fput(files[1]);
			put_unused_fd(fd[0]);
			put_unused_fd(fd[1]);
			error = -EFAULT;
		} else {
			fd_install(fd[0], files[0]);
			fd_install(fd[1], files[1]);
		}
	}
	return error;
}

SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return sys_pipe2(fildes, 0);
}

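/*
 * Added usage note (illustrative, not from the original file): the flags
 * accepted by __do_pipe_flags() map directly onto the userspace API, e.g.
 *
 *	int fds[2];
 *	if (pipe2(fds, O_CLOEXEC | O_NONBLOCK) == -1)
 *		perror("pipe2");
 *	// fds[0] is the read end, fds[1] the write end; both are
 *	// close-on-exec and non-blocking.
 *
 * Plain pipe() is simply pipe2() with flags == 0, as the wrapper above
 * shows.
 */
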
/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or -ERROR on error.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
{
	struct pipe_buffer *bufs;

	/*
	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
	 * expect a lot of shrink+grow operations, just free and allocate
	 * again like we would do for growing. If the pipe currently
	 * contains more buffers than arg, then return busy.
	 */
	if (nr_pages < pipe->nrbufs)
		return -EBUSY;

	bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL | __GFP_NOWARN);
	if (unlikely(!bufs))
		return -ENOMEM;

	/*
	 * The pipe array wraps around, so just start the new one at zero
	 * and adjust the indexes.
	 */
	if (pipe->nrbufs) {
		unsigned int tail;
		unsigned int head;

		tail = pipe->curbuf + pipe->nrbufs;
		if (tail < pipe->buffers)
			tail = 0;
		else
			tail &= (pipe->buffers - 1);

		head = pipe->nrbufs - tail;
		if (head)
			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
		if (tail)
			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
	}

	pipe->curbuf = 0;
	kfree(pipe->bufs);
	pipe->bufs = bufs;
	pipe->buffers = nr_pages;
	return nr_pages * PAGE_SIZE;
}

/*
 * Currently we rely on the pipe array holding a power-of-2 number
 * of pages.
 */
static inline unsigned int round_pipe_size(unsigned int size)
{
	unsigned long nr_pages;

	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
}

/*
 * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
 * will return an error.
 */
int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
		 size_t *lenp, loff_t *ppos)
{
	int ret;

	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
	if (ret < 0 || !write)
		return ret;

	pipe_max_size = round_pipe_size(pipe_max_size);
	return ret;
}

/*
 * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
 * location, so checking ->i_pipe is not enough to verify that this is a
 * pipe.
 */
struct pipe_inode_info *get_pipe_info(struct file *file)
{
	struct inode *i = file->f_path.dentry->d_inode;

	return S_ISFIFO(i->i_mode) ? i->i_pipe : NULL;
}

long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe;
	long ret;

	pipe = get_pipe_info(file);
	if (!pipe)
		return -EBADF;

	mutex_lock(&pipe->inode->i_mutex);

	switch (cmd) {
	case F_SETPIPE_SZ: {
		unsigned int size, nr_pages;

		size = round_pipe_size(arg);
		nr_pages = size >> PAGE_SHIFT;

		ret = -EINVAL;
		if (!nr_pages)
			goto out;

		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
			ret = -EPERM;
			goto out;
		}
		ret = pipe_set_size(pipe, nr_pages);
		break;
	}
	case F_GETPIPE_SZ:
		ret = pipe->buffers * PAGE_SIZE;
		break;
	default:
		ret = -EINVAL;
		break;
	}

out:
	mutex_unlock(&pipe->inode->i_mutex);
	return ret;
}

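/*
 * Added usage note (illustrative): pipe capacity is queried and resized
 * from userspace with fcntl(), and the request is rounded up to a
 * power-of-two number of pages by round_pipe_size() above, e.g. with
 * 4 KiB pages:
 *
 *	fcntl(fd, F_SETPIPE_SZ, 100 * 1024);	// rounds up to 128 KiB
 *	long sz = fcntl(fd, F_GETPIPE_SZ);	// reports the rounded size
 *
 * Unprivileged callers are capped at /proc/sys/fs/pipe-max-size
 * (pipe_max_size, 1 MiB by default); CAP_SYS_RESOURCE may exceed it.
 */
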
static const struct super_operations pipefs_ops = {
	.destroy_inode = free_inode_nonrcu,
	.statfs = simple_statfs,
};

/*
 * pipefs should _never_ be mounted by userland - too much of a security
 * hassle, no real gain from having the whole whorehouse mounted. So we
 * don't need any operations on the root directory. However, we need a
 * non-trivial d_name - pipe: will go nicely and kill the special-casing
 * in procfs.
 */
static struct dentry *pipefs_mount(struct file_system_type *fs_type,
			 int flags, const char *dev_name, void *data)
{
	return mount_pseudo(fs_type, "pipe:", &pipefs_ops,
			&pipefs_dentry_operations, PIPEFS_MAGIC);
}

static struct file_system_type pipe_fs_type = {
	.name = "pipefs",
	.mount = pipefs_mount,
	.kill_sb = kill_anon_super,
};

static int __init init_pipe_fs(void)
{
	int err = register_filesystem(&pipe_fs_type);

	if (!err) {
		pipe_mnt = kern_mount(&pipe_fs_type);
		if (IS_ERR(pipe_mnt)) {
			err = PTR_ERR(pipe_mnt);
			unregister_filesystem(&pipe_fs_type);
		}
	}
	return err;
}

fs_initcall(init_pipe_fs);