/*
 * fs/eventpoll.c ( Efficient event polling implementation )
 * Copyright (C) 2001,...,2003 Davide Libenzi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * Davide Libenzi <davidel@xmailserver.org>
 *
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/smp_lock.h>
#include <linux/string.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/spinlock.h>
#include <linux/syscalls.h>
#include <linux/rwsem.h>
#include <linux/rbtree.h>
#include <linux/wait.h>
#include <linux/eventpoll.h>
#include <linux/mount.h>
#include <linux/bitops.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/mman.h>
#include <asm/atomic.h>
#include <asm/semaphore.h>


/*
 * LOCKING:
 * There are three levels of locking required by epoll :
 *
 * 1) epsem (semaphore)
 * 2) ep->sem (rw_semaphore)
 * 3) ep->lock (rw_lock)
 *
 * The acquire order is the one listed above, from 1 to 3.
 * We need a spinlock (ep->lock) because we manipulate objects
 * from inside the poll callback, which might be triggered from
 * a wake_up() that in turn might be called from IRQ context.
 * So we can't sleep inside the poll callback and hence we need
 * a spinlock. During the event transfer loop (from kernel to
 * user space) we could end up sleeping due to a copy_to_user(), so
 * we need a lock that will allow us to sleep. This lock is a
 * read-write semaphore (ep->sem). It is acquired on read during
 * the event transfer loop and on write during epoll_ctl(EPOLL_CTL_DEL)
 * and during eventpoll_release_file(). Then we also need a global
 * semaphore to serialize eventpoll_release_file() and ep_free().
 * This semaphore is acquired by ep_free() during the epoll file
 * cleanup path and it is also acquired by eventpoll_release_file()
 * if a file has been pushed inside an epoll set and it is then
 * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL).
 * It is possible to drop the "ep->sem" and to use the global
 * semaphore "epsem" (together with "ep->lock") to have it working,
 * but having "ep->sem" will make the interface more scalable.
 * Events that require holding "epsem" are very rare, while for
 * normal operations the epoll private "ep->sem" will guarantee
 * a greater scalability.
 */
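
/*
 * Illustrative sketch (not part of the build): the lock acquisition
 * order described above, written out as code. The function name and
 * the fact that all three locks are taken together are hypothetical;
 * real call sites acquire only the subset of locks they need, but
 * always in this order.
 */
#if 0
static void ep_locking_order_sketch(struct eventpoll *ep)
{
	unsigned long flags;

	down(&epsem);				/* 1) global semaphore        */
	down_write(&ep->sem);			/* 2) per-epoll rw_semaphore  */
	write_lock_irqsave(&ep->lock, flags);	/* 3) per-epoll rw_lock       */

	/* ... manipulate "ep" internals here ... */

	write_unlock_irqrestore(&ep->lock, flags);
	up_write(&ep->sem);
	up(&epsem);
}
#endif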


#define EVENTPOLLFS_MAGIC 0x03111965 /* My birthday should work for this :) */

#define DEBUG_EPOLL 0

#if DEBUG_EPOLL > 0
#define DPRINTK(x) printk x
#define DNPRINTK(n, x) do { if ((n) <= DEBUG_EPOLL) printk x; } while (0)
#else /* #if DEBUG_EPOLL > 0 */
#define DPRINTK(x) (void) 0
#define DNPRINTK(n, x) (void) 0
#endif /* #if DEBUG_EPOLL > 0 */

#define DEBUG_EPI 0

#if DEBUG_EPI != 0
#define EPI_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE /* | SLAB_POISON */)
#else /* #if DEBUG_EPI != 0 */
#define EPI_SLAB_DEBUG 0
#endif /* #if DEBUG_EPI != 0 */

/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)

/* Maximum number of poll wake up nests we are allowing */
#define EP_MAX_POLLWAKE_NESTS 4

/* Macro to allocate a "struct epitem" from the slab cache */
#define EPI_MEM_ALLOC() (struct epitem *) kmem_cache_alloc(epi_cache, SLAB_KERNEL)

/* Macro to free a "struct epitem" to the slab cache */
#define EPI_MEM_FREE(p) kmem_cache_free(epi_cache, p)

/* Macro to allocate a "struct eppoll_entry" from the slab cache */
#define PWQ_MEM_ALLOC() (struct eppoll_entry *) kmem_cache_alloc(pwq_cache, SLAB_KERNEL)

/* Macro to free a "struct eppoll_entry" to the slab cache */
#define PWQ_MEM_FREE(p) kmem_cache_free(pwq_cache, p)

/* Fast test to see if the file is an eventpoll file */
#define IS_FILE_EPOLL(f) ((f)->f_op == &eventpoll_fops)

/* Set up the structure that is used as key for the rb-tree */
#define EP_SET_FFD(p, f, d) do { (p)->file = (f); (p)->fd = (d); } while (0)

/* Compare rb-tree keys */
#define EP_CMP_FFD(p1, p2) ((p1)->file > (p2)->file ? +1: \
			    ((p1)->file < (p2)->file ? -1: (p1)->fd - (p2)->fd))

/* Special initialization for the rb-tree node to detect linkage */
#define EP_RB_INITNODE(n) (n)->rb_parent = (n)

/* Removes a node from the rb-tree and marks it for a fast is-linked check */
#define EP_RB_ERASE(n, r) do { rb_erase(n, r); (n)->rb_parent = (n); } while (0)

/* Fast check to verify that the item is linked to the main rb-tree */
#define EP_RB_LINKED(n) ((n)->rb_parent != (n))

/*
 * Remove the item from the list and perform its initialization.
 * This is useful for us because we can test if the item is linked
 * using "EP_IS_LINKED(p)".
 */
#define EP_LIST_DEL(p) do { list_del(p); INIT_LIST_HEAD(p); } while (0)

/* Tells us if the item is currently linked */
#define EP_IS_LINKED(p) (!list_empty(p))

/* Get the "struct epitem" from a wait queue pointer */
#define EP_ITEM_FROM_WAIT(p) ((struct epitem *) container_of(p, struct eppoll_entry, wait)->base)

/* Get the "struct epitem" from an epoll queue wrapper */
#define EP_ITEM_FROM_EPQUEUE(p) (container_of(p, struct ep_pqueue, pt)->epi)

/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
#define EP_OP_HASH_EVENT(op) ((op) != EPOLL_CTL_DEL)


struct epoll_filefd {
	struct file *file;
	int fd;
};
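
/*
 * Illustrative sketch (not compiled): an open-coded equivalent of the
 * EP_CMP_FFD() key comparison above. Items are ordered first by the
 * "struct file" pointer value and, for the same file, by descriptor
 * number, so the same file added twice with different fds gets two
 * distinct nodes in the rb-tree. The function name is hypothetical.
 */
#if 0
static int ep_cmp_ffd_sketch(struct epoll_filefd *p1, struct epoll_filefd *p2)
{
	if (p1->file > p2->file)
		return +1;
	if (p1->file < p2->file)
		return -1;
	return p1->fd - p2->fd;	/* same file: order by descriptor */
}
#endif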

/*
 * Node that is linked into the "wake_task_list" member of the "struct poll_safewake".
 * It is used to keep track of all tasks that are currently inside the wake_up() code
 * to 1) short-circuit the one coming from the same task and same wait queue head
 * ( loop ) 2) allow a maximum number of epoll descriptors inclusion nesting
 * 3) let go the ones coming from other tasks.
 */
struct wake_task_node {
	struct list_head llink;
	task_t *task;
	wait_queue_head_t *wq;
};

/*
 * This is used to implement the safe poll wake up, avoiding reentering
 * the poll callback from inside wake_up().
 */
struct poll_safewake {
	struct list_head wake_task_list;
	spinlock_t lock;
};

/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
	/* Protect access to this structure */
	rwlock_t lock;

	/*
	 * This semaphore is used to ensure that files are not removed
	 * while epoll is using them. This is read-held during the event
	 * collection loop and it is write-held during the file cleanup
	 * path, the epoll file exit code and the ctl operations.
	 */
	struct rw_semaphore sem;

	/* Wait queue used by sys_epoll_wait() */
	wait_queue_head_t wq;

	/* Wait queue used by file->poll() */
	wait_queue_head_t poll_wait;

	/* List of ready file descriptors */
	struct list_head rdllist;

	/* RB-Tree root used to store monitored fd structs */
	struct rb_root rbr;
};

/* Wait structure used by the poll hooks */
struct eppoll_entry {
	/* List header used to link this structure to the "struct epitem" */
	struct list_head llink;

	/* The "base" pointer is set to the container "struct epitem" */
	void *base;

	/*
	 * Wait queue item that will be linked to the target file wait
	 * queue head.
	 */
	wait_queue_t wait;

	/* The wait queue head that linked the "wait" wait queue item */
	wait_queue_head_t *whead;
};

/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the rb-tree.
 */
struct epitem {
	/* RB-Tree node used to link this structure to the eventpoll rb-tree */
	struct rb_node rbn;

	/* List header used to link this structure to the eventpoll ready list */
	struct list_head rdllink;

	/* The file descriptor information this item refers to */
	struct epoll_filefd ffd;

	/* Number of active wait queues attached to poll operations */
	int nwait;

	/* List containing poll wait queues */
	struct list_head pwqlist;

	/* The "container" of this item */
	struct eventpoll *ep;

	/* The structure that describes the interested events and the source fd */
	struct epoll_event event;

	/*
	 * Used to keep track of the usage count of the structure. This prevents
	 * the structure from disappearing from underneath our processing.
	 */
	atomic_t usecnt;

	/* List header used to link this item to the "struct file" items list */
	struct list_head fllink;

	/* List header used to link the item to the transfer list */
	struct list_head txlink;

	/*
	 * This is used during the collection/transfer of events to userspace
	 * to store the events returned for this item.
	 */
	unsigned int revents;
};

/* Wrapper struct used by poll queueing */
struct ep_pqueue {
	poll_table pt;
	struct epitem *epi;
};



static void ep_poll_safewake_init(struct poll_safewake *psw);
static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq);
static int ep_getfd(int *efd, struct inode **einode, struct file **efile);
static int ep_file_init(struct file *file);
static void ep_free(struct eventpoll *ep);
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd);
static void ep_use_epitem(struct epitem *epi);
static void ep_release_epitem(struct epitem *epi);
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt);
static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi);
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
		     struct file *tfile, int fd);
static int ep_modify(struct eventpoll *ep, struct epitem *epi,
		     struct epoll_event *event);
static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi);
static int ep_unlink(struct eventpoll *ep, struct epitem *epi);
static int ep_remove(struct eventpoll *ep, struct epitem *epi);
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key);
static int ep_eventpoll_close(struct inode *inode, struct file *file);
static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait);
static int ep_collect_ready_items(struct eventpoll *ep,
				  struct list_head *txlist, int maxevents);
static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
			  struct epoll_event __user *events);
static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist);
static int ep_events_transfer(struct eventpoll *ep,
			      struct epoll_event __user *events,
			      int maxevents);
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout);
static int eventpollfs_delete_dentry(struct dentry *dentry);
static struct inode *ep_eventpoll_inode(void);
static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type,
					      int flags, const char *dev_name,
					      void *data);

/*
 * This semaphore is used to serialize ep_free() and eventpoll_release_file().
 */
struct semaphore epsem;

/* Safe wake up implementation */
static struct poll_safewake psw;

/* Slab cache used to allocate "struct epitem" */
static kmem_cache_t *epi_cache;

/* Slab cache used to allocate "struct eppoll_entry" */
static kmem_cache_t *pwq_cache;

/* Virtual fs used to allocate inodes for eventpoll files */
static struct vfsmount *eventpoll_mnt;

/* File callbacks that implement the eventpoll file behaviour */
static struct file_operations eventpoll_fops = {
	.release	= ep_eventpoll_close,
	.poll		= ep_eventpoll_poll
};

/*
 * This is used to register the virtual file system from where
 * eventpoll inodes are allocated.
 */
static struct file_system_type eventpoll_fs_type = {
	.name		= "eventpollfs",
	.get_sb		= eventpollfs_get_sb,
	.kill_sb	= kill_anon_super,
};

/* Very basic directory entry operations for the eventpoll virtual file system */
static struct dentry_operations eventpollfs_dentry_operations = {
	.d_delete	= eventpollfs_delete_dentry,
};



/* Initialize the poll safe wake up structure */
static void ep_poll_safewake_init(struct poll_safewake *psw)
{

	INIT_LIST_HEAD(&psw->wake_task_list);
	spin_lock_init(&psw->lock);
}


/*
 * Perform a safe wake up of the poll wait list. The problem is that
 * with the new callback'd wake up system, it is possible that the
 * poll callback is reentered from inside the call to wake_up() done
 * on the poll wait queue head. The rule is that we cannot reenter the
 * wake up code from the same task more than EP_MAX_POLLWAKE_NESTS times,
 * and we cannot reenter the same wait queue head at all. This allows a
 * hierarchy of epoll file descriptors no more than EP_MAX_POLLWAKE_NESTS
 * deep. We need the irq version of the spin lock because this one gets
 * called by the poll callback, which in turn is called from inside a
 * wake_up(), which might be called from irq context.
 */
static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq)
{
	int wake_nests = 0;
	unsigned long flags;
	task_t *this_task = current;
	struct list_head *lsthead = &psw->wake_task_list, *lnk;
	struct wake_task_node *tncur;
	struct wake_task_node tnode;

	spin_lock_irqsave(&psw->lock, flags);

	/* Try to see if the current task is already inside this wakeup call */
	list_for_each(lnk, lsthead) {
		tncur = list_entry(lnk, struct wake_task_node, llink);

		if (tncur->wq == wq ||
		    (tncur->task == this_task && ++wake_nests > EP_MAX_POLLWAKE_NESTS)) {
			/*
			 * Oops ... loop detected or maximum nest level reached.
			 * We abort this wake by breaking the cycle itself.
			 */
			spin_unlock_irqrestore(&psw->lock, flags);
			return;
		}
	}

	/* Add the current task to the list */
	tnode.task = this_task;
	tnode.wq = wq;
	list_add(&tnode.llink, lsthead);

	spin_unlock_irqrestore(&psw->lock, flags);

	/* Do really wake up now */
	wake_up(wq);

	/* Remove the current task from the list */
	spin_lock_irqsave(&psw->lock, flags);
	list_del(&tnode.llink);
	spin_unlock_irqrestore(&psw->lock, flags);
}

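
/*
 * Illustrative user space sketch (not part of the kernel build): the
 * nesting that ep_poll_safewake() has to cope with. One epoll descriptor
 * is registered inside another, so a wakeup on "inner" has to propagate
 * to waiters on "outer" through the poll callbacks; the
 * EP_MAX_POLLWAKE_NESTS check bounds how deep such chains may recurse.
 * The descriptor names are hypothetical, error handling is omitted.
 */
#if 0
#include <sys/epoll.h>

static int nested_epoll_sketch(int some_fd)
{
	struct epoll_event ev;
	int inner, outer;

	inner = epoll_create(1);
	outer = epoll_create(1);

	ev.events = EPOLLIN;
	ev.data.fd = some_fd;
	epoll_ctl(inner, EPOLL_CTL_ADD, some_fd, &ev);	/* fd inside inner */

	ev.events = EPOLLIN;
	ev.data.fd = inner;
	epoll_ctl(outer, EPOLL_CTL_ADD, inner, &ev);	/* inner inside outer */

	/* Waiting on "outer" now indirectly waits on "some_fd" as well. */
	return epoll_wait(outer, &ev, 1, -1);
}
#endif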

/* Used to initialize the epoll bits inside the "struct file" */
void eventpoll_init_file(struct file *file)
{

	INIT_LIST_HEAD(&file->f_ep_links);
	spin_lock_init(&file->f_ep_lock);
}


/*
 * This is called from eventpoll_release() to unlink files from the eventpoll
 * interface. We need this facility to correctly clean up files that are
 * closed without being removed from the eventpoll interface.
 */
void eventpoll_release_file(struct file *file)
{
	struct list_head *lsthead = &file->f_ep_links;
	struct eventpoll *ep;
	struct epitem *epi;

	/*
	 * We don't want to get "file->f_ep_lock" because it is not
	 * necessary. It is not necessary because we're in the "struct file"
	 * cleanup path, and this means that no one is using this file anymore.
	 * The only hit might come from ep_free(), but holding the semaphore
	 * will correctly serialize the operation. We do need to acquire
	 * "ep->sem" after "epsem" because ep_remove() requires it when called
	 * from anywhere but ep_free().
	 */
	down(&epsem);

	while (!list_empty(lsthead)) {
		epi = list_entry(lsthead->next, struct epitem, fllink);

		ep = epi->ep;
		EP_LIST_DEL(&epi->fllink);
		down_write(&ep->sem);
		ep_remove(ep, epi);
		up_write(&ep->sem);
	}

	up(&epsem);
}


/*
 * It opens an eventpoll file descriptor by suggesting a storage of "size"
 * file descriptors. The size parameter is just a hint about how to size
 * data structures. It won't prevent the user from storing more than "size"
 * file descriptors inside the epoll interface. It is the kernel part of
 * the userspace epoll_create(2).
 */
asmlinkage long sys_epoll_create(int size)
{
	int error, fd;
	struct inode *inode;
	struct file *file;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
		     current, size));

	/* Sanity check on the size parameter */
	error = -EINVAL;
	if (size <= 0)
		goto eexit_1;

	/*
	 * Creates all the items needed to set up an eventpoll file. That is,
	 * a file structure, an inode and a free file descriptor.
	 */
	error = ep_getfd(&fd, &inode, &file);
	if (error)
		goto eexit_1;

	/* Setup the file internal data structure ( "struct eventpoll" ) */
	error = ep_file_init(file);
	if (error)
		goto eexit_2;


	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
		     current, size, fd));

	return fd;

eexit_2:
	sys_close(fd);
eexit_1:
	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
		     current, size, error));
	return error;
}

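
/*
 * Illustrative user space sketch (not part of the kernel build): the
 * matching epoll_create(2) call. As noted above, "size" is only a sizing
 * hint; it does not limit how many descriptors can later be added. The
 * function name and the value 10 are just example choices.
 */
#if 0
#include <sys/epoll.h>
#include <unistd.h>

static int create_epoll_sketch(void)
{
	int epfd = epoll_create(10);	/* hint: roughly 10 descriptors */

	if (epfd < 0)
		return -1;		/* errno holds EINVAL, ENFILE, ... */
	return epfd;			/* close(epfd) releases it */
}
#endif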

/*
 * The following function implements the controller interface for
 * the eventpoll file that enables the insertion/removal/change of
 * file descriptors inside the interest set. It represents
 * the kernel part of the user space epoll_ctl(2).
 */
asmlinkage long
sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
{
	int error;
	struct file *file, *tfile;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event epds;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
		     current, epfd, op, fd, event));

	error = -EFAULT;
	if (EP_OP_HASH_EVENT(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		goto eexit_1;

	/* Get the "struct file *" for the eventpoll file */
	error = -EBADF;
	file = fget(epfd);
	if (!file)
		goto eexit_1;

	/* Get the "struct file *" for the target file */
	tfile = fget(fd);
	if (!tfile)
		goto eexit_2;

	/* The target file descriptor must support poll */
	error = -EPERM;
	if (!tfile->f_op || !tfile->f_op->poll)
		goto eexit_3;

	/*
	 * We have to check that the file structure underneath the file descriptor
	 * the user passed to us _is_ an eventpoll file. And also we do not permit
	 * adding an epoll file descriptor inside itself.
	 */
	error = -EINVAL;
	if (file == tfile || !IS_FILE_EPOLL(file))
		goto eexit_3;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = file->private_data;

	down_write(&ep->sem);

	/* Try to look up the file inside our rb-tree */
	epi = ep_find(ep, tfile, fd);

	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			epds.events |= POLLERR | POLLHUP;

			error = ep_insert(ep, &epds, tfile, fd);
		} else
			error = -EEXIST;
		break;
	case EPOLL_CTL_DEL:
		if (epi)
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
	case EPOLL_CTL_MOD:
		if (epi) {
			epds.events |= POLLERR | POLLHUP;
			error = ep_modify(ep, epi, &epds);
		} else
			error = -ENOENT;
		break;
	}

	/*
	 * The function ep_find() increments the usage count of the structure
	 * so, if this is not NULL, we need to release it.
	 */
	if (epi)
		ep_release_epitem(epi);

	up_write(&ep->sem);

eexit_3:
	fput(tfile);
eexit_2:
	fput(file);
eexit_1:
	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
		     current, epfd, op, fd, event, error));

	return error;
}

#define MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))

/*
 * Implement the event wait interface for the eventpoll file. It is the kernel
 * part of the user space epoll_wait(2).
 */
asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
			       int maxevents, int timeout)
{
	int error;
	struct file *file;
	struct eventpoll *ep;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
		     current, epfd, events, maxevents, timeout));

	/* The maximum number of events must be greater than zero */
	if (maxevents <= 0 || maxevents > MAX_EVENTS)
		return -EINVAL;

	/* Verify that the area passed by the user is writeable */
	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
		error = -EFAULT;
		goto eexit_1;
	}

	/* Get the "struct file *" for the eventpoll file */
	error = -EBADF;
	file = fget(epfd);
	if (!file)
		goto eexit_1;

	/*
	 * We have to check that the file structure underneath the fd
	 * the user passed to us _is_ an eventpoll file.
	 */
	error = -EINVAL;
	if (!IS_FILE_EPOLL(file))
		goto eexit_2;

	/*
	 * At this point it is safe to assume that the "private_data" contains
	 * our own data structure.
	 */
	ep = file->private_data;

	/* Time to fish for events ... */
	error = ep_poll(ep, events, maxevents, timeout);

eexit_2:
	fput(file);
eexit_1:
	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
		     current, epfd, events, maxevents, timeout, error));

	return error;
}

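
/*
 * Illustrative user space sketch (not part of the kernel build): a typical
 * epoll_ctl(2)/epoll_wait(2) cycle driving the two syscalls above. The
 * descriptor "sock", the array size of 32 and the 10 second timeout are
 * just example values.
 */
#if 0
#include <sys/epoll.h>

static void event_loop_sketch(int epfd, int sock)
{
	struct epoll_event ev, events[32];
	int i, nfds;

	ev.events = EPOLLIN;
	ev.data.fd = sock;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, sock, &ev) < 0)
		return;

	for (;;) {
		nfds = epoll_wait(epfd, events, 32, 10000);	/* 10s timeout */
		if (nfds <= 0)
			break;		/* timeout, or -1 with errno set */
		for (i = 0; i < nfds; i++) {
			/* events[i].data.fd is ready for reading */
		}
	}
}
#endif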

/*
 * Creates the file descriptor to be used by the epoll interface.
 */
static int ep_getfd(int *efd, struct inode **einode, struct file **efile)
{
	struct qstr this;
	char name[32];
	struct dentry *dentry;
	struct inode *inode;
	struct file *file;
	int error, fd;

	/* Get a ready to use file */
	error = -ENFILE;
	file = get_empty_filp();
	if (!file)
		goto eexit_1;

	/* Allocates an inode from the eventpoll file system */
	inode = ep_eventpoll_inode();
	error = PTR_ERR(inode);
	if (IS_ERR(inode))
		goto eexit_2;

	/* Allocates a free descriptor to plug the file onto */
	error = get_unused_fd();
	if (error < 0)
		goto eexit_3;
	fd = error;

	/*
	 * Link the inode to a directory entry by creating a unique name
	 * using the inode number.
	 */
	error = -ENOMEM;
	sprintf(name, "[%lu]", inode->i_ino);
	this.name = name;
	this.len = strlen(name);
	this.hash = inode->i_ino;
	dentry = d_alloc(eventpoll_mnt->mnt_sb->s_root, &this);
	if (!dentry)
		goto eexit_4;
	dentry->d_op = &eventpollfs_dentry_operations;
	d_add(dentry, inode);
	file->f_vfsmnt = mntget(eventpoll_mnt);
	file->f_dentry = dentry;
	file->f_mapping = inode->i_mapping;

	file->f_pos = 0;
	file->f_flags = O_RDONLY;
	file->f_op = &eventpoll_fops;
	file->f_mode = FMODE_READ;
	file->f_version = 0;
	file->private_data = NULL;

	/* Install the newly set up file into the allocated fd. */
	fd_install(fd, file);

	*efd = fd;
	*einode = inode;
	*efile = file;
	return 0;

eexit_4:
	put_unused_fd(fd);
eexit_3:
	iput(inode);
eexit_2:
	put_filp(file);
eexit_1:
	return error;
}


static int ep_file_init(struct file *file)
{
	struct eventpoll *ep;

	if (!(ep = kmalloc(sizeof(struct eventpoll), GFP_KERNEL)))
		return -ENOMEM;

	memset(ep, 0, sizeof(*ep));
	rwlock_init(&ep->lock);
	init_rwsem(&ep->sem);
	init_waitqueue_head(&ep->wq);
	init_waitqueue_head(&ep->poll_wait);
	INIT_LIST_HEAD(&ep->rdllist);
	ep->rbr = RB_ROOT;

	file->private_data = ep;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_file_init() ep=%p\n",
		     current, ep));
	return 0;
}


static void ep_free(struct eventpoll *ep)
{
	struct rb_node *rbp;
	struct epitem *epi;

	/* We need to release all tasks waiting for this file */
	if (waitqueue_active(&ep->poll_wait))
		ep_poll_safewake(&psw, &ep->poll_wait);

	/*
	 * We need to lock this because we could be hit by
	 * eventpoll_release_file() while we're freeing the "struct eventpoll".
	 * We do not need to hold "ep->sem" here because the epoll file
	 * is on the way to be removed and no one has references to it
	 * anymore. The only hit might come from eventpoll_release_file() but
	 * holding "epsem" is sufficient here.
	 */
	down(&epsem);

	/*
	 * Walks through the whole tree by unregistering poll callbacks.
	 */
	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
		epi = rb_entry(rbp, struct epitem, rbn);

		ep_unregister_pollwait(ep, epi);
	}

	/*
	 * Walks through the whole tree by freeing each "struct epitem". At this
	 * point we are sure no poll callbacks will be lingering around, and also by
	 * write-holding "sem" we can be sure that no file cleanup code will hit
	 * us during this operation. So we can avoid the lock on "ep->lock".
	 */
	while ((rbp = rb_first(&ep->rbr)) != 0) {
		epi = rb_entry(rbp, struct epitem, rbn);
		ep_remove(ep, epi);
	}

	up(&epsem);
}


/*
 * Search the file inside the eventpoll tree. It adds a usage count to
 * the returned item, so the caller must call ep_release_epitem()
 * when finished using the "struct epitem".
 */
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
	int kcmp;
	unsigned long flags;
	struct rb_node *rbp;
	struct epitem *epi, *epir = NULL;
	struct epoll_filefd ffd;

	EP_SET_FFD(&ffd, file, fd);
	read_lock_irqsave(&ep->lock, flags);
	for (rbp = ep->rbr.rb_node; rbp; ) {
		epi = rb_entry(rbp, struct epitem, rbn);
		kcmp = EP_CMP_FFD(&ffd, &epi->ffd);
		if (kcmp > 0)
			rbp = rbp->rb_right;
		else if (kcmp < 0)
			rbp = rbp->rb_left;
		else {
			ep_use_epitem(epi);
			epir = epi;
			break;
		}
	}
	read_unlock_irqrestore(&ep->lock, flags);

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n",
		     current, file, epir));

	return epir;
}


/*
 * Increment the usage count of the "struct epitem", making sure
 * that the user will have a valid pointer to reference.
 */
static void ep_use_epitem(struct epitem *epi)
{

	atomic_inc(&epi->usecnt);
}


/*
 * Decrement ( release ) the usage count by signaling that the user
 * has finished using the structure. It might lead to freeing the
 * structure itself if the count goes to zero.
 */
static void ep_release_epitem(struct epitem *epi)
{

	if (atomic_dec_and_test(&epi->usecnt))
		EPI_MEM_FREE(epi);
}


/*
 * This is the callback that is used to add our wait queue to the
 * target file wakeup lists.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	struct epitem *epi = EP_ITEM_FROM_EPQUEUE(pt);
	struct eppoll_entry *pwq;

	if (epi->nwait >= 0 && (pwq = PWQ_MEM_ALLOC())) {
		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
		pwq->whead = whead;
		pwq->base = epi;
		add_wait_queue(whead, &pwq->wait);
		list_add_tail(&pwq->llink, &epi->pwqlist);
		epi->nwait++;
	} else {
		/* We have to signal that an error occurred */
		epi->nwait = -1;
	}
}

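
/*
 * Illustrative sketch (not part of the build): how the callback above gets
 * invoked. A driver's f_op->poll() method calls poll_wait() on its wait
 * queue head; when the poll_table was initialized by ep_insert() with
 * init_poll_funcptr(&epq.pt, ep_ptable_queue_proc), that poll_wait() call
 * lands in ep_ptable_queue_proc() and hooks "pwq->wait" onto the driver's
 * wait queue. The driver name, wait queue and ready flag are hypothetical.
 */
#if 0
static DECLARE_WAIT_QUEUE_HEAD(example_wqh);	/* hypothetical driver wait queue */
static int example_data_ready;			/* hypothetical ready flag */

static unsigned int example_drv_poll(struct file *file, poll_table *pt)
{
	/* For an epoll caller, this invokes ep_ptable_queue_proc(). */
	poll_wait(file, &example_wqh, pt);

	return example_data_ready ? (POLLIN | POLLRDNORM) : 0;
}
#endif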

static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
{
	int kcmp;
	struct rb_node **p = &ep->rbr.rb_node, *parent = NULL;
	struct epitem *epic;

	while (*p) {
		parent = *p;
		epic = rb_entry(parent, struct epitem, rbn);
		kcmp = EP_CMP_FFD(&epi->ffd, &epic->ffd);
		if (kcmp > 0)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&epi->rbn, parent, p);
	rb_insert_color(&epi->rbn, &ep->rbr);
}


static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
		     struct file *tfile, int fd)
{
	int error, revents, pwake = 0;
	unsigned long flags;
	struct epitem *epi;
	struct ep_pqueue epq;

	error = -ENOMEM;
	if (!(epi = EPI_MEM_ALLOC()))
		goto eexit_1;

	/* Item initialization follows here ... */
	EP_RB_INITNODE(&epi->rbn);
	INIT_LIST_HEAD(&epi->rdllink);
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->txlink);
	INIT_LIST_HEAD(&epi->pwqlist);
	epi->ep = ep;
	EP_SET_FFD(&epi->ffd, tfile, fd);
	epi->event = *event;
	atomic_set(&epi->usecnt, 1);
	epi->nwait = 0;

	/* Initialize the poll table using the queue callback */
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Attach the item to the poll hooks and get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function.
	 */
	revents = tfile->f_op->poll(tfile, &epq.pt);

	/*
	 * We have to check if something went wrong during the poll wait queue
	 * install process. Namely, an allocation for a wait queue could have
	 * failed due to high memory pressure.
	 */
	if (epi->nwait < 0)
		goto eexit_2;

	/* Add the current item to the list of active epoll hooks for this file */
	spin_lock(&tfile->f_ep_lock);
	list_add_tail(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_ep_lock);

	/* We have to drop the new item inside our item list to keep track of it */
	write_lock_irqsave(&ep->lock, flags);

	/* Add the current item to the rb-tree */
	ep_rbtree_insert(ep, epi);

	/* If the file is already "ready" we drop it inside the ready list */
	if ((revents & event->events) && !EP_IS_LINKED(&epi->rdllink)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);

		/* Notify waiting tasks that events are available */
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	write_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&psw, &ep->poll_wait);

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
		     current, ep, tfile, fd));

	return 0;

eexit_2:
	ep_unregister_pollwait(ep, epi);

	/*
	 * We need to do this because an event could have arrived on some
	 * allocated wait queue.
	 */
	write_lock_irqsave(&ep->lock, flags);
	if (EP_IS_LINKED(&epi->rdllink))
		EP_LIST_DEL(&epi->rdllink);
	write_unlock_irqrestore(&ep->lock, flags);

	EPI_MEM_FREE(epi);
eexit_1:
	return error;
}


/*
 * Modify the interest event mask by dropping an event if the new mask
 * has a match in the current file status.
 */
static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)
{
	int pwake = 0;
	unsigned int revents;
	unsigned long flags;

	/*
	 * Set the new event interest mask before calling f_op->poll(), otherwise
	 * a potential race might occur. In fact, if we do this operation inside
	 * the lock, an event might happen between the f_op->poll() call and the
	 * new event set registering.
	 */
	epi->event.events = event->events;

	/*
	 * Get current event bits. We can safely use the file* here because
	 * its usage count has been increased by the caller of this function.
	 */
	revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);

	write_lock_irqsave(&ep->lock, flags);

	/* Copy the data member from inside the lock */
	epi->event.data = event->data;

	/*
	 * If the item is not linked to the rb-tree it means that it's on its
	 * way towards removal. Do nothing in this case.
	 */
	if (EP_RB_LINKED(&epi->rbn)) {
		/*
		 * If the item is "hot" and it is not registered inside the ready
		 * list, push it inside. If the item is not "hot" and it is currently
		 * registered inside the ready list, unlink it.
		 */
		if (revents & event->events) {
			if (!EP_IS_LINKED(&epi->rdllink)) {
				list_add_tail(&epi->rdllink, &ep->rdllist);

				/* Notify waiting tasks that events are available */
				if (waitqueue_active(&ep->wq))
					wake_up(&ep->wq);
				if (waitqueue_active(&ep->poll_wait))
					pwake++;
			}
		}
	}

	write_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&psw, &ep->poll_wait);

	return 0;
}

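
/*
 * Illustrative user space sketch (not part of the kernel build): how
 * ep_modify() is typically reached. With EPOLLONESHOT the poll callback
 * clears the event mask down to EP_PRIVATE_BITS after the first event,
 * so the descriptor stays silent until the user re-arms it with
 * EPOLL_CTL_MOD, which is serviced by this function. The function name
 * is hypothetical.
 */
#if 0
#include <sys/epoll.h>

static int rearm_oneshot_sketch(int epfd, int fd)
{
	struct epoll_event ev;

	ev.events = EPOLLIN | EPOLLONESHOT;	/* fire once, then disable */
	ev.data.fd = fd;
	return epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev);
}
#endif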

/*
 * This function unregisters poll callbacks from the associated file descriptor.
 * Since this must be called without holding "ep->lock", the atomic exchange trick
 * will protect us from multiple unregisters.
 */
static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
{
	int nwait;
	struct list_head *lsthead = &epi->pwqlist;
	struct eppoll_entry *pwq;

	/* This is called without locks, so we need the atomic exchange */
	nwait = xchg(&epi->nwait, 0);

	if (nwait) {
		while (!list_empty(lsthead)) {
			pwq = list_entry(lsthead->next, struct eppoll_entry, llink);

			EP_LIST_DEL(&pwq->llink);
			remove_wait_queue(pwq->whead, &pwq->wait);
			PWQ_MEM_FREE(pwq);
		}
	}
}

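
/*
 * Illustrative sketch (not compiled): the "atomic exchange trick" above,
 * reduced to its essence. Whichever caller observes the non-zero value
 * claims the teardown work; every later caller sees zero and returns, so
 * the wait queues are unregistered exactly once even without holding
 * "ep->lock". The function name is hypothetical.
 */
#if 0
static void xchg_claim_sketch(struct epitem *epi)
{
	if (xchg(&epi->nwait, 0) == 0)
		return;		/* someone else already did the teardown */

	/* ... only one caller ever reaches this point ... */
}
#endif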

/*
 * Unlink the "struct epitem" from all places it might have been hooked up.
 * This function must be called with write IRQ lock on "ep->lock".
 */
static int ep_unlink(struct eventpoll *ep, struct epitem *epi)
{
	int error;

	/*
	 * It can happen that this one is called for an item already unlinked.
	 * The check protects us from doing a double unlink ( crash ).
	 */
	error = -ENOENT;
	if (!EP_RB_LINKED(&epi->rbn))
		goto eexit_1;

	/*
	 * Clear the event mask for the unlinked item. This will avoid item
	 * notifications to be sent after the unlink operation from inside
	 * the kernel->userspace event transfer loop.
	 */
	epi->event.events = 0;

	/*
	 * At this point it is safe to do the job: unlink the item from our rb-tree.
	 * This operation together with the above check closes the door to
	 * double unlinks.
	 */
	EP_RB_ERASE(&epi->rbn, &ep->rbr);

	/*
	 * If the item we are going to remove is inside the ready file descriptors
	 * we want to remove it from this list to avoid stale events.
	 */
	if (EP_IS_LINKED(&epi->rdllink))
		EP_LIST_DEL(&epi->rdllink);

	error = 0;
eexit_1:

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_unlink(%p, %p) = %d\n",
		     current, ep, epi->file, error));

	return error;
}


/*
 * Removes a "struct epitem" from the eventpoll tree and deallocates
 * all the associated resources.
 */
static int ep_remove(struct eventpoll *ep, struct epitem *epi)
{
	int error;
	unsigned long flags;
	struct file *file = epi->ffd.file;

	/*
	 * Removes poll wait queue hooks. We _have_ to do this without holding
	 * the "ep->lock" otherwise a deadlock might occur. This is because of
	 * the lock acquisition sequence: here we would take "ep->lock" and then
	 * the wait queue head lock when unregistering the wait queue, while the
	 * wakeup callback runs holding the wait queue head lock and then calls
	 * our callback, which tries to get "ep->lock".
	 */
	ep_unregister_pollwait(ep, epi);

	/* Remove the current item from the list of epoll hooks */
	spin_lock(&file->f_ep_lock);
	if (EP_IS_LINKED(&epi->fllink))
		EP_LIST_DEL(&epi->fllink);
	spin_unlock(&file->f_ep_lock);

	/* We need to acquire the write IRQ lock before calling ep_unlink() */
	write_lock_irqsave(&ep->lock, flags);

	/* Really unlink the item from the tree */
	error = ep_unlink(ep, epi);

	write_unlock_irqrestore(&ep->lock, flags);

	if (error)
		goto eexit_1;

	/* At this point it is safe to free the eventpoll item */
	ep_release_epitem(epi);

	error = 0;
eexit_1:
	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p) = %d\n",
		     current, ep, file, error));

	return error;
}


/*
 * This is the callback that is passed to the wait queue wakeup
 * mechanism. It is called by the stored file descriptors when they
 * have events to report.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	int pwake = 0;
	unsigned long flags;
	struct epitem *epi = EP_ITEM_FROM_WAIT(wait);
	struct eventpoll *ep = epi->ep;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
		     current, epi->file, epi, ep));

	write_lock_irqsave(&ep->lock, flags);

	/*
	 * If the event mask does not contain any poll(2) event, we consider the
	 * descriptor to be disabled. This condition is likely the effect of the
	 * EPOLLONESHOT bit that disables the descriptor when an event is received,
	 * until the next EPOLL_CTL_MOD is issued.
	 */
	if (!(epi->event.events & ~EP_PRIVATE_BITS))
		goto is_disabled;

	/* If this file is already in the ready list we exit soon */
	if (EP_IS_LINKED(&epi->rdllink))
		goto is_linked;

	list_add_tail(&epi->rdllink, &ep->rdllist);

is_linked:
	/*
	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
	 * wait list.
	 */
	if (waitqueue_active(&ep->wq))
		wake_up(&ep->wq);
	if (waitqueue_active(&ep->poll_wait))
		pwake++;

is_disabled:
	write_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&psw, &ep->poll_wait);

	return 1;
}

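
/*
 * Illustrative sketch (not part of the build): the notification side of the
 * hook installed by ep_ptable_queue_proc(). When a driver has new data it
 * wakes its wait queue head; the wake up code walks the queued entries and,
 * for ours, calls the function installed with init_waitqueue_func_entry(),
 * i.e. ep_poll_callback() above. The driver wait queue head is hypothetical.
 */
#if 0
static DECLARE_WAIT_QUEUE_HEAD(example_wqh);	/* hypothetical driver wait queue */

static void example_drv_data_ready(void)
{
	/*
	 * Runs every wait queue entry's callback; for entries queued by
	 * ep_ptable_queue_proc() that callback is ep_poll_callback().
	 */
	wake_up(&example_wqh);
}
#endif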

static int ep_eventpoll_close(struct inode *inode, struct file *file)
{
	struct eventpoll *ep = file->private_data;

	if (ep) {
		ep_free(ep);
		kfree(ep);
	}

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
	return 0;
}


static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
{
	unsigned int pollflags = 0;
	unsigned long flags;
	struct eventpoll *ep = file->private_data;

	/* Insert inside our poll wait queue */
	poll_wait(file, &ep->poll_wait, wait);

	/* Check our condition */
	read_lock_irqsave(&ep->lock, flags);
	if (!list_empty(&ep->rdllist))
		pollflags = POLLIN | POLLRDNORM;
	read_unlock_irqrestore(&ep->lock, flags);

	return pollflags;
}


/*
 * Since we have to release the lock during the __copy_to_user() operation and
 * during the f_op->poll() call, we try to collect the maximum number of items
 * by reducing the irqlock/irqunlock switching rate.
 */
static int ep_collect_ready_items(struct eventpoll *ep, struct list_head *txlist, int maxevents)
{
	int nepi;
	unsigned long flags;
	struct list_head *lsthead = &ep->rdllist, *lnk;
	struct epitem *epi;

	write_lock_irqsave(&ep->lock, flags);

	for (nepi = 0, lnk = lsthead->next; lnk != lsthead && nepi < maxevents;) {
		epi = list_entry(lnk, struct epitem, rdllink);

		lnk = lnk->next;

		/* Items already linked to the transfer list are skipped */
		if (!EP_IS_LINKED(&epi->txlink)) {
			/*
			 * This is initialized in this way so that the default
			 * behaviour of the reinjecting code will be to push back
			 * the item inside the ready list.
			 */
			epi->revents = epi->event.events;

			/* Link the ready item into the transfer list */
			list_add(&epi->txlink, txlist);
			nepi++;

			/*
			 * Unlink the item from the ready list.
			 */
			EP_LIST_DEL(&epi->rdllink);
		}
	}

	write_unlock_irqrestore(&ep->lock, flags);

	return nepi;
}


/*
 * This function is called without holding the "ep->lock" since the call to
 * __copy_to_user() might sleep, and also f_op->poll() might re-enable IRQs
 * because of the way poll() is traditionally implemented in Linux.
 */
static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
			  struct epoll_event __user *events)
{
	int eventcnt = 0;
	unsigned int revents;
	struct list_head *lnk;
	struct epitem *epi;

	/*
	 * We can loop without lock because this is a task private list.
	 * The test done during the collection loop will guarantee us that
	 * another task will not try to collect this file. Also, items
	 * cannot vanish during the loop because we are holding "sem".
	 */
	list_for_each(lnk, txlist) {
		epi = list_entry(lnk, struct epitem, txlink);

		/*
		 * Get the ready file event set. We can safely use the file
		 * because we are holding the "sem" in read and this will
		 * guarantee that both the file and the item will not vanish.
		 */
		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);

		/*
		 * Set the return event set for the current file descriptor.
		 * Note that only the task that was successfully able to link
		 * the item to its "txlist" will write this field.
		 */
		epi->revents = revents & epi->event.events;

		if (epi->revents) {
			if (__put_user(epi->revents,
				       &events[eventcnt].events) ||
			    __put_user(epi->event.data,
				       &events[eventcnt].data))
				return -EFAULT;
			if (epi->event.events & EPOLLONESHOT)
				epi->event.events &= EP_PRIVATE_BITS;
			eventcnt++;
		}
	}
	return eventcnt;
}


/*
 * Walk through the transfer list we collected with ep_collect_ready_items()
 * and, if 1) the item is still "alive", 2) its event set is not empty, and
 * 3) it's not already linked, link it to the ready list. Same as above, we
 * are holding "sem" so items cannot vanish underneath our nose.
 */
static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
{
	int ricnt = 0, pwake = 0;
	unsigned long flags;
	struct epitem *epi;

	write_lock_irqsave(&ep->lock, flags);

	while (!list_empty(txlist)) {
		epi = list_entry(txlist->next, struct epitem, txlink);

		/* Unlink the current item from the transfer list */
		EP_LIST_DEL(&epi->txlink);

		/*
		 * If the item is no longer linked to the interest set, we don't
		 * have to push it inside the ready list because the following
		 * ep_release_epitem() is going to drop it. Also, if the current
		 * item is set to have an Edge Triggered behaviour, we don't have
		 * to push it back either.
		 */
		if (EP_RB_LINKED(&epi->rbn) && !(epi->event.events & EPOLLET) &&
		    (epi->revents & epi->event.events) && !EP_IS_LINKED(&epi->rdllink)) {
			list_add_tail(&epi->rdllink, &ep->rdllist);
			ricnt++;
		}
	}

	if (ricnt) {
		/*
		 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
		 * wait list.
		 */
		if (waitqueue_active(&ep->wq))
			wake_up(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	write_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&psw, &ep->poll_wait);
}

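
/*
 * Illustrative user space sketch (not part of the kernel build): the
 * consequence of the EPOLLET test above. Edge triggered items are not
 * pushed back onto the ready list, so the user must drain the file
 * descriptor until EAGAIN or no further event will be reported for data
 * that is already there. Assumes "fd" was set non-blocking (O_NONBLOCK);
 * the buffer size is an example value.
 */
#if 0
#include <unistd.h>
#include <errno.h>

static void drain_edge_triggered_sketch(int fd)
{
	char buf[4096];
	ssize_t n;

	do {
		n = read(fd, buf, sizeof(buf));
	} while (n > 0);

	/* n < 0 && errno == EAGAIN: everything consumed, wait again */
}
#endif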

/*
 * Perform the transfer of events to user space.
 */
static int ep_events_transfer(struct eventpoll *ep,
			      struct epoll_event __user *events, int maxevents)
{
	int eventcnt = 0;
	struct list_head txlist;

	INIT_LIST_HEAD(&txlist);

	/*
	 * We need to lock this because we could be hit by
	 * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
	 */
	down_read(&ep->sem);

	/* Collect/extract ready items */
	if (ep_collect_ready_items(ep, &txlist, maxevents) > 0) {
		/* Build result set in userspace */
		eventcnt = ep_send_events(ep, &txlist, events);

		/* Reinject ready items into the ready list */
		ep_reinject_items(ep, &txlist);
	}

	up_read(&ep->sem);

	return eventcnt;
}


static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res, eavail;
	unsigned long flags;
	long jtimeout;
	wait_queue_t wait;

	/*
	 * Calculate the timeout by checking for the "infinite" value ( -1 )
	 * and the overflow condition. The passed timeout is in milliseconds,
	 * that's why (t * HZ) / 1000.
	 */
	jtimeout = timeout == -1 || timeout > (MAX_SCHEDULE_TIMEOUT - 1000) / HZ ?
		MAX_SCHEDULE_TIMEOUT: (timeout * HZ + 999) / 1000;

retry:
	write_lock_irqsave(&ep->lock, flags);

	res = 0;
	if (list_empty(&ep->rdllist)) {
		/*
		 * We don't have any available event to return to the caller.
		 * We need to sleep here, and we will be woken up by
		 * ep_poll_callback() when events become available.
		 */
		init_waitqueue_entry(&wait, current);
		add_wait_queue(&ep->wq, &wait);

		for (;;) {
			/*
			 * We don't want to sleep if ep_poll_callback() sends us
			 * a wakeup in between. That's why we set the task state
			 * to TASK_INTERRUPTIBLE before doing the checks.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			if (!list_empty(&ep->rdllist) || !jtimeout)
				break;
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			write_unlock_irqrestore(&ep->lock, flags);
			jtimeout = schedule_timeout(jtimeout);
			write_lock_irqsave(&ep->lock, flags);
		}
		remove_wait_queue(&ep->wq, &wait);

		set_current_state(TASK_RUNNING);
	}

	/* Is it worth trying to dig for events ? */
	eavail = !list_empty(&ep->rdllist);

	write_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we try again in search of more luck.
	 */
	if (!res && eavail &&
	    !(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)
		goto retry;

	return res;
}

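
/*
 * Illustrative sketch (not compiled): the millisecond-to-jiffies conversion
 * used by ep_poll(), pulled out into a helper with a worked example. With
 * HZ == 1000, a 1 ms timeout gives (1 * 1000 + 999) / 1000 == 1 jiffy; with
 * HZ == 100 the same 1 ms still rounds up to 1 jiffy instead of truncating
 * to zero. The helper name is hypothetical.
 */
#if 0
static long ep_timeout_to_jiffies_sketch(long timeout_ms)
{
	if (timeout_ms == -1 ||
	    timeout_ms > (MAX_SCHEDULE_TIMEOUT - 1000) / HZ)
		return MAX_SCHEDULE_TIMEOUT;	/* "infinite" wait */

	return (timeout_ms * HZ + 999) / 1000;	/* round up to >= 1 jiffy */
}
#endif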

static int eventpollfs_delete_dentry(struct dentry *dentry)
{

	return 1;
}


static struct inode *ep_eventpoll_inode(void)
{
	int error = -ENOMEM;
	struct inode *inode = new_inode(eventpoll_mnt->mnt_sb);

	if (!inode)
		goto eexit_1;

	inode->i_fop = &eventpoll_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because mark_inode_dirty() will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IRUSR | S_IWUSR;
	inode->i_uid = current->fsuid;
	inode->i_gid = current->fsgid;
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
	inode->i_blksize = PAGE_SIZE;
	return inode;

eexit_1:
	return ERR_PTR(error);
}


static struct super_block *
eventpollfs_get_sb(struct file_system_type *fs_type, int flags,
		   const char *dev_name, void *data)
{
	return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC);
}


static int __init eventpoll_init(void)
{
	int error;

	init_MUTEX(&epsem);

	/* Initialize the structure used to perform safe poll wait head wake ups */
	ep_poll_safewake_init(&psw);

	/* Allocates slab cache used to allocate "struct epitem" items */
	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
				      0, SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC,
				      NULL, NULL);

	/* Allocates slab cache used to allocate "struct eppoll_entry" */
	pwq_cache = kmem_cache_create("eventpoll_pwq",
				      sizeof(struct eppoll_entry), 0,
				      EPI_SLAB_DEBUG|SLAB_PANIC, NULL, NULL);

	/*
	 * Register the virtual file system that will be the source of inodes
	 * for the eventpoll files
	 */
	error = register_filesystem(&eventpoll_fs_type);
	if (error)
		goto epanic;

	/* Mount the virtual file system registered above */
	eventpoll_mnt = kern_mount(&eventpoll_fs_type);
	error = PTR_ERR(eventpoll_mnt);
	if (IS_ERR(eventpoll_mnt))
		goto epanic;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: successfully initialized.\n",
		     current));
	return 0;

epanic:
	panic("eventpoll_init() failed\n");
}


static void __exit eventpoll_exit(void)
{
	/* Undo all operations done inside eventpoll_init() */
	unregister_filesystem(&eventpoll_fs_type);
	mntput(eventpoll_mnt);
	kmem_cache_destroy(pwq_cache);
	kmem_cache_destroy(epi_cache);
}

module_init(eventpoll_init);
module_exit(eventpoll_exit);

MODULE_LICENSE("GPL");