/*
 * kvm eventfd support - use eventfd objects to signal various KVM events
 *
 * Copyright 2009 Novell.  All Rights Reserved.
 *
 * Author:
 *	Gregory Haskins <ghaskins@novell.com>
 *
 * This file is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
 */

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/workqueue.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/list.h>
#include <linux/eventfd.h>
#include <linux/kernel.h>

#include "iodev.h"

/*
 * --------------------------------------------------------------------
 * irqfd: Allows an fd to be used to inject an interrupt to the guest
 *
 * Credit goes to Avi Kivity for the original idea.
 * --------------------------------------------------------------------
 */

struct _irqfd {
	struct kvm *kvm;
	struct eventfd_ctx *eventfd;
	int gsi;
	struct list_head list;
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_t wait;
	struct work_struct inject;
	struct work_struct shutdown;
};

static struct workqueue_struct *irqfd_cleanup_wq;

static void
irqfd_inject(struct work_struct *work)
{
	struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
	struct kvm *kvm = irqfd->kvm;

	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1);
	kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0);
}

/*
 * Race-free decouple logic (ordering is critical)
 */
static void
irqfd_shutdown(struct work_struct *work)
{
	struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);

	/*
	 * Synchronize with the wait-queue and unhook ourselves to prevent
	 * further events.
	 */
	remove_wait_queue(irqfd->wqh, &irqfd->wait);

	/*
	 * We know no new events will be scheduled at this point, so block
	 * until all previously outstanding events have completed.
	 */
	flush_work(&irqfd->inject);

	/*
	 * It is now safe to release the object's resources.
	 */
	eventfd_ctx_put(irqfd->eventfd);
	kfree(irqfd);
}

/* assumes kvm->irqfds.lock is held */
static bool
irqfd_is_active(struct _irqfd *irqfd)
{
	return list_empty(&irqfd->list) ? false : true;
}
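/*
 * Illustrative userspace usage (a sketch with placeholder names, not part
 * of this file): an fd obtained from eventfd(2) is bound to a GSI with the
 * KVM_IRQFD ioctl on the VM fd, e.g.
 *
 *	struct kvm_irqfd data = { .fd = efd, .gsi = gsi };
 *	ioctl(vm_fd, KVM_IRQFD, &data);
 *
 * After that, each write() to 'efd' lands in irqfd_wakeup() below, which
 * schedules irqfd_inject() above to raise and then lower an edge on 'gsi'.
 */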
/*
 * Mark the irqfd as inactive and schedule it for removal
 *
 * assumes kvm->irqfds.lock is held
 */
static void
irqfd_deactivate(struct _irqfd *irqfd)
{
	BUG_ON(!irqfd_is_active(irqfd));

	list_del_init(&irqfd->list);

	queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
}

/*
 * Called with wqh->lock held and interrupts disabled
 */
static int
irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
	unsigned long flags = (unsigned long)key;

	if (flags & POLLIN)
		/* An event has been signaled, inject an interrupt */
		schedule_work(&irqfd->inject);

	if (flags & POLLHUP) {
		/* The eventfd is closing, detach from KVM */
		struct kvm *kvm = irqfd->kvm;
		unsigned long flags;

		spin_lock_irqsave(&kvm->irqfds.lock, flags);

		/*
		 * We must check if someone deactivated the irqfd before
		 * we could acquire the irqfds.lock since the item is
		 * deactivated from the KVM side before it is unhooked from
		 * the wait-queue.  If it is already deactivated, we can
		 * simply return knowing the other side will cleanup for us.
		 * We cannot race against the irqfd going away since the
		 * other side is required to acquire wqh->lock, which we hold.
		 */
		if (irqfd_is_active(irqfd))
			irqfd_deactivate(irqfd);

		spin_unlock_irqrestore(&kvm->irqfds.lock, flags);
	}

	return 0;
}

static void
irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
			poll_table *pt)
{
	struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt);

	irqfd->wqh = wqh;
	add_wait_queue(wqh, &irqfd->wait);
}

static int
kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi)
{
	struct _irqfd *irqfd;
	struct file *file = NULL;
	struct eventfd_ctx *eventfd = NULL;
	int ret;
	unsigned int events;

	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
	if (!irqfd)
		return -ENOMEM;

	irqfd->kvm = kvm;
	irqfd->gsi = gsi;
	INIT_LIST_HEAD(&irqfd->list);
	INIT_WORK(&irqfd->inject, irqfd_inject);
	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);

	file = eventfd_fget(fd);
	if (IS_ERR(file)) {
		ret = PTR_ERR(file);
		goto fail;
	}

	eventfd = eventfd_ctx_fileget(file);
	if (IS_ERR(eventfd)) {
		ret = PTR_ERR(eventfd);
		goto fail;
	}

	irqfd->eventfd = eventfd;

	/*
	 * Install our own custom wake-up handling so we are notified via
	 * a callback whenever someone signals the underlying eventfd.
	 */
	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
	init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);

	events = file->f_op->poll(file, &irqfd->pt);

	spin_lock_irq(&kvm->irqfds.lock);
	list_add_tail(&irqfd->list, &kvm->irqfds.items);
	spin_unlock_irq(&kvm->irqfds.lock);

	/*
	 * Check if there was an event already pending on the eventfd
	 * before we registered, and trigger it as if we didn't miss it.
	 */
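	/*
	 * Note that file->f_op->poll() above both hooked us into the
	 * eventfd's wait-queue and reported its then-current state, so a
	 * signal that arrived before we registered shows up in 'events'
	 * here, while one arriving afterwards reaches irqfd_wakeup()
	 * directly; either way it is not lost.
	 */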
	if (events & POLLIN)
		schedule_work(&irqfd->inject);

	/*
	 * Do not drop the file until the irqfd is fully initialized,
	 * otherwise we might race against the POLLHUP.
	 */
	fput(file);

	return 0;

fail:
	if (eventfd && !IS_ERR(eventfd))
		eventfd_ctx_put(eventfd);

	if (!IS_ERR(file))
		fput(file);

	kfree(irqfd);
	return ret;
}

void
kvm_eventfd_init(struct kvm *kvm)
{
	spin_lock_init(&kvm->irqfds.lock);
	INIT_LIST_HEAD(&kvm->irqfds.items);
	INIT_LIST_HEAD(&kvm->ioeventfds);
}

/*
 * Shut down any irqfds that match fd+gsi
 */
static int
kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
{
	struct _irqfd *irqfd, *tmp;
	struct eventfd_ctx *eventfd;

	eventfd = eventfd_ctx_fdget(fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	spin_lock_irq(&kvm->irqfds.lock);

	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
		if (irqfd->eventfd == eventfd && irqfd->gsi == gsi)
			irqfd_deactivate(irqfd);
	}

	spin_unlock_irq(&kvm->irqfds.lock);
	eventfd_ctx_put(eventfd);

	/*
	 * Block until we know all outstanding shutdown jobs have completed
	 * so that we guarantee there will not be any more interrupts on this
	 * gsi once this deassign function returns.
	 */
	flush_workqueue(irqfd_cleanup_wq);

	return 0;
}

int
kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags)
{
	if (flags & KVM_IRQFD_FLAG_DEASSIGN)
		return kvm_irqfd_deassign(kvm, fd, gsi);

	return kvm_irqfd_assign(kvm, fd, gsi);
}

/*
 * This function is called as the kvm VM fd is being released.  Shut down
 * all irqfds that still remain open.
 */
void
kvm_irqfd_release(struct kvm *kvm)
{
	struct _irqfd *irqfd, *tmp;

	spin_lock_irq(&kvm->irqfds.lock);

	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list)
		irqfd_deactivate(irqfd);

	spin_unlock_irq(&kvm->irqfds.lock);

	/*
	 * Block until we know all outstanding shutdown jobs have completed
	 * since we do not take a kvm* reference.
	 */
	flush_workqueue(irqfd_cleanup_wq);
}

/*
 * Create a host-wide workqueue for issuing deferred shutdown requests
 * aggregated from all vm* instances.  We need our own isolated
 * single-thread queue to prevent deadlock against flushing the normal
 * work-queue.
 */
static int __init irqfd_module_init(void)
{
	irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup");
	if (!irqfd_cleanup_wq)
		return -ENOMEM;

	return 0;
}

static void __exit irqfd_module_exit(void)
{
	destroy_workqueue(irqfd_cleanup_wq);
}

module_init(irqfd_module_init);
module_exit(irqfd_module_exit);

/*
 * --------------------------------------------------------------------
 * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
 *
 * userspace can register a PIO/MMIO address with an eventfd for receiving
 * notification when the memory has been touched.
 * --------------------------------------------------------------------
 */
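/*
 * Illustrative userspace usage (a sketch with placeholder names, not part
 * of this file): a common consumer is a virtio-style device model that
 * wants a doorbell write to signal an eventfd instead of exiting to
 * userspace, e.g.
 *
 *	struct kvm_ioeventfd data = {
 *		.addr      = doorbell_gpa,
 *		.len       = 2,
 *		.fd        = efd,
 *		.flags     = KVM_IOEVENTFD_FLAG_DATAMATCH,
 *		.datamatch = queue_index,
 *	};
 *	ioctl(vm_fd, KVM_IOEVENTFD, &data);
 *
 * Guest writes of 'queue_index' to 'doorbell_gpa' then signal 'efd' via
 * ioeventfd_write() below; omitting the DATAMATCH flag makes the
 * registration a wildcard that matches any value written to that range.
 */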
struct _ioeventfd {
	struct list_head list;
	u64 addr;
	int length;
	struct eventfd_ctx *eventfd;
	u64 datamatch;
	struct kvm_io_device dev;
	bool wildcard;
};

static inline struct _ioeventfd *
to_ioeventfd(struct kvm_io_device *dev)
{
	return container_of(dev, struct _ioeventfd, dev);
}

static void
ioeventfd_release(struct _ioeventfd *p)
{
	eventfd_ctx_put(p->eventfd);
	list_del(&p->list);
	kfree(p);
}

static bool
ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
{
	u64 _val;

	if (!(addr == p->addr && len == p->length))
		/* address-range must be precise for a hit */
		return false;

	if (p->wildcard)
		/* all else equal, wildcard is always a hit */
		return true;

	/* otherwise, we have to actually compare the data */

	BUG_ON(!IS_ALIGNED((unsigned long)val, len));

	switch (len) {
	case 1:
		_val = *(u8 *)val;
		break;
	case 2:
		_val = *(u16 *)val;
		break;
	case 4:
		_val = *(u32 *)val;
		break;
	case 8:
		_val = *(u64 *)val;
		break;
	default:
		return false;
	}

	return _val == p->datamatch ? true : false;
}

/* MMIO/PIO writes trigger an event if the addr/val match */
static int
ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
		const void *val)
{
	struct _ioeventfd *p = to_ioeventfd(this);

	if (!ioeventfd_in_range(p, addr, len, val))
		return -EOPNOTSUPP;

	eventfd_signal(p->eventfd, 1);
	return 0;
}

/*
 * This function is called as KVM is completely shutting down.  We do not
 * need to worry about locking; just nuke anything we have as quickly as
 * possible.
 */
static void
ioeventfd_destructor(struct kvm_io_device *this)
{
	struct _ioeventfd *p = to_ioeventfd(this);

	ioeventfd_release(p);
}

static const struct kvm_io_device_ops ioeventfd_ops = {
	.write      = ioeventfd_write,
	.destructor = ioeventfd_destructor,
};

/* assumes kvm->slots_lock held */
static bool
ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
{
	struct _ioeventfd *_p;

	list_for_each_entry(_p, &kvm->ioeventfds, list)
		if (_p->addr == p->addr && _p->length == p->length &&
		    (_p->wildcard || p->wildcard ||
		     _p->datamatch == p->datamatch))
			return true;

	return false;
}
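/*
 * For example, under the rule above two DATAMATCH registrations for the
 * same addr/len but different datamatch values may coexist, while a
 * wildcard registration for that addr/len collides with any existing
 * entry there (and vice versa).
 */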
static int
kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
	int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
	struct kvm_io_bus *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus;
	struct _ioeventfd *p;
	struct eventfd_ctx *eventfd;
	int ret;

	/* must be natural-word sized */
	switch (args->len) {
	case 1:
	case 2:
	case 4:
	case 8:
		break;
	default:
		return -EINVAL;
	}

	/* check for range overflow */
	if (args->addr + args->len < args->addr)
		return -EINVAL;

	/* check for extra flags that we don't understand */
	if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
		return -EINVAL;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p) {
		ret = -ENOMEM;
		goto fail;
	}

	INIT_LIST_HEAD(&p->list);
	p->addr = args->addr;
	p->length = args->len;
	p->eventfd = eventfd;

	/* The datamatch feature is optional, otherwise this is a wildcard */
	if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
		p->datamatch = args->datamatch;
	else
		p->wildcard = true;

	down_write(&kvm->slots_lock);

	/* Verify that there isn't a match already */
	if (ioeventfd_check_collision(kvm, p)) {
		ret = -EEXIST;
		goto unlock_fail;
	}

	kvm_iodevice_init(&p->dev, &ioeventfd_ops);

	ret = __kvm_io_bus_register_dev(bus, &p->dev);
	if (ret < 0)
		goto unlock_fail;

	list_add_tail(&p->list, &kvm->ioeventfds);

	up_write(&kvm->slots_lock);

	return 0;

unlock_fail:
	up_write(&kvm->slots_lock);

fail:
	kfree(p);
	eventfd_ctx_put(eventfd);

	return ret;
}

static int
kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
	int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
	struct kvm_io_bus *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus;
	struct _ioeventfd *p, *tmp;
	struct eventfd_ctx *eventfd;
	int ret = -ENOENT;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	down_write(&kvm->slots_lock);

	list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) {
		bool wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);

		if (p->eventfd != eventfd ||
		    p->addr != args->addr ||
		    p->length != args->len ||
		    p->wildcard != wildcard)
			continue;

		if (!p->wildcard && p->datamatch != args->datamatch)
			continue;

		__kvm_io_bus_unregister_dev(bus, &p->dev);
		ioeventfd_release(p);
		ret = 0;
		break;
	}

	up_write(&kvm->slots_lock);

	eventfd_ctx_put(eventfd);

	return ret;
}

int
kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
	if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
		return kvm_deassign_ioeventfd(kvm, args);

	return kvm_assign_ioeventfd(kvm, args);
}
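/*
 * Deassignment mirrors assignment (illustrative sketch): userspace passes
 * the same eventfd, addr, len and datamatch/wildcard choice it used when
 * assigning, with KVM_IOEVENTFD_FLAG_DEASSIGN set, and
 * kvm_deassign_ioeventfd() above drops the matching device from the bus:
 *
 *	data.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
 *	ioctl(vm_fd, KVM_IOEVENTFD, &data);
 */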