// SPDX-License-Identifier: GPL-2.0-only
/*
 * kvm eventfd support - use eventfd objects to signal various KVM events
 *
 * Copyright 2009 Novell. All Rights Reserved.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Author:
 *	Gregory Haskins <ghaskins@novell.com>
 */

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/kvm_irqfd.h>
#include <linux/workqueue.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/file.h>
#include <linux/list.h>
#include <linux/eventfd.h>
#include <linux/kernel.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/seqlock.h>
#include <linux/irqbypass.h>
#include <trace/events/kvm.h>

#include <kvm/iodev.h>

#ifdef CONFIG_HAVE_KVM_IRQFD

static struct workqueue_struct *irqfd_cleanup_wq;

bool __attribute__((weak))
kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args)
{
	return true;
}

static void
irqfd_inject(struct work_struct *work)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(work, struct kvm_kernel_irqfd, inject);
	struct kvm *kvm = irqfd->kvm;

	if (!irqfd->resampler) {
		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 1,
			    false);
		kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irqfd->gsi, 0,
			    false);
	} else
		kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
			    irqfd->gsi, 1, false);
}

static void irqfd_resampler_notify(struct kvm_kernel_irqfd_resampler *resampler)
{
	struct kvm_kernel_irqfd *irqfd;

	list_for_each_entry_srcu(irqfd, &resampler->list, resampler_link,
				 srcu_read_lock_held(&resampler->kvm->irq_srcu))
		eventfd_signal(irqfd->resamplefd, 1);
}

/*
 * Since resampler irqfds share an IRQ source ID, we de-assert once
 * then notify all of the resampler irqfds using this GSI. We can't
 * do multiple de-asserts or we risk racing with incoming re-asserts.
 */
static void
irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
{
	struct kvm_kernel_irqfd_resampler *resampler;
	struct kvm *kvm;
	int idx;

	resampler = container_of(kian,
			struct kvm_kernel_irqfd_resampler, notifier);
	kvm = resampler->kvm;

	kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
		    resampler->notifier.gsi, 0, false);

	idx = srcu_read_lock(&kvm->irq_srcu);
	irqfd_resampler_notify(resampler);
	srcu_read_unlock(&kvm->irq_srcu, idx);
}

static void
irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
{
	struct kvm_kernel_irqfd_resampler *resampler = irqfd->resampler;
	struct kvm *kvm = resampler->kvm;

	mutex_lock(&kvm->irqfds.resampler_lock);

	list_del_rcu(&irqfd->resampler_link);
	synchronize_srcu(&kvm->irq_srcu);

	if (list_empty(&resampler->list)) {
		list_del_rcu(&resampler->link);
		kvm_unregister_irq_ack_notifier(kvm, &resampler->notifier);
		/*
		 * synchronize_srcu(&kvm->irq_srcu) already called
		 * in kvm_unregister_irq_ack_notifier().
		 */
		kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
			    resampler->notifier.gsi, 0, false);
		kfree(resampler);
	}

	mutex_unlock(&kvm->irqfds.resampler_lock);
}

/*
 * Race-free decouple logic (ordering is critical)
 */
static void
irqfd_shutdown(struct work_struct *work)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(work, struct kvm_kernel_irqfd, shutdown);
	struct kvm *kvm = irqfd->kvm;
	u64 cnt;

	/* Make sure irqfd has been initialized in assign path. */
	synchronize_srcu(&kvm->irq_srcu);

	/*
	 * Synchronize with the wait-queue and unhook ourselves to prevent
	 * further events.
	 */
	eventfd_ctx_remove_wait_queue(irqfd->eventfd, &irqfd->wait, &cnt);

	/*
	 * We know no new events will be scheduled at this point, so block
	 * until all previously outstanding events have completed.
	 */
	flush_work(&irqfd->inject);

	if (irqfd->resampler) {
		irqfd_resampler_shutdown(irqfd);
		eventfd_ctx_put(irqfd->resamplefd);
	}

	/*
	 * It is now safe to release the object's resources.
	 */
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
	irq_bypass_unregister_consumer(&irqfd->consumer);
#endif
	eventfd_ctx_put(irqfd->eventfd);
	kfree(irqfd);
}


/* assumes kvm->irqfds.lock is held */
static bool
irqfd_is_active(struct kvm_kernel_irqfd *irqfd)
{
	return list_empty(&irqfd->list) ? false : true;
}

/*
 * Mark the irqfd as inactive and schedule it for removal
 *
 * assumes kvm->irqfds.lock is held
 */
static void
irqfd_deactivate(struct kvm_kernel_irqfd *irqfd)
{
	BUG_ON(!irqfd_is_active(irqfd));

	list_del_init(&irqfd->list);

	queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
}

int __attribute__((weak)) kvm_arch_set_irq_inatomic(
				struct kvm_kernel_irq_routing_entry *irq,
				struct kvm *kvm, int irq_source_id,
				int level,
				bool line_status)
{
	return -EWOULDBLOCK;
}

/*
 * Called with wqh->lock held and interrupts disabled
 */
static int
irqfd_wakeup(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(wait, struct kvm_kernel_irqfd, wait);
	__poll_t flags = key_to_poll(key);
	struct kvm_kernel_irq_routing_entry irq;
	struct kvm *kvm = irqfd->kvm;
	unsigned seq;
	int idx;
	int ret = 0;

	if (flags & EPOLLIN) {
		u64 cnt;
		eventfd_ctx_do_read(irqfd->eventfd, &cnt);

		idx = srcu_read_lock(&kvm->irq_srcu);
		do {
			seq = read_seqcount_begin(&irqfd->irq_entry_sc);
			irq = irqfd->irq_entry;
		} while (read_seqcount_retry(&irqfd->irq_entry_sc, seq));
		/* An event has been signaled, inject an interrupt */
		if (kvm_arch_set_irq_inatomic(&irq, kvm,
					      KVM_USERSPACE_IRQ_SOURCE_ID, 1,
					      false) == -EWOULDBLOCK)
			schedule_work(&irqfd->inject);
		srcu_read_unlock(&kvm->irq_srcu, idx);
		ret = 1;
	}

	if (flags & EPOLLHUP) {
		/* The eventfd is closing, detach from KVM */
		unsigned long iflags;

		spin_lock_irqsave(&kvm->irqfds.lock, iflags);

		/*
		 * We must check if someone deactivated the irqfd before
		 * we could acquire the irqfds.lock since the item is
		 * deactivated from the KVM side before it is unhooked from
		 * the wait-queue. If it is already deactivated, we can
		 * simply return knowing the other side will clean up for us.
		 * We cannot race against the irqfd going away since the
		 * other side is required to acquire wqh->lock, which we hold.
		 */
		if (irqfd_is_active(irqfd))
			irqfd_deactivate(irqfd);

		spin_unlock_irqrestore(&kvm->irqfds.lock, iflags);
	}

	return ret;
}

static void
irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
			poll_table *pt)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(pt, struct kvm_kernel_irqfd, pt);
	add_wait_queue_priority(wqh, &irqfd->wait);
}

/* Must be called under irqfds.lock */
static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
{
	struct kvm_kernel_irq_routing_entry *e;
	struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
	int n_entries;

	n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);

	write_seqcount_begin(&irqfd->irq_entry_sc);

	/*
	 * Only a single routing entry can be cached here for the atomic
	 * fast path; a zeroed type makes irqfd_wakeup() fall back to the
	 * injection work item, which resolves the GSI the slow way.
	 */
	e = entries;
	if (n_entries == 1)
		irqfd->irq_entry = *e;
	else
		irqfd->irq_entry.type = 0;

	write_seqcount_end(&irqfd->irq_entry_sc);
}

#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
void __attribute__((weak)) kvm_arch_irq_bypass_stop(
				struct irq_bypass_consumer *cons)
{
}

void __attribute__((weak)) kvm_arch_irq_bypass_start(
				struct irq_bypass_consumer *cons)
{
}

int __attribute__((weak)) kvm_arch_update_irqfd_routing(
				struct kvm *kvm, unsigned int host_irq,
				uint32_t guest_irq, bool set)
{
	return 0;
}

bool __attribute__((weak)) kvm_arch_irqfd_route_changed(
				struct kvm_kernel_irq_routing_entry *old,
				struct kvm_kernel_irq_routing_entry *new)
{
	return true;
}
#endif

static int
kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
{
	struct kvm_kernel_irqfd *irqfd, *tmp;
	struct fd f;
	struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
	int ret;
	__poll_t events;
	int idx;

	if (!kvm_arch_intc_initialized(kvm))
		return -EAGAIN;

	if (!kvm_arch_irqfd_allowed(kvm, args))
		return -EINVAL;

	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL_ACCOUNT);
	if (!irqfd)
		return -ENOMEM;

	irqfd->kvm = kvm;
	irqfd->gsi = args->gsi;
	INIT_LIST_HEAD(&irqfd->list);
	INIT_WORK(&irqfd->inject, irqfd_inject);
	INIT_WORK(&irqfd->shutdown, irqfd_shutdown);
	seqcount_spinlock_init(&irqfd->irq_entry_sc, &kvm->irqfds.lock);

	f = fdget(args->fd);
	if (!f.file) {
		ret = -EBADF;
		goto out;
	}

	eventfd = eventfd_ctx_fileget(f.file);
	if (IS_ERR(eventfd)) {
		ret = PTR_ERR(eventfd);
		goto fail;
	}

	irqfd->eventfd = eventfd;

	if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
		struct kvm_kernel_irqfd_resampler *resampler;

		resamplefd = eventfd_ctx_fdget(args->resamplefd);
		if (IS_ERR(resamplefd)) {
			ret = PTR_ERR(resamplefd);
			goto fail;
		}

		irqfd->resamplefd = resamplefd;
		INIT_LIST_HEAD(&irqfd->resampler_link);

		mutex_lock(&kvm->irqfds.resampler_lock);

		list_for_each_entry(resampler,
				    &kvm->irqfds.resampler_list, link) {
			if (resampler->notifier.gsi == irqfd->gsi) {
				irqfd->resampler = resampler;
				break;
			}
		}

		if (!irqfd->resampler) {
			resampler = kzalloc(sizeof(*resampler),
					    GFP_KERNEL_ACCOUNT);
			if (!resampler) {
				ret = -ENOMEM;
				mutex_unlock(&kvm->irqfds.resampler_lock);
				goto fail;
			}

			resampler->kvm = kvm;
			INIT_LIST_HEAD(&resampler->list);
			resampler->notifier.gsi = irqfd->gsi;
			resampler->notifier.irq_acked = irqfd_resampler_ack;
			INIT_LIST_HEAD(&resampler->link);

			list_add_rcu(&resampler->link, &kvm->irqfds.resampler_list);
			kvm_register_irq_ack_notifier(kvm,
						      &resampler->notifier);
			irqfd->resampler = resampler;
		}

		list_add_rcu(&irqfd->resampler_link, &irqfd->resampler->list);
		synchronize_srcu(&kvm->irq_srcu);

		mutex_unlock(&kvm->irqfds.resampler_lock);
	}

	/*
	 * Install our own custom wake-up handling so we are notified via
	 * a callback whenever someone signals the underlying eventfd
	 */
	init_waitqueue_func_entry(&irqfd->wait, irqfd_wakeup);
	init_poll_funcptr(&irqfd->pt, irqfd_ptable_queue_proc);

	spin_lock_irq(&kvm->irqfds.lock);

	ret = 0;
	list_for_each_entry(tmp, &kvm->irqfds.items, list) {
		if (irqfd->eventfd != tmp->eventfd)
			continue;
		/* This fd is used for another irq already. */
		ret = -EBUSY;
		spin_unlock_irq(&kvm->irqfds.lock);
		goto fail;
	}

	idx = srcu_read_lock(&kvm->irq_srcu);
	irqfd_update(kvm, irqfd);

	list_add_tail(&irqfd->list, &kvm->irqfds.items);

	spin_unlock_irq(&kvm->irqfds.lock);

	/*
	 * Check if there was an event already pending on the eventfd
	 * before we registered, and trigger it as if we didn't miss it.
	 */
	events = vfs_poll(f.file, &irqfd->pt);

	if (events & EPOLLIN)
		schedule_work(&irqfd->inject);

#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
	if (kvm_arch_has_irq_bypass()) {
		irqfd->consumer.token = (void *)irqfd->eventfd;
		irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
		irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
		irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
		irqfd->consumer.start = kvm_arch_irq_bypass_start;
		ret = irq_bypass_register_consumer(&irqfd->consumer);
		if (ret)
			pr_info("irq bypass consumer (token %p) registration fails: %d\n",
				irqfd->consumer.token, ret);
	}
#endif

	srcu_read_unlock(&kvm->irq_srcu, idx);

	/*
	 * do not drop the file until the irqfd is fully initialized, otherwise
	 * we might race against the EPOLLHUP
	 */
	fdput(f);
	return 0;

fail:
	if (irqfd->resampler)
		irqfd_resampler_shutdown(irqfd);

	if (resamplefd && !IS_ERR(resamplefd))
		eventfd_ctx_put(resamplefd);

	if (eventfd && !IS_ERR(eventfd))
		eventfd_ctx_put(eventfd);

	fdput(f);

out:
	kfree(irqfd);
	return ret;
}

bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
	struct kvm_irq_ack_notifier *kian;
	int gsi, idx;

	idx = srcu_read_lock(&kvm->irq_srcu);
	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
	if (gsi != -1)
		hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
					  link, srcu_read_lock_held(&kvm->irq_srcu))
			if (kian->gsi == gsi) {
				srcu_read_unlock(&kvm->irq_srcu, idx);
				return true;
			}

	srcu_read_unlock(&kvm->irq_srcu, idx);

	return false;
}
EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);

void kvm_notify_acked_gsi(struct kvm *kvm, int gsi)
{
	struct kvm_irq_ack_notifier *kian;

	hlist_for_each_entry_srcu(kian, &kvm->irq_ack_notifier_list,
				  link, srcu_read_lock_held(&kvm->irq_srcu))
		if (kian->gsi == gsi)
			kian->irq_acked(kian);
}

void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
{
	int gsi, idx;

	trace_kvm_ack_irq(irqchip, pin);

	idx = srcu_read_lock(&kvm->irq_srcu);
	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
	if (gsi != -1)
		kvm_notify_acked_gsi(kvm, gsi);
	srcu_read_unlock(&kvm->irq_srcu, idx);
}

void kvm_register_irq_ack_notifier(struct kvm *kvm,
				   struct kvm_irq_ack_notifier *kian)
{
	mutex_lock(&kvm->irq_lock);
	hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list);
	mutex_unlock(&kvm->irq_lock);
	kvm_arch_post_irq_ack_notifier_list_update(kvm);
}

void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
				     struct kvm_irq_ack_notifier *kian)
{
	mutex_lock(&kvm->irq_lock);
	hlist_del_init_rcu(&kian->link);
	mutex_unlock(&kvm->irq_lock);
	synchronize_srcu(&kvm->irq_srcu);
	kvm_arch_post_irq_ack_notifier_list_update(kvm);
}
#endif

void
kvm_eventfd_init(struct kvm *kvm)
{
#ifdef CONFIG_HAVE_KVM_IRQFD
	spin_lock_init(&kvm->irqfds.lock);
	INIT_LIST_HEAD(&kvm->irqfds.items);
	INIT_LIST_HEAD(&kvm->irqfds.resampler_list);
	mutex_init(&kvm->irqfds.resampler_lock);
#endif
	INIT_LIST_HEAD(&kvm->ioeventfds);
}

#ifdef CONFIG_HAVE_KVM_IRQFD
/*
 * Shut down any irqfds that match fd+gsi.
 */
static int
kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
{
	struct kvm_kernel_irqfd *irqfd, *tmp;
	struct eventfd_ctx *eventfd;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	spin_lock_irq(&kvm->irqfds.lock);

	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) {
		if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) {
			/*
			 * This clearing of irq_entry.type is needed for when
			 * another thread calls kvm_irq_routing_update before
			 * we flush the workqueue below (we synchronize with
			 * kvm_irq_routing_update using irqfds.lock).
			 */
			write_seqcount_begin(&irqfd->irq_entry_sc);
			irqfd->irq_entry.type = 0;
			write_seqcount_end(&irqfd->irq_entry_sc);
			irqfd_deactivate(irqfd);
		}
	}

	spin_unlock_irq(&kvm->irqfds.lock);
	eventfd_ctx_put(eventfd);

	/*
	 * Block until we know all outstanding shutdown jobs have completed
	 * so that we guarantee there will not be any more interrupts on this
	 * gsi once this deassign function returns.
	 */
	flush_workqueue(irqfd_cleanup_wq);

	return 0;
}

int
kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
{
	if (args->flags & ~(KVM_IRQFD_FLAG_DEASSIGN | KVM_IRQFD_FLAG_RESAMPLE))
		return -EINVAL;

	if (args->flags & KVM_IRQFD_FLAG_DEASSIGN)
		return kvm_irqfd_deassign(kvm, args);

	return kvm_irqfd_assign(kvm, args);
}

/*
 * This function is called as the kvm VM fd is being released. Shut down all
 * irqfds that still remain open.
 */
void
kvm_irqfd_release(struct kvm *kvm)
{
	struct kvm_kernel_irqfd *irqfd, *tmp;

	spin_lock_irq(&kvm->irqfds.lock);

	list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list)
		irqfd_deactivate(irqfd);

	spin_unlock_irq(&kvm->irqfds.lock);

	/*
	 * Block until we know all outstanding shutdown jobs have completed
	 * since we do not take a kvm* reference.
	 */
	flush_workqueue(irqfd_cleanup_wq);
}

/*
 * Take note of a change in irq routing.
 * Caller must invoke synchronize_srcu(&kvm->irq_srcu) afterwards.
 */
void kvm_irq_routing_update(struct kvm *kvm)
{
	struct kvm_kernel_irqfd *irqfd;

	spin_lock_irq(&kvm->irqfds.lock);

	list_for_each_entry(irqfd, &kvm->irqfds.items, list) {
#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
		/* Under irqfds.lock, so can read irq_entry safely */
		struct kvm_kernel_irq_routing_entry old = irqfd->irq_entry;
#endif

		irqfd_update(kvm, irqfd);

#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
		if (irqfd->producer &&
		    kvm_arch_irqfd_route_changed(&old, &irqfd->irq_entry)) {
			int ret = kvm_arch_update_irqfd_routing(
					irqfd->kvm, irqfd->producer->irq,
					irqfd->gsi, 1);
			WARN_ON(ret);
		}
#endif
	}

	spin_unlock_irq(&kvm->irqfds.lock);
}

bool kvm_notify_irqfd_resampler(struct kvm *kvm,
				unsigned int irqchip,
				unsigned int pin)
{
	struct kvm_kernel_irqfd_resampler *resampler;
	int gsi, idx;

	idx = srcu_read_lock(&kvm->irq_srcu);
	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
	if (gsi != -1) {
		list_for_each_entry_srcu(resampler,
					 &kvm->irqfds.resampler_list, link,
					 srcu_read_lock_held(&kvm->irq_srcu)) {
			if (resampler->notifier.gsi == gsi) {
				irqfd_resampler_notify(resampler);
				srcu_read_unlock(&kvm->irq_srcu, idx);
				return true;
			}
		}
	}
	srcu_read_unlock(&kvm->irq_srcu, idx);

	return false;
}

/*
 * create a host-wide workqueue for issuing deferred shutdown requests
 * aggregated from all vm* instances. We need our own isolated
 * queue to ease flushing work items when a VM exits.
 */
int kvm_irqfd_init(void)
{
	irqfd_cleanup_wq = alloc_workqueue("kvm-irqfd-cleanup", 0, 0);
	if (!irqfd_cleanup_wq)
		return -ENOMEM;

	return 0;
}

void kvm_irqfd_exit(void)
{
	destroy_workqueue(irqfd_cleanup_wq);
}
#endif

/*
 * --------------------------------------------------------------------
 * ioeventfd: translate a PIO/MMIO memory write to an eventfd signal.
 *
 * userspace can register a PIO/MMIO address with an eventfd for receiving
 * notification when the memory has been touched.
 * --------------------------------------------------------------------
 */

struct _ioeventfd {
	struct list_head     list;
	u64                  addr;
	int                  length;
	struct eventfd_ctx  *eventfd;
	u64                  datamatch;
	struct kvm_io_device dev;
	u8                   bus_idx;
	bool                 wildcard;
};

static inline struct _ioeventfd *
to_ioeventfd(struct kvm_io_device *dev)
{
	return container_of(dev, struct _ioeventfd, dev);
}

static void
ioeventfd_release(struct _ioeventfd *p)
{
	eventfd_ctx_put(p->eventfd);
	list_del(&p->list);
	kfree(p);
}

static bool
ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
{
	u64 _val;

	if (addr != p->addr)
		/* address must be precise for a hit */
		return false;

	if (!p->length)
		/* length = 0 means only look at the address, so always a hit */
		return true;

	if (len != p->length)
		/* address-range must be precise for a hit */
		return false;

	if (p->wildcard)
		/* all else equal, wildcard is always a hit */
		return true;

	/* otherwise, we have to actually compare the data */

	BUG_ON(!IS_ALIGNED((unsigned long)val, len));

	switch (len) {
	case 1:
		_val = *(u8 *)val;
		break;
	case 2:
		_val = *(u16 *)val;
		break;
	case 4:
		_val = *(u32 *)val;
		break;
	case 8:
		_val = *(u64 *)val;
		break;
	default:
		return false;
	}

	return _val == p->datamatch;
}

/* MMIO/PIO writes trigger an event if the addr/val match */
static int
ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
		int len, const void *val)
{
	struct _ioeventfd *p = to_ioeventfd(this);

	if (!ioeventfd_in_range(p, addr, len, val))
		return -EOPNOTSUPP;

	eventfd_signal(p->eventfd, 1);
	return 0;
}

/*
 * This function is called as KVM is completely shutting down. We do not
 * need to worry about locking; just nuke anything we have as quickly as
 * possible.
 */
static void
ioeventfd_destructor(struct kvm_io_device *this)
{
	struct _ioeventfd *p = to_ioeventfd(this);

	ioeventfd_release(p);
}

static const struct kvm_io_device_ops ioeventfd_ops = {
	.write      = ioeventfd_write,
	.destructor = ioeventfd_destructor,
};

/* assumes kvm->slots_lock held */
static bool
ioeventfd_check_collision(struct kvm *kvm, struct _ioeventfd *p)
{
	struct _ioeventfd *_p;

	list_for_each_entry(_p, &kvm->ioeventfds, list)
		if (_p->bus_idx == p->bus_idx &&
		    _p->addr == p->addr &&
		    (!_p->length || !p->length ||
		     (_p->length == p->length &&
		      (_p->wildcard || p->wildcard ||
		       _p->datamatch == p->datamatch))))
			return true;

	return false;
}

static enum kvm_bus ioeventfd_bus_from_flags(__u32 flags)
{
	if (flags & KVM_IOEVENTFD_FLAG_PIO)
		return KVM_PIO_BUS;
	if (flags & KVM_IOEVENTFD_FLAG_VIRTIO_CCW_NOTIFY)
		return KVM_VIRTIO_CCW_NOTIFY_BUS;
	return KVM_MMIO_BUS;
}

static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
				    enum kvm_bus bus_idx,
				    struct kvm_ioeventfd *args)
{
	struct eventfd_ctx *eventfd;
	struct _ioeventfd *p;
	int ret;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
	if (!p) {
		ret = -ENOMEM;
		goto fail;
	}

	INIT_LIST_HEAD(&p->list);
	p->addr    = args->addr;
	p->bus_idx = bus_idx;
	p->length  = args->len;
	p->eventfd = eventfd;

	/* The datamatch feature is optional, otherwise this is a wildcard */
	if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
		p->datamatch = args->datamatch;
	else
		p->wildcard = true;

	mutex_lock(&kvm->slots_lock);

	/* Verify that there isn't a match already */
	if (ioeventfd_check_collision(kvm, p)) {
		ret = -EEXIST;
		goto unlock_fail;
	}

	kvm_iodevice_init(&p->dev, &ioeventfd_ops);

	ret = kvm_io_bus_register_dev(kvm, bus_idx, p->addr, p->length,
				      &p->dev);
	if (ret < 0)
		goto unlock_fail;

	kvm_get_bus(kvm, bus_idx)->ioeventfd_count++;
	list_add_tail(&p->list, &kvm->ioeventfds);

	mutex_unlock(&kvm->slots_lock);

	return 0;

unlock_fail:
	mutex_unlock(&kvm->slots_lock);
	kfree(p);

fail:
	eventfd_ctx_put(eventfd);

	return ret;
}

static int
kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx,
			   struct kvm_ioeventfd *args)
{
	struct _ioeventfd *p;
	struct eventfd_ctx *eventfd;
	struct kvm_io_bus *bus;
	int ret = -ENOENT;
	bool wildcard;

	eventfd = eventfd_ctx_fdget(args->fd);
	if (IS_ERR(eventfd))
		return PTR_ERR(eventfd);

	wildcard = !(args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH);

	mutex_lock(&kvm->slots_lock);

	list_for_each_entry(p, &kvm->ioeventfds, list) {
		if (p->bus_idx != bus_idx ||
		    p->eventfd != eventfd ||
		    p->addr != args->addr ||
		    p->length != args->len ||
		    p->wildcard != wildcard)
			continue;

		if (!p->wildcard && p->datamatch != args->datamatch)
			continue;

		kvm_io_bus_unregister_dev(kvm, bus_idx, &p->dev);
		bus = kvm_get_bus(kvm, bus_idx);
		if (bus)
			bus->ioeventfd_count--;
		ret = 0;
		break;
	}

	mutex_unlock(&kvm->slots_lock);

	eventfd_ctx_put(eventfd);

	return ret;
}

static int kvm_deassign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
	enum kvm_bus bus_idx = ioeventfd_bus_from_flags(args->flags);
	int ret = kvm_deassign_ioeventfd_idx(kvm, bus_idx, args);

	if (!args->len && bus_idx == KVM_MMIO_BUS)
		kvm_deassign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);

	return ret;
}

static int
kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
	enum kvm_bus bus_idx;
	int ret;

	bus_idx = ioeventfd_bus_from_flags(args->flags);
	/* must be natural-word sized, or 0 to ignore length */
	switch (args->len) {
	case 0:
	case 1:
	case 2:
	case 4:
	case 8:
		break;
	default:
		return -EINVAL;
	}

	/* check for range overflow */
	if (args->addr + args->len < args->addr)
		return -EINVAL;

	/* check for extra flags that we don't understand */
	if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
		return -EINVAL;

	/* ioeventfd with no length can't be combined with DATAMATCH */
	if (!args->len && (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH))
		return -EINVAL;

	ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args);
	if (ret)
		goto fail;

	/* When length is ignored, MMIO is also put on a separate bus, for
	 * faster lookups.
	 */
	if (!args->len && bus_idx == KVM_MMIO_BUS) {
		ret = kvm_assign_ioeventfd_idx(kvm, KVM_FAST_MMIO_BUS, args);
		if (ret < 0)
			goto fast_fail;
	}

	return 0;

fast_fail:
	kvm_deassign_ioeventfd_idx(kvm, bus_idx, args);
fail:
	return ret;
}

int
kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
	if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
		return kvm_deassign_ioeventfd(kvm, args);

	return kvm_assign_ioeventfd(kvm, args);
}
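
/*
 * Example usage (not part of the original file): a minimal userspace sketch
 * of how the KVM_IRQFD and KVM_IOEVENTFD ioctls implemented above are
 * typically driven.  Assumptions: vm_fd was obtained via KVM_CREATE_VM on
 * /dev/kvm, and the GSI number and MMIO address below are arbitrary
 * placeholders chosen for illustration.  Guarded by "#if 0" so it never
 * affects this translation unit.
 */
#if 0
#include <linux/kvm.h>		/* UAPI: struct kvm_irqfd, struct kvm_ioeventfd */
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* Attach an eventfd to an example GSI; writing to the eventfd injects it. */
static int example_assign_irqfd(int vm_fd)
{
	struct kvm_irqfd irqfd = { 0 };
	int efd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);

	if (efd < 0)
		return -1;

	irqfd.fd  = efd;
	irqfd.gsi = 5;			/* example GSI */
	if (ioctl(vm_fd, KVM_IRQFD, &irqfd) < 0) {
		close(efd);
		return -1;
	}

	/* eventfd_write(efd, 1) from any thread now raises GSI 5. */
	return efd;
}

/* Register a 2-byte MMIO ioeventfd with datamatch at an example address. */
static int example_assign_ioeventfd(int vm_fd)
{
	struct kvm_ioeventfd ioev = { 0 };
	int efd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);

	if (efd < 0)
		return -1;

	ioev.addr      = 0xfe000000;	/* example guest-physical address */
	ioev.len       = 2;
	ioev.fd        = efd;
	ioev.datamatch = 0x1;
	ioev.flags     = KVM_IOEVENTFD_FLAG_DATAMATCH;
	if (ioctl(vm_fd, KVM_IOEVENTFD, &ioev) < 0) {
		close(efd);
		return -1;
	}

	/* A guest 16-bit write of 0x1 to that address now signals efd. */
	return efd;
}
#endif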