1 /* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * This module enables machines with Intel VT-x extensions to run virtual 5 * machines without emulation or binary translation. 6 * 7 * Copyright (C) 2006 Qumranet, Inc. 8 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 9 * 10 * Authors: 11 * Avi Kivity <avi@qumranet.com> 12 * Yaniv Kamay <yaniv@qumranet.com> 13 * 14 * This work is licensed under the terms of the GNU GPL, version 2. See 15 * the COPYING file in the top-level directory. 16 * 17 */ 18 19 #include <kvm/iodev.h> 20 21 #include <linux/kvm_host.h> 22 #include <linux/kvm.h> 23 #include <linux/module.h> 24 #include <linux/errno.h> 25 #include <linux/percpu.h> 26 #include <linux/mm.h> 27 #include <linux/miscdevice.h> 28 #include <linux/vmalloc.h> 29 #include <linux/reboot.h> 30 #include <linux/debugfs.h> 31 #include <linux/highmem.h> 32 #include <linux/file.h> 33 #include <linux/syscore_ops.h> 34 #include <linux/cpu.h> 35 #include <linux/sched.h> 36 #include <linux/cpumask.h> 37 #include <linux/smp.h> 38 #include <linux/anon_inodes.h> 39 #include <linux/profile.h> 40 #include <linux/kvm_para.h> 41 #include <linux/pagemap.h> 42 #include <linux/mman.h> 43 #include <linux/swap.h> 44 #include <linux/bitops.h> 45 #include <linux/spinlock.h> 46 #include <linux/compat.h> 47 #include <linux/srcu.h> 48 #include <linux/hugetlb.h> 49 #include <linux/slab.h> 50 #include <linux/sort.h> 51 #include <linux/bsearch.h> 52 53 #include <asm/processor.h> 54 #include <asm/io.h> 55 #include <asm/ioctl.h> 56 #include <asm/uaccess.h> 57 #include <asm/pgtable.h> 58 59 #include "coalesced_mmio.h" 60 #include "async_pf.h" 61 #include "vfio.h" 62 63 #define CREATE_TRACE_POINTS 64 #include <trace/events/kvm.h> 65 66 MODULE_AUTHOR("Qumranet"); 67 MODULE_LICENSE("GPL"); 68 69 static unsigned int halt_poll_ns; 70 module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR); 71 72 /* 73 * Ordering of locks: 74 * 75 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock 76 */ 77 78 DEFINE_SPINLOCK(kvm_lock); 79 static DEFINE_RAW_SPINLOCK(kvm_count_lock); 80 LIST_HEAD(vm_list); 81 82 static cpumask_var_t cpus_hardware_enabled; 83 static int kvm_usage_count; 84 static atomic_t hardware_enable_failed; 85 86 struct kmem_cache *kvm_vcpu_cache; 87 EXPORT_SYMBOL_GPL(kvm_vcpu_cache); 88 89 static __read_mostly struct preempt_ops kvm_preempt_ops; 90 91 struct dentry *kvm_debugfs_dir; 92 EXPORT_SYMBOL_GPL(kvm_debugfs_dir); 93 94 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 95 unsigned long arg); 96 #ifdef CONFIG_KVM_COMPAT 97 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, 98 unsigned long arg); 99 #endif 100 static int hardware_enable_all(void); 101 static void hardware_disable_all(void); 102 103 static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 104 105 static void kvm_release_pfn_dirty(pfn_t pfn); 106 static void mark_page_dirty_in_slot(struct kvm *kvm, 107 struct kvm_memory_slot *memslot, gfn_t gfn); 108 109 __visible bool kvm_rebooting; 110 EXPORT_SYMBOL_GPL(kvm_rebooting); 111 112 static bool largepages_enabled = true; 113 114 bool kvm_is_reserved_pfn(pfn_t pfn) 115 { 116 if (pfn_valid(pfn)) 117 return PageReserved(pfn_to_page(pfn)); 118 119 return true; 120 } 121 122 /* 123 * Switches to specified vcpu, until a matching vcpu_put() 124 */ 125 int vcpu_load(struct kvm_vcpu *vcpu) 126 { 127 int cpu; 128 129 if (mutex_lock_killable(&vcpu->mutex)) 130 return -EINTR; 131 cpu = get_cpu(); 132 preempt_notifier_register(&vcpu->preempt_notifier); 133 kvm_arch_vcpu_load(vcpu, 
cpu); 134 put_cpu(); 135 return 0; 136 } 137 138 void vcpu_put(struct kvm_vcpu *vcpu) 139 { 140 preempt_disable(); 141 kvm_arch_vcpu_put(vcpu); 142 preempt_notifier_unregister(&vcpu->preempt_notifier); 143 preempt_enable(); 144 mutex_unlock(&vcpu->mutex); 145 } 146 147 static void ack_flush(void *_completed) 148 { 149 } 150 151 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) 152 { 153 int i, cpu, me; 154 cpumask_var_t cpus; 155 bool called = true; 156 struct kvm_vcpu *vcpu; 157 158 zalloc_cpumask_var(&cpus, GFP_ATOMIC); 159 160 me = get_cpu(); 161 kvm_for_each_vcpu(i, vcpu, kvm) { 162 kvm_make_request(req, vcpu); 163 cpu = vcpu->cpu; 164 165 /* Set ->requests bit before we read ->mode */ 166 smp_mb(); 167 168 if (cpus != NULL && cpu != -1 && cpu != me && 169 kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE) 170 cpumask_set_cpu(cpu, cpus); 171 } 172 if (unlikely(cpus == NULL)) 173 smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1); 174 else if (!cpumask_empty(cpus)) 175 smp_call_function_many(cpus, ack_flush, NULL, 1); 176 else 177 called = false; 178 put_cpu(); 179 free_cpumask_var(cpus); 180 return called; 181 } 182 183 #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL 184 void kvm_flush_remote_tlbs(struct kvm *kvm) 185 { 186 long dirty_count = kvm->tlbs_dirty; 187 188 smp_mb(); 189 if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 190 ++kvm->stat.remote_tlb_flush; 191 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); 192 } 193 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); 194 #endif 195 196 void kvm_reload_remote_mmus(struct kvm *kvm) 197 { 198 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 199 } 200 201 void kvm_make_mclock_inprogress_request(struct kvm *kvm) 202 { 203 kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); 204 } 205 206 void kvm_make_scan_ioapic_request(struct kvm *kvm) 207 { 208 kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); 209 } 210 211 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 212 { 213 struct page *page; 214 int r; 215 216 mutex_init(&vcpu->mutex); 217 vcpu->cpu = -1; 218 vcpu->kvm = kvm; 219 vcpu->vcpu_id = id; 220 vcpu->pid = NULL; 221 init_waitqueue_head(&vcpu->wq); 222 kvm_async_pf_vcpu_init(vcpu); 223 224 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 225 if (!page) { 226 r = -ENOMEM; 227 goto fail; 228 } 229 vcpu->run = page_address(page); 230 231 kvm_vcpu_set_in_spin_loop(vcpu, false); 232 kvm_vcpu_set_dy_eligible(vcpu, false); 233 vcpu->preempted = false; 234 235 r = kvm_arch_vcpu_init(vcpu); 236 if (r < 0) 237 goto fail_free_run; 238 return 0; 239 240 fail_free_run: 241 free_page((unsigned long)vcpu->run); 242 fail: 243 return r; 244 } 245 EXPORT_SYMBOL_GPL(kvm_vcpu_init); 246 247 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) 248 { 249 put_pid(vcpu->pid); 250 kvm_arch_vcpu_uninit(vcpu); 251 free_page((unsigned long)vcpu->run); 252 } 253 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); 254 255 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 256 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) 257 { 258 return container_of(mn, struct kvm, mmu_notifier); 259 } 260 261 static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, 262 struct mm_struct *mm, 263 unsigned long address) 264 { 265 struct kvm *kvm = mmu_notifier_to_kvm(mn); 266 int need_tlb_flush, idx; 267 268 /* 269 * When ->invalidate_page runs, the linux pte has been zapped 270 * already but the page is still allocated until 271 * ->invalidate_page returns. 
So if we increase the sequence 272 * here the kvm page fault will notice if the spte can't be 273 * established because the page is going to be freed. If 274 * instead the kvm page fault establishes the spte before 275 * ->invalidate_page runs, kvm_unmap_hva will release it 276 * before returning. 277 * 278 * The sequence increase only need to be seen at spin_unlock 279 * time, and not at spin_lock time. 280 * 281 * Increasing the sequence after the spin_unlock would be 282 * unsafe because the kvm page fault could then establish the 283 * pte after kvm_unmap_hva returned, without noticing the page 284 * is going to be freed. 285 */ 286 idx = srcu_read_lock(&kvm->srcu); 287 spin_lock(&kvm->mmu_lock); 288 289 kvm->mmu_notifier_seq++; 290 need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty; 291 /* we've to flush the tlb before the pages can be freed */ 292 if (need_tlb_flush) 293 kvm_flush_remote_tlbs(kvm); 294 295 spin_unlock(&kvm->mmu_lock); 296 297 kvm_arch_mmu_notifier_invalidate_page(kvm, address); 298 299 srcu_read_unlock(&kvm->srcu, idx); 300 } 301 302 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, 303 struct mm_struct *mm, 304 unsigned long address, 305 pte_t pte) 306 { 307 struct kvm *kvm = mmu_notifier_to_kvm(mn); 308 int idx; 309 310 idx = srcu_read_lock(&kvm->srcu); 311 spin_lock(&kvm->mmu_lock); 312 kvm->mmu_notifier_seq++; 313 kvm_set_spte_hva(kvm, address, pte); 314 spin_unlock(&kvm->mmu_lock); 315 srcu_read_unlock(&kvm->srcu, idx); 316 } 317 318 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, 319 struct mm_struct *mm, 320 unsigned long start, 321 unsigned long end) 322 { 323 struct kvm *kvm = mmu_notifier_to_kvm(mn); 324 int need_tlb_flush = 0, idx; 325 326 idx = srcu_read_lock(&kvm->srcu); 327 spin_lock(&kvm->mmu_lock); 328 /* 329 * The count increase must become visible at unlock time as no 330 * spte can be established without taking the mmu_lock and 331 * count is also read inside the mmu_lock critical section. 332 */ 333 kvm->mmu_notifier_count++; 334 need_tlb_flush = kvm_unmap_hva_range(kvm, start, end); 335 need_tlb_flush |= kvm->tlbs_dirty; 336 /* we've to flush the tlb before the pages can be freed */ 337 if (need_tlb_flush) 338 kvm_flush_remote_tlbs(kvm); 339 340 spin_unlock(&kvm->mmu_lock); 341 srcu_read_unlock(&kvm->srcu, idx); 342 } 343 344 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 345 struct mm_struct *mm, 346 unsigned long start, 347 unsigned long end) 348 { 349 struct kvm *kvm = mmu_notifier_to_kvm(mn); 350 351 spin_lock(&kvm->mmu_lock); 352 /* 353 * This sequence increase will notify the kvm page fault that 354 * the page that is going to be mapped in the spte could have 355 * been freed. 356 */ 357 kvm->mmu_notifier_seq++; 358 smp_wmb(); 359 /* 360 * The above sequence increase must be visible before the 361 * below count decrease, which is ensured by the smp_wmb above 362 * in conjunction with the smp_rmb in mmu_notifier_retry(). 
363 */ 364 kvm->mmu_notifier_count--; 365 spin_unlock(&kvm->mmu_lock); 366 367 BUG_ON(kvm->mmu_notifier_count < 0); 368 } 369 370 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, 371 struct mm_struct *mm, 372 unsigned long start, 373 unsigned long end) 374 { 375 struct kvm *kvm = mmu_notifier_to_kvm(mn); 376 int young, idx; 377 378 idx = srcu_read_lock(&kvm->srcu); 379 spin_lock(&kvm->mmu_lock); 380 381 young = kvm_age_hva(kvm, start, end); 382 if (young) 383 kvm_flush_remote_tlbs(kvm); 384 385 spin_unlock(&kvm->mmu_lock); 386 srcu_read_unlock(&kvm->srcu, idx); 387 388 return young; 389 } 390 391 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, 392 struct mm_struct *mm, 393 unsigned long address) 394 { 395 struct kvm *kvm = mmu_notifier_to_kvm(mn); 396 int young, idx; 397 398 idx = srcu_read_lock(&kvm->srcu); 399 spin_lock(&kvm->mmu_lock); 400 young = kvm_test_age_hva(kvm, address); 401 spin_unlock(&kvm->mmu_lock); 402 srcu_read_unlock(&kvm->srcu, idx); 403 404 return young; 405 } 406 407 static void kvm_mmu_notifier_release(struct mmu_notifier *mn, 408 struct mm_struct *mm) 409 { 410 struct kvm *kvm = mmu_notifier_to_kvm(mn); 411 int idx; 412 413 idx = srcu_read_lock(&kvm->srcu); 414 kvm_arch_flush_shadow_all(kvm); 415 srcu_read_unlock(&kvm->srcu, idx); 416 } 417 418 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { 419 .invalidate_page = kvm_mmu_notifier_invalidate_page, 420 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 421 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 422 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 423 .test_young = kvm_mmu_notifier_test_young, 424 .change_pte = kvm_mmu_notifier_change_pte, 425 .release = kvm_mmu_notifier_release, 426 }; 427 428 static int kvm_init_mmu_notifier(struct kvm *kvm) 429 { 430 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; 431 return mmu_notifier_register(&kvm->mmu_notifier, current->mm); 432 } 433 434 #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ 435 436 static int kvm_init_mmu_notifier(struct kvm *kvm) 437 { 438 return 0; 439 } 440 441 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ 442 443 static void kvm_init_memslots_id(struct kvm *kvm) 444 { 445 int i; 446 struct kvm_memslots *slots = kvm->memslots; 447 448 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) 449 slots->id_to_index[i] = slots->memslots[i].id = i; 450 } 451 452 static struct kvm *kvm_create_vm(unsigned long type) 453 { 454 int r, i; 455 struct kvm *kvm = kvm_arch_alloc_vm(); 456 457 if (!kvm) 458 return ERR_PTR(-ENOMEM); 459 460 r = kvm_arch_init_vm(kvm, type); 461 if (r) 462 goto out_err_no_disable; 463 464 r = hardware_enable_all(); 465 if (r) 466 goto out_err_no_disable; 467 468 #ifdef CONFIG_HAVE_KVM_IRQFD 469 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); 470 #endif 471 472 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); 473 474 r = -ENOMEM; 475 kvm->memslots = kvm_kvzalloc(sizeof(struct kvm_memslots)); 476 if (!kvm->memslots) 477 goto out_err_no_srcu; 478 479 /* 480 * Init kvm generation close to the maximum to easily test the 481 * code of handling generation number wrap-around. 
482 */ 483 kvm->memslots->generation = -150; 484 485 kvm_init_memslots_id(kvm); 486 if (init_srcu_struct(&kvm->srcu)) 487 goto out_err_no_srcu; 488 if (init_srcu_struct(&kvm->irq_srcu)) 489 goto out_err_no_irq_srcu; 490 for (i = 0; i < KVM_NR_BUSES; i++) { 491 kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus), 492 GFP_KERNEL); 493 if (!kvm->buses[i]) 494 goto out_err; 495 } 496 497 spin_lock_init(&kvm->mmu_lock); 498 kvm->mm = current->mm; 499 atomic_inc(&kvm->mm->mm_count); 500 kvm_eventfd_init(kvm); 501 mutex_init(&kvm->lock); 502 mutex_init(&kvm->irq_lock); 503 mutex_init(&kvm->slots_lock); 504 atomic_set(&kvm->users_count, 1); 505 INIT_LIST_HEAD(&kvm->devices); 506 507 r = kvm_init_mmu_notifier(kvm); 508 if (r) 509 goto out_err; 510 511 spin_lock(&kvm_lock); 512 list_add(&kvm->vm_list, &vm_list); 513 spin_unlock(&kvm_lock); 514 515 return kvm; 516 517 out_err: 518 cleanup_srcu_struct(&kvm->irq_srcu); 519 out_err_no_irq_srcu: 520 cleanup_srcu_struct(&kvm->srcu); 521 out_err_no_srcu: 522 hardware_disable_all(); 523 out_err_no_disable: 524 for (i = 0; i < KVM_NR_BUSES; i++) 525 kfree(kvm->buses[i]); 526 kvfree(kvm->memslots); 527 kvm_arch_free_vm(kvm); 528 return ERR_PTR(r); 529 } 530 531 /* 532 * Avoid using vmalloc for a small buffer. 533 * Should not be used when the size is statically known. 534 */ 535 void *kvm_kvzalloc(unsigned long size) 536 { 537 if (size > PAGE_SIZE) 538 return vzalloc(size); 539 else 540 return kzalloc(size, GFP_KERNEL); 541 } 542 543 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) 544 { 545 if (!memslot->dirty_bitmap) 546 return; 547 548 kvfree(memslot->dirty_bitmap); 549 memslot->dirty_bitmap = NULL; 550 } 551 552 /* 553 * Free any memory in @free but not in @dont. 554 */ 555 static void kvm_free_physmem_slot(struct kvm *kvm, struct kvm_memory_slot *free, 556 struct kvm_memory_slot *dont) 557 { 558 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 559 kvm_destroy_dirty_bitmap(free); 560 561 kvm_arch_free_memslot(kvm, free, dont); 562 563 free->npages = 0; 564 } 565 566 static void kvm_free_physmem(struct kvm *kvm) 567 { 568 struct kvm_memslots *slots = kvm->memslots; 569 struct kvm_memory_slot *memslot; 570 571 kvm_for_each_memslot(memslot, slots) 572 kvm_free_physmem_slot(kvm, memslot, NULL); 573 574 kvfree(kvm->memslots); 575 } 576 577 static void kvm_destroy_devices(struct kvm *kvm) 578 { 579 struct list_head *node, *tmp; 580 581 list_for_each_safe(node, tmp, &kvm->devices) { 582 struct kvm_device *dev = 583 list_entry(node, struct kvm_device, vm_node); 584 585 list_del(node); 586 dev->ops->destroy(dev); 587 } 588 } 589 590 static void kvm_destroy_vm(struct kvm *kvm) 591 { 592 int i; 593 struct mm_struct *mm = kvm->mm; 594 595 kvm_arch_sync_events(kvm); 596 spin_lock(&kvm_lock); 597 list_del(&kvm->vm_list); 598 spin_unlock(&kvm_lock); 599 kvm_free_irq_routing(kvm); 600 for (i = 0; i < KVM_NR_BUSES; i++) 601 kvm_io_bus_destroy(kvm->buses[i]); 602 kvm_coalesced_mmio_free(kvm); 603 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 604 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 605 #else 606 kvm_arch_flush_shadow_all(kvm); 607 #endif 608 kvm_arch_destroy_vm(kvm); 609 kvm_destroy_devices(kvm); 610 kvm_free_physmem(kvm); 611 cleanup_srcu_struct(&kvm->irq_srcu); 612 cleanup_srcu_struct(&kvm->srcu); 613 kvm_arch_free_vm(kvm); 614 hardware_disable_all(); 615 mmdrop(mm); 616 } 617 618 void kvm_get_kvm(struct kvm *kvm) 619 { 620 atomic_inc(&kvm->users_count); 621 } 622 EXPORT_SYMBOL_GPL(kvm_get_kvm); 623 624 
void kvm_put_kvm(struct kvm *kvm) 625 { 626 if (atomic_dec_and_test(&kvm->users_count)) 627 kvm_destroy_vm(kvm); 628 } 629 EXPORT_SYMBOL_GPL(kvm_put_kvm); 630 631 632 static int kvm_vm_release(struct inode *inode, struct file *filp) 633 { 634 struct kvm *kvm = filp->private_data; 635 636 kvm_irqfd_release(kvm); 637 638 kvm_put_kvm(kvm); 639 return 0; 640 } 641 642 /* 643 * Allocation size is twice as large as the actual dirty bitmap size. 644 * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed. 645 */ 646 static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) 647 { 648 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); 649 650 memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes); 651 if (!memslot->dirty_bitmap) 652 return -ENOMEM; 653 654 return 0; 655 } 656 657 /* 658 * Insert memslot and re-sort memslots based on their GFN, 659 * so binary search could be used to lookup GFN. 660 * Sorting algorithm takes advantage of having initially 661 * sorted array and known changed memslot position. 662 */ 663 static void update_memslots(struct kvm_memslots *slots, 664 struct kvm_memory_slot *new) 665 { 666 int id = new->id; 667 int i = slots->id_to_index[id]; 668 struct kvm_memory_slot *mslots = slots->memslots; 669 670 WARN_ON(mslots[i].id != id); 671 if (!new->npages) { 672 WARN_ON(!mslots[i].npages); 673 new->base_gfn = 0; 674 new->flags = 0; 675 if (mslots[i].npages) 676 slots->used_slots--; 677 } else { 678 if (!mslots[i].npages) 679 slots->used_slots++; 680 } 681 682 while (i < KVM_MEM_SLOTS_NUM - 1 && 683 new->base_gfn <= mslots[i + 1].base_gfn) { 684 if (!mslots[i + 1].npages) 685 break; 686 mslots[i] = mslots[i + 1]; 687 slots->id_to_index[mslots[i].id] = i; 688 i++; 689 } 690 691 /* 692 * The ">=" is needed when creating a slot with base_gfn == 0, 693 * so that it moves before all those with base_gfn == npages == 0. 694 * 695 * On the other hand, if new->npages is zero, the above loop has 696 * already left i pointing to the beginning of the empty part of 697 * mslots, and the ">=" would move the hole backwards in this 698 * case---which is wrong. So skip the loop when deleting a slot. 699 */ 700 if (new->npages) { 701 while (i > 0 && 702 new->base_gfn >= mslots[i - 1].base_gfn) { 703 mslots[i] = mslots[i - 1]; 704 slots->id_to_index[mslots[i].id] = i; 705 i--; 706 } 707 } else 708 WARN_ON_ONCE(i != slots->used_slots); 709 710 mslots[i] = *new; 711 slots->id_to_index[mslots[i].id] = i; 712 } 713 714 static int check_memory_region_flags(struct kvm_userspace_memory_region *mem) 715 { 716 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; 717 718 #ifdef __KVM_HAVE_READONLY_MEM 719 valid_flags |= KVM_MEM_READONLY; 720 #endif 721 722 if (mem->flags & ~valid_flags) 723 return -EINVAL; 724 725 return 0; 726 } 727 728 static struct kvm_memslots *install_new_memslots(struct kvm *kvm, 729 struct kvm_memslots *slots) 730 { 731 struct kvm_memslots *old_memslots = kvm->memslots; 732 733 /* 734 * Set the low bit in the generation, which disables SPTE caching 735 * until the end of synchronize_srcu_expedited. 736 */ 737 WARN_ON(old_memslots->generation & 1); 738 slots->generation = old_memslots->generation + 1; 739 740 rcu_assign_pointer(kvm->memslots, slots); 741 synchronize_srcu_expedited(&kvm->srcu); 742 743 /* 744 * Increment the new memslot generation a second time. This prevents 745 * vm exits that race with memslot updates from caching a memslot 746 * generation that will (potentially) be valid forever. 
747 */ 748 slots->generation++; 749 750 kvm_arch_memslots_updated(kvm); 751 752 return old_memslots; 753 } 754 755 /* 756 * Allocate some memory and give it an address in the guest physical address 757 * space. 758 * 759 * Discontiguous memory is allowed, mostly for framebuffers. 760 * 761 * Must be called holding kvm->slots_lock for write. 762 */ 763 int __kvm_set_memory_region(struct kvm *kvm, 764 struct kvm_userspace_memory_region *mem) 765 { 766 int r; 767 gfn_t base_gfn; 768 unsigned long npages; 769 struct kvm_memory_slot *slot; 770 struct kvm_memory_slot old, new; 771 struct kvm_memslots *slots = NULL, *old_memslots; 772 enum kvm_mr_change change; 773 774 r = check_memory_region_flags(mem); 775 if (r) 776 goto out; 777 778 r = -EINVAL; 779 /* General sanity checks */ 780 if (mem->memory_size & (PAGE_SIZE - 1)) 781 goto out; 782 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 783 goto out; 784 /* We can read the guest memory with __xxx_user() later on. */ 785 if ((mem->slot < KVM_USER_MEM_SLOTS) && 786 ((mem->userspace_addr & (PAGE_SIZE - 1)) || 787 !access_ok(VERIFY_WRITE, 788 (void __user *)(unsigned long)mem->userspace_addr, 789 mem->memory_size))) 790 goto out; 791 if (mem->slot >= KVM_MEM_SLOTS_NUM) 792 goto out; 793 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 794 goto out; 795 796 slot = id_to_memslot(kvm->memslots, mem->slot); 797 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 798 npages = mem->memory_size >> PAGE_SHIFT; 799 800 if (npages > KVM_MEM_MAX_NR_PAGES) 801 goto out; 802 803 if (!npages) 804 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; 805 806 new = old = *slot; 807 808 new.id = mem->slot; 809 new.base_gfn = base_gfn; 810 new.npages = npages; 811 new.flags = mem->flags; 812 813 if (npages) { 814 if (!old.npages) 815 change = KVM_MR_CREATE; 816 else { /* Modify an existing slot. */ 817 if ((mem->userspace_addr != old.userspace_addr) || 818 (npages != old.npages) || 819 ((new.flags ^ old.flags) & KVM_MEM_READONLY)) 820 goto out; 821 822 if (base_gfn != old.base_gfn) 823 change = KVM_MR_MOVE; 824 else if (new.flags != old.flags) 825 change = KVM_MR_FLAGS_ONLY; 826 else { /* Nothing to change. */ 827 r = 0; 828 goto out; 829 } 830 } 831 } else if (old.npages) { 832 change = KVM_MR_DELETE; 833 } else /* Modify a non-existent slot: disallowed. 
*/ 834 goto out; 835 836 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { 837 /* Check for overlaps */ 838 r = -EEXIST; 839 kvm_for_each_memslot(slot, kvm->memslots) { 840 if ((slot->id >= KVM_USER_MEM_SLOTS) || 841 (slot->id == mem->slot)) 842 continue; 843 if (!((base_gfn + npages <= slot->base_gfn) || 844 (base_gfn >= slot->base_gfn + slot->npages))) 845 goto out; 846 } 847 } 848 849 /* Free page dirty bitmap if unneeded */ 850 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 851 new.dirty_bitmap = NULL; 852 853 r = -ENOMEM; 854 if (change == KVM_MR_CREATE) { 855 new.userspace_addr = mem->userspace_addr; 856 857 if (kvm_arch_create_memslot(kvm, &new, npages)) 858 goto out_free; 859 } 860 861 /* Allocate page dirty bitmap if needed */ 862 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 863 if (kvm_create_dirty_bitmap(&new) < 0) 864 goto out_free; 865 } 866 867 slots = kvm_kvzalloc(sizeof(struct kvm_memslots)); 868 if (!slots) 869 goto out_free; 870 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 871 872 if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) { 873 slot = id_to_memslot(slots, mem->slot); 874 slot->flags |= KVM_MEMSLOT_INVALID; 875 876 old_memslots = install_new_memslots(kvm, slots); 877 878 /* slot was deleted or moved, clear iommu mapping */ 879 kvm_iommu_unmap_pages(kvm, &old); 880 /* From this point no new shadow pages pointing to a deleted, 881 * or moved, memslot will be created. 882 * 883 * validation of sp->gfn happens in: 884 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) 885 * - kvm_is_visible_gfn (mmu_check_roots) 886 */ 887 kvm_arch_flush_shadow_memslot(kvm, slot); 888 889 /* 890 * We can re-use the old_memslots from above, the only difference 891 * from the currently installed memslots is the invalid flag. This 892 * will get overwritten by update_memslots anyway. 893 */ 894 slots = old_memslots; 895 } 896 897 r = kvm_arch_prepare_memory_region(kvm, &new, mem, change); 898 if (r) 899 goto out_slots; 900 901 /* actual memory is freed via old in kvm_free_physmem_slot below */ 902 if (change == KVM_MR_DELETE) { 903 new.dirty_bitmap = NULL; 904 memset(&new.arch, 0, sizeof(new.arch)); 905 } 906 907 update_memslots(slots, &new); 908 old_memslots = install_new_memslots(kvm, slots); 909 910 kvm_arch_commit_memory_region(kvm, mem, &old, change); 911 912 kvm_free_physmem_slot(kvm, &old, &new); 913 kvfree(old_memslots); 914 915 /* 916 * IOMMU mapping: New slots need to be mapped. Old slots need to be 917 * un-mapped and re-mapped if their base changes. Since base change 918 * unmapping is handled above with slot deletion, mapping alone is 919 * needed here. Anything else the iommu might care about for existing 920 * slots (size changes, userspace addr changes and read-only flag 921 * changes) is disallowed above, so any other attribute changes getting 922 * here can be skipped. 
923 */ 924 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { 925 r = kvm_iommu_map_pages(kvm, &new); 926 return r; 927 } 928 929 return 0; 930 931 out_slots: 932 kvfree(slots); 933 out_free: 934 kvm_free_physmem_slot(kvm, &new, &old); 935 out: 936 return r; 937 } 938 EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 939 940 int kvm_set_memory_region(struct kvm *kvm, 941 struct kvm_userspace_memory_region *mem) 942 { 943 int r; 944 945 mutex_lock(&kvm->slots_lock); 946 r = __kvm_set_memory_region(kvm, mem); 947 mutex_unlock(&kvm->slots_lock); 948 return r; 949 } 950 EXPORT_SYMBOL_GPL(kvm_set_memory_region); 951 952 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 953 struct kvm_userspace_memory_region *mem) 954 { 955 if (mem->slot >= KVM_USER_MEM_SLOTS) 956 return -EINVAL; 957 return kvm_set_memory_region(kvm, mem); 958 } 959 960 int kvm_get_dirty_log(struct kvm *kvm, 961 struct kvm_dirty_log *log, int *is_dirty) 962 { 963 struct kvm_memory_slot *memslot; 964 int r, i; 965 unsigned long n; 966 unsigned long any = 0; 967 968 r = -EINVAL; 969 if (log->slot >= KVM_USER_MEM_SLOTS) 970 goto out; 971 972 memslot = id_to_memslot(kvm->memslots, log->slot); 973 r = -ENOENT; 974 if (!memslot->dirty_bitmap) 975 goto out; 976 977 n = kvm_dirty_bitmap_bytes(memslot); 978 979 for (i = 0; !any && i < n/sizeof(long); ++i) 980 any = memslot->dirty_bitmap[i]; 981 982 r = -EFAULT; 983 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 984 goto out; 985 986 if (any) 987 *is_dirty = 1; 988 989 r = 0; 990 out: 991 return r; 992 } 993 EXPORT_SYMBOL_GPL(kvm_get_dirty_log); 994 995 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 996 /** 997 * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages 998 * are dirty write protect them for next write. 999 * @kvm: pointer to kvm instance 1000 * @log: slot id and address to which we copy the log 1001 * @is_dirty: flag set if any page is dirty 1002 * 1003 * We need to keep it in mind that VCPU threads can write to the bitmap 1004 * concurrently. So, to avoid losing track of dirty pages we keep the 1005 * following order: 1006 * 1007 * 1. Take a snapshot of the bit and clear it if needed. 1008 * 2. Write protect the corresponding page. 1009 * 3. Copy the snapshot to the userspace. 1010 * 4. Upon return caller flushes TLB's if needed. 1011 * 1012 * Between 2 and 4, the guest may write to the page using the remaining TLB 1013 * entry. This is not a problem because the page is reported dirty using 1014 * the snapshot taken before and step 4 ensures that writes done after 1015 * exiting to userspace will be logged for the next call. 
1016 * 1017 */ 1018 int kvm_get_dirty_log_protect(struct kvm *kvm, 1019 struct kvm_dirty_log *log, bool *is_dirty) 1020 { 1021 struct kvm_memory_slot *memslot; 1022 int r, i; 1023 unsigned long n; 1024 unsigned long *dirty_bitmap; 1025 unsigned long *dirty_bitmap_buffer; 1026 1027 r = -EINVAL; 1028 if (log->slot >= KVM_USER_MEM_SLOTS) 1029 goto out; 1030 1031 memslot = id_to_memslot(kvm->memslots, log->slot); 1032 1033 dirty_bitmap = memslot->dirty_bitmap; 1034 r = -ENOENT; 1035 if (!dirty_bitmap) 1036 goto out; 1037 1038 n = kvm_dirty_bitmap_bytes(memslot); 1039 1040 dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); 1041 memset(dirty_bitmap_buffer, 0, n); 1042 1043 spin_lock(&kvm->mmu_lock); 1044 *is_dirty = false; 1045 for (i = 0; i < n / sizeof(long); i++) { 1046 unsigned long mask; 1047 gfn_t offset; 1048 1049 if (!dirty_bitmap[i]) 1050 continue; 1051 1052 *is_dirty = true; 1053 1054 mask = xchg(&dirty_bitmap[i], 0); 1055 dirty_bitmap_buffer[i] = mask; 1056 1057 if (mask) { 1058 offset = i * BITS_PER_LONG; 1059 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, 1060 offset, mask); 1061 } 1062 } 1063 1064 spin_unlock(&kvm->mmu_lock); 1065 1066 r = -EFAULT; 1067 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) 1068 goto out; 1069 1070 r = 0; 1071 out: 1072 return r; 1073 } 1074 EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); 1075 #endif 1076 1077 bool kvm_largepages_enabled(void) 1078 { 1079 return largepages_enabled; 1080 } 1081 1082 void kvm_disable_largepages(void) 1083 { 1084 largepages_enabled = false; 1085 } 1086 EXPORT_SYMBOL_GPL(kvm_disable_largepages); 1087 1088 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 1089 { 1090 return __gfn_to_memslot(kvm_memslots(kvm), gfn); 1091 } 1092 EXPORT_SYMBOL_GPL(gfn_to_memslot); 1093 1094 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) 1095 { 1096 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn); 1097 1098 if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS || 1099 memslot->flags & KVM_MEMSLOT_INVALID) 1100 return 0; 1101 1102 return 1; 1103 } 1104 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); 1105 1106 unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn) 1107 { 1108 struct vm_area_struct *vma; 1109 unsigned long addr, size; 1110 1111 size = PAGE_SIZE; 1112 1113 addr = gfn_to_hva(kvm, gfn); 1114 if (kvm_is_error_hva(addr)) 1115 return PAGE_SIZE; 1116 1117 down_read(¤t->mm->mmap_sem); 1118 vma = find_vma(current->mm, addr); 1119 if (!vma) 1120 goto out; 1121 1122 size = vma_kernel_pagesize(vma); 1123 1124 out: 1125 up_read(¤t->mm->mmap_sem); 1126 1127 return size; 1128 } 1129 1130 static bool memslot_is_readonly(struct kvm_memory_slot *slot) 1131 { 1132 return slot->flags & KVM_MEM_READONLY; 1133 } 1134 1135 static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, 1136 gfn_t *nr_pages, bool write) 1137 { 1138 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 1139 return KVM_HVA_ERR_BAD; 1140 1141 if (memslot_is_readonly(slot) && write) 1142 return KVM_HVA_ERR_RO_BAD; 1143 1144 if (nr_pages) 1145 *nr_pages = slot->npages - (gfn - slot->base_gfn); 1146 1147 return __gfn_to_hva_memslot(slot, gfn); 1148 } 1149 1150 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, 1151 gfn_t *nr_pages) 1152 { 1153 return __gfn_to_hva_many(slot, gfn, nr_pages, true); 1154 } 1155 1156 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, 1157 gfn_t gfn) 1158 { 1159 return gfn_to_hva_many(slot, gfn, NULL); 1160 } 1161 EXPORT_SYMBOL_GPL(gfn_to_hva_memslot); 1162 
1163 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 1164 { 1165 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); 1166 } 1167 EXPORT_SYMBOL_GPL(gfn_to_hva); 1168 1169 /* 1170 * If writable is set to false, the hva returned by this function is only 1171 * allowed to be read. 1172 */ 1173 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, 1174 gfn_t gfn, bool *writable) 1175 { 1176 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false); 1177 1178 if (!kvm_is_error_hva(hva) && writable) 1179 *writable = !memslot_is_readonly(slot); 1180 1181 return hva; 1182 } 1183 1184 unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) 1185 { 1186 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 1187 1188 return gfn_to_hva_memslot_prot(slot, gfn, writable); 1189 } 1190 1191 static int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm, 1192 unsigned long start, int write, struct page **page) 1193 { 1194 int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET; 1195 1196 if (write) 1197 flags |= FOLL_WRITE; 1198 1199 return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL); 1200 } 1201 1202 static inline int check_user_page_hwpoison(unsigned long addr) 1203 { 1204 int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE; 1205 1206 rc = __get_user_pages(current, current->mm, addr, 1, 1207 flags, NULL, NULL, NULL); 1208 return rc == -EHWPOISON; 1209 } 1210 1211 /* 1212 * The atomic path to get the writable pfn which will be stored in @pfn, 1213 * true indicates success, otherwise false is returned. 1214 */ 1215 static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async, 1216 bool write_fault, bool *writable, pfn_t *pfn) 1217 { 1218 struct page *page[1]; 1219 int npages; 1220 1221 if (!(async || atomic)) 1222 return false; 1223 1224 /* 1225 * Fast pin a writable pfn only if it is a write fault request 1226 * or the caller allows to map a writable pfn for a read fault 1227 * request. 1228 */ 1229 if (!(write_fault || writable)) 1230 return false; 1231 1232 npages = __get_user_pages_fast(addr, 1, 1, page); 1233 if (npages == 1) { 1234 *pfn = page_to_pfn(page[0]); 1235 1236 if (writable) 1237 *writable = true; 1238 return true; 1239 } 1240 1241 return false; 1242 } 1243 1244 /* 1245 * The slow path to get the pfn of the specified host virtual address, 1246 * 1 indicates success, -errno is returned if error is detected. 
1247 */ 1248 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, 1249 bool *writable, pfn_t *pfn) 1250 { 1251 struct page *page[1]; 1252 int npages = 0; 1253 1254 might_sleep(); 1255 1256 if (writable) 1257 *writable = write_fault; 1258 1259 if (async) { 1260 down_read(¤t->mm->mmap_sem); 1261 npages = get_user_page_nowait(current, current->mm, 1262 addr, write_fault, page); 1263 up_read(¤t->mm->mmap_sem); 1264 } else 1265 npages = __get_user_pages_unlocked(current, current->mm, addr, 1, 1266 write_fault, 0, page, 1267 FOLL_TOUCH|FOLL_HWPOISON); 1268 if (npages != 1) 1269 return npages; 1270 1271 /* map read fault as writable if possible */ 1272 if (unlikely(!write_fault) && writable) { 1273 struct page *wpage[1]; 1274 1275 npages = __get_user_pages_fast(addr, 1, 1, wpage); 1276 if (npages == 1) { 1277 *writable = true; 1278 put_page(page[0]); 1279 page[0] = wpage[0]; 1280 } 1281 1282 npages = 1; 1283 } 1284 *pfn = page_to_pfn(page[0]); 1285 return npages; 1286 } 1287 1288 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) 1289 { 1290 if (unlikely(!(vma->vm_flags & VM_READ))) 1291 return false; 1292 1293 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE)))) 1294 return false; 1295 1296 return true; 1297 } 1298 1299 /* 1300 * Pin guest page in memory and return its pfn. 1301 * @addr: host virtual address which maps memory to the guest 1302 * @atomic: whether this function can sleep 1303 * @async: whether this function need to wait IO complete if the 1304 * host page is not in the memory 1305 * @write_fault: whether we should get a writable host page 1306 * @writable: whether it allows to map a writable host page for !@write_fault 1307 * 1308 * The function will map a writable host page for these two cases: 1309 * 1): @write_fault = true 1310 * 2): @write_fault = false && @writable, @writable will tell the caller 1311 * whether the mapping is writable. 
1312 */ 1313 static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, 1314 bool write_fault, bool *writable) 1315 { 1316 struct vm_area_struct *vma; 1317 pfn_t pfn = 0; 1318 int npages; 1319 1320 /* we can do it either atomically or asynchronously, not both */ 1321 BUG_ON(atomic && async); 1322 1323 if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn)) 1324 return pfn; 1325 1326 if (atomic) 1327 return KVM_PFN_ERR_FAULT; 1328 1329 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn); 1330 if (npages == 1) 1331 return pfn; 1332 1333 down_read(¤t->mm->mmap_sem); 1334 if (npages == -EHWPOISON || 1335 (!async && check_user_page_hwpoison(addr))) { 1336 pfn = KVM_PFN_ERR_HWPOISON; 1337 goto exit; 1338 } 1339 1340 vma = find_vma_intersection(current->mm, addr, addr + 1); 1341 1342 if (vma == NULL) 1343 pfn = KVM_PFN_ERR_FAULT; 1344 else if ((vma->vm_flags & VM_PFNMAP)) { 1345 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + 1346 vma->vm_pgoff; 1347 BUG_ON(!kvm_is_reserved_pfn(pfn)); 1348 } else { 1349 if (async && vma_is_valid(vma, write_fault)) 1350 *async = true; 1351 pfn = KVM_PFN_ERR_FAULT; 1352 } 1353 exit: 1354 up_read(¤t->mm->mmap_sem); 1355 return pfn; 1356 } 1357 1358 static pfn_t 1359 __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic, 1360 bool *async, bool write_fault, bool *writable) 1361 { 1362 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); 1363 1364 if (addr == KVM_HVA_ERR_RO_BAD) 1365 return KVM_PFN_ERR_RO_FAULT; 1366 1367 if (kvm_is_error_hva(addr)) 1368 return KVM_PFN_NOSLOT; 1369 1370 /* Do not map writable pfn in the readonly memslot. */ 1371 if (writable && memslot_is_readonly(slot)) { 1372 *writable = false; 1373 writable = NULL; 1374 } 1375 1376 return hva_to_pfn(addr, atomic, async, write_fault, 1377 writable); 1378 } 1379 1380 static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async, 1381 bool write_fault, bool *writable) 1382 { 1383 struct kvm_memory_slot *slot; 1384 1385 if (async) 1386 *async = false; 1387 1388 slot = gfn_to_memslot(kvm, gfn); 1389 1390 return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault, 1391 writable); 1392 } 1393 1394 pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) 1395 { 1396 return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL); 1397 } 1398 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); 1399 1400 pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async, 1401 bool write_fault, bool *writable) 1402 { 1403 return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable); 1404 } 1405 EXPORT_SYMBOL_GPL(gfn_to_pfn_async); 1406 1407 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 1408 { 1409 return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL); 1410 } 1411 EXPORT_SYMBOL_GPL(gfn_to_pfn); 1412 1413 pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, 1414 bool *writable) 1415 { 1416 return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable); 1417 } 1418 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 1419 1420 pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) 1421 { 1422 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); 1423 } 1424 1425 pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) 1426 { 1427 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); 1428 } 1429 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); 1430 1431 int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages, 1432 int nr_pages) 1433 { 1434 unsigned long addr; 1435 gfn_t entry; 
1436 1437 addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry); 1438 if (kvm_is_error_hva(addr)) 1439 return -1; 1440 1441 if (entry < nr_pages) 1442 return 0; 1443 1444 return __get_user_pages_fast(addr, nr_pages, 1, pages); 1445 } 1446 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 1447 1448 static struct page *kvm_pfn_to_page(pfn_t pfn) 1449 { 1450 if (is_error_noslot_pfn(pfn)) 1451 return KVM_ERR_PTR_BAD_PAGE; 1452 1453 if (kvm_is_reserved_pfn(pfn)) { 1454 WARN_ON(1); 1455 return KVM_ERR_PTR_BAD_PAGE; 1456 } 1457 1458 return pfn_to_page(pfn); 1459 } 1460 1461 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1462 { 1463 pfn_t pfn; 1464 1465 pfn = gfn_to_pfn(kvm, gfn); 1466 1467 return kvm_pfn_to_page(pfn); 1468 } 1469 EXPORT_SYMBOL_GPL(gfn_to_page); 1470 1471 void kvm_release_page_clean(struct page *page) 1472 { 1473 WARN_ON(is_error_page(page)); 1474 1475 kvm_release_pfn_clean(page_to_pfn(page)); 1476 } 1477 EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1478 1479 void kvm_release_pfn_clean(pfn_t pfn) 1480 { 1481 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) 1482 put_page(pfn_to_page(pfn)); 1483 } 1484 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 1485 1486 void kvm_release_page_dirty(struct page *page) 1487 { 1488 WARN_ON(is_error_page(page)); 1489 1490 kvm_release_pfn_dirty(page_to_pfn(page)); 1491 } 1492 EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1493 1494 static void kvm_release_pfn_dirty(pfn_t pfn) 1495 { 1496 kvm_set_pfn_dirty(pfn); 1497 kvm_release_pfn_clean(pfn); 1498 } 1499 1500 void kvm_set_pfn_dirty(pfn_t pfn) 1501 { 1502 if (!kvm_is_reserved_pfn(pfn)) { 1503 struct page *page = pfn_to_page(pfn); 1504 1505 if (!PageReserved(page)) 1506 SetPageDirty(page); 1507 } 1508 } 1509 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 1510 1511 void kvm_set_pfn_accessed(pfn_t pfn) 1512 { 1513 if (!kvm_is_reserved_pfn(pfn)) 1514 mark_page_accessed(pfn_to_page(pfn)); 1515 } 1516 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 1517 1518 void kvm_get_pfn(pfn_t pfn) 1519 { 1520 if (!kvm_is_reserved_pfn(pfn)) 1521 get_page(pfn_to_page(pfn)); 1522 } 1523 EXPORT_SYMBOL_GPL(kvm_get_pfn); 1524 1525 static int next_segment(unsigned long len, int offset) 1526 { 1527 if (len > PAGE_SIZE - offset) 1528 return PAGE_SIZE - offset; 1529 else 1530 return len; 1531 } 1532 1533 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 1534 int len) 1535 { 1536 int r; 1537 unsigned long addr; 1538 1539 addr = gfn_to_hva_prot(kvm, gfn, NULL); 1540 if (kvm_is_error_hva(addr)) 1541 return -EFAULT; 1542 r = __copy_from_user(data, (void __user *)addr + offset, len); 1543 if (r) 1544 return -EFAULT; 1545 return 0; 1546 } 1547 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 1548 1549 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) 1550 { 1551 gfn_t gfn = gpa >> PAGE_SHIFT; 1552 int seg; 1553 int offset = offset_in_page(gpa); 1554 int ret; 1555 1556 while ((seg = next_segment(len, offset)) != 0) { 1557 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 1558 if (ret < 0) 1559 return ret; 1560 offset = 0; 1561 len -= seg; 1562 data += seg; 1563 ++gfn; 1564 } 1565 return 0; 1566 } 1567 EXPORT_SYMBOL_GPL(kvm_read_guest); 1568 1569 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 1570 unsigned long len) 1571 { 1572 int r; 1573 unsigned long addr; 1574 gfn_t gfn = gpa >> PAGE_SHIFT; 1575 int offset = offset_in_page(gpa); 1576 1577 addr = gfn_to_hva_prot(kvm, gfn, NULL); 1578 if (kvm_is_error_hva(addr)) 1579 return -EFAULT; 1580 pagefault_disable(); 1581 r = 
__copy_from_user_inatomic(data, (void __user *)addr + offset, len); 1582 pagefault_enable(); 1583 if (r) 1584 return -EFAULT; 1585 return 0; 1586 } 1587 EXPORT_SYMBOL(kvm_read_guest_atomic); 1588 1589 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, 1590 int offset, int len) 1591 { 1592 int r; 1593 unsigned long addr; 1594 1595 addr = gfn_to_hva(kvm, gfn); 1596 if (kvm_is_error_hva(addr)) 1597 return -EFAULT; 1598 r = __copy_to_user((void __user *)addr + offset, data, len); 1599 if (r) 1600 return -EFAULT; 1601 mark_page_dirty(kvm, gfn); 1602 return 0; 1603 } 1604 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 1605 1606 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 1607 unsigned long len) 1608 { 1609 gfn_t gfn = gpa >> PAGE_SHIFT; 1610 int seg; 1611 int offset = offset_in_page(gpa); 1612 int ret; 1613 1614 while ((seg = next_segment(len, offset)) != 0) { 1615 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 1616 if (ret < 0) 1617 return ret; 1618 offset = 0; 1619 len -= seg; 1620 data += seg; 1621 ++gfn; 1622 } 1623 return 0; 1624 } 1625 EXPORT_SYMBOL_GPL(kvm_write_guest); 1626 1627 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1628 gpa_t gpa, unsigned long len) 1629 { 1630 struct kvm_memslots *slots = kvm_memslots(kvm); 1631 int offset = offset_in_page(gpa); 1632 gfn_t start_gfn = gpa >> PAGE_SHIFT; 1633 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; 1634 gfn_t nr_pages_needed = end_gfn - start_gfn + 1; 1635 gfn_t nr_pages_avail; 1636 1637 ghc->gpa = gpa; 1638 ghc->generation = slots->generation; 1639 ghc->len = len; 1640 ghc->memslot = gfn_to_memslot(kvm, start_gfn); 1641 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL); 1642 if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) { 1643 ghc->hva += offset; 1644 } else { 1645 /* 1646 * If the requested region crosses two memslots, we still 1647 * verify that the entire region is valid here. 1648 */ 1649 while (start_gfn <= end_gfn) { 1650 ghc->memslot = gfn_to_memslot(kvm, start_gfn); 1651 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, 1652 &nr_pages_avail); 1653 if (kvm_is_error_hva(ghc->hva)) 1654 return -EFAULT; 1655 start_gfn += nr_pages_avail; 1656 } 1657 /* Use the slow path for cross page reads and writes. 
*/ 1658 ghc->memslot = NULL; 1659 } 1660 return 0; 1661 } 1662 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); 1663 1664 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1665 void *data, unsigned long len) 1666 { 1667 struct kvm_memslots *slots = kvm_memslots(kvm); 1668 int r; 1669 1670 BUG_ON(len > ghc->len); 1671 1672 if (slots->generation != ghc->generation) 1673 kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa, ghc->len); 1674 1675 if (unlikely(!ghc->memslot)) 1676 return kvm_write_guest(kvm, ghc->gpa, data, len); 1677 1678 if (kvm_is_error_hva(ghc->hva)) 1679 return -EFAULT; 1680 1681 r = __copy_to_user((void __user *)ghc->hva, data, len); 1682 if (r) 1683 return -EFAULT; 1684 mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); 1685 1686 return 0; 1687 } 1688 EXPORT_SYMBOL_GPL(kvm_write_guest_cached); 1689 1690 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1691 void *data, unsigned long len) 1692 { 1693 struct kvm_memslots *slots = kvm_memslots(kvm); 1694 int r; 1695 1696 BUG_ON(len > ghc->len); 1697 1698 if (slots->generation != ghc->generation) 1699 kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa, ghc->len); 1700 1701 if (unlikely(!ghc->memslot)) 1702 return kvm_read_guest(kvm, ghc->gpa, data, len); 1703 1704 if (kvm_is_error_hva(ghc->hva)) 1705 return -EFAULT; 1706 1707 r = __copy_from_user(data, (void __user *)ghc->hva, len); 1708 if (r) 1709 return -EFAULT; 1710 1711 return 0; 1712 } 1713 EXPORT_SYMBOL_GPL(kvm_read_guest_cached); 1714 1715 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 1716 { 1717 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 1718 1719 return kvm_write_guest_page(kvm, gfn, zero_page, offset, len); 1720 } 1721 EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 1722 1723 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 1724 { 1725 gfn_t gfn = gpa >> PAGE_SHIFT; 1726 int seg; 1727 int offset = offset_in_page(gpa); 1728 int ret; 1729 1730 while ((seg = next_segment(len, offset)) != 0) { 1731 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 1732 if (ret < 0) 1733 return ret; 1734 offset = 0; 1735 len -= seg; 1736 ++gfn; 1737 } 1738 return 0; 1739 } 1740 EXPORT_SYMBOL_GPL(kvm_clear_guest); 1741 1742 static void mark_page_dirty_in_slot(struct kvm *kvm, 1743 struct kvm_memory_slot *memslot, 1744 gfn_t gfn) 1745 { 1746 if (memslot && memslot->dirty_bitmap) { 1747 unsigned long rel_gfn = gfn - memslot->base_gfn; 1748 1749 set_bit_le(rel_gfn, memslot->dirty_bitmap); 1750 } 1751 } 1752 1753 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 1754 { 1755 struct kvm_memory_slot *memslot; 1756 1757 memslot = gfn_to_memslot(kvm, gfn); 1758 mark_page_dirty_in_slot(kvm, memslot, gfn); 1759 } 1760 EXPORT_SYMBOL_GPL(mark_page_dirty); 1761 1762 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) 1763 { 1764 if (kvm_arch_vcpu_runnable(vcpu)) { 1765 kvm_make_request(KVM_REQ_UNHALT, vcpu); 1766 return -EINTR; 1767 } 1768 if (kvm_cpu_has_pending_timer(vcpu)) 1769 return -EINTR; 1770 if (signal_pending(current)) 1771 return -EINTR; 1772 1773 return 0; 1774 } 1775 1776 /* 1777 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 
1778 */ 1779 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 1780 { 1781 ktime_t start, cur; 1782 DEFINE_WAIT(wait); 1783 bool waited = false; 1784 1785 start = cur = ktime_get(); 1786 if (halt_poll_ns) { 1787 ktime_t stop = ktime_add_ns(ktime_get(), halt_poll_ns); 1788 1789 do { 1790 /* 1791 * This sets KVM_REQ_UNHALT if an interrupt 1792 * arrives. 1793 */ 1794 if (kvm_vcpu_check_block(vcpu) < 0) { 1795 ++vcpu->stat.halt_successful_poll; 1796 goto out; 1797 } 1798 cur = ktime_get(); 1799 } while (single_task_running() && ktime_before(cur, stop)); 1800 } 1801 1802 for (;;) { 1803 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 1804 1805 if (kvm_vcpu_check_block(vcpu) < 0) 1806 break; 1807 1808 waited = true; 1809 schedule(); 1810 } 1811 1812 finish_wait(&vcpu->wq, &wait); 1813 cur = ktime_get(); 1814 1815 out: 1816 trace_kvm_vcpu_wakeup(ktime_to_ns(cur) - ktime_to_ns(start), waited); 1817 } 1818 EXPORT_SYMBOL_GPL(kvm_vcpu_block); 1819 1820 #ifndef CONFIG_S390 1821 /* 1822 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. 1823 */ 1824 void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 1825 { 1826 int me; 1827 int cpu = vcpu->cpu; 1828 wait_queue_head_t *wqp; 1829 1830 wqp = kvm_arch_vcpu_wq(vcpu); 1831 if (waitqueue_active(wqp)) { 1832 wake_up_interruptible(wqp); 1833 ++vcpu->stat.halt_wakeup; 1834 } 1835 1836 me = get_cpu(); 1837 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 1838 if (kvm_arch_vcpu_should_kick(vcpu)) 1839 smp_send_reschedule(cpu); 1840 put_cpu(); 1841 } 1842 EXPORT_SYMBOL_GPL(kvm_vcpu_kick); 1843 #endif /* !CONFIG_S390 */ 1844 1845 int kvm_vcpu_yield_to(struct kvm_vcpu *target) 1846 { 1847 struct pid *pid; 1848 struct task_struct *task = NULL; 1849 int ret = 0; 1850 1851 rcu_read_lock(); 1852 pid = rcu_dereference(target->pid); 1853 if (pid) 1854 task = get_pid_task(pid, PIDTYPE_PID); 1855 rcu_read_unlock(); 1856 if (!task) 1857 return ret; 1858 ret = yield_to(task, 1); 1859 put_task_struct(task); 1860 1861 return ret; 1862 } 1863 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); 1864 1865 /* 1866 * Helper that checks whether a VCPU is eligible for directed yield. 1867 * Most eligible candidate to yield is decided by following heuristics: 1868 * 1869 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently 1870 * (preempted lock holder), indicated by @in_spin_loop. 1871 * Set at the beiginning and cleared at the end of interception/PLE handler. 1872 * 1873 * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get 1874 * chance last time (mostly it has become eligible now since we have probably 1875 * yielded to lockholder in last iteration. This is done by toggling 1876 * @dy_eligible each time a VCPU checked for eligibility.) 1877 * 1878 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding 1879 * to preempted lock-holder could result in wrong VCPU selection and CPU 1880 * burning. Giving priority for a potential lock-holder increases lock 1881 * progress. 1882 * 1883 * Since algorithm is based on heuristics, accessing another VCPU data without 1884 * locking does not harm. It may result in trying to yield to same VCPU, fail 1885 * and continue with next VCPU and so on. 
1886 */ 1887 static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) 1888 { 1889 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT 1890 bool eligible; 1891 1892 eligible = !vcpu->spin_loop.in_spin_loop || 1893 vcpu->spin_loop.dy_eligible; 1894 1895 if (vcpu->spin_loop.in_spin_loop) 1896 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); 1897 1898 return eligible; 1899 #else 1900 return true; 1901 #endif 1902 } 1903 1904 void kvm_vcpu_on_spin(struct kvm_vcpu *me) 1905 { 1906 struct kvm *kvm = me->kvm; 1907 struct kvm_vcpu *vcpu; 1908 int last_boosted_vcpu = me->kvm->last_boosted_vcpu; 1909 int yielded = 0; 1910 int try = 3; 1911 int pass; 1912 int i; 1913 1914 kvm_vcpu_set_in_spin_loop(me, true); 1915 /* 1916 * We boost the priority of a VCPU that is runnable but not 1917 * currently running, because it got preempted by something 1918 * else and called schedule in __vcpu_run. Hopefully that 1919 * VCPU is holding the lock that we need and will release it. 1920 * We approximate round-robin by starting at the last boosted VCPU. 1921 */ 1922 for (pass = 0; pass < 2 && !yielded && try; pass++) { 1923 kvm_for_each_vcpu(i, vcpu, kvm) { 1924 if (!pass && i <= last_boosted_vcpu) { 1925 i = last_boosted_vcpu; 1926 continue; 1927 } else if (pass && i > last_boosted_vcpu) 1928 break; 1929 if (!ACCESS_ONCE(vcpu->preempted)) 1930 continue; 1931 if (vcpu == me) 1932 continue; 1933 if (waitqueue_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) 1934 continue; 1935 if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) 1936 continue; 1937 1938 yielded = kvm_vcpu_yield_to(vcpu); 1939 if (yielded > 0) { 1940 kvm->last_boosted_vcpu = i; 1941 break; 1942 } else if (yielded < 0) { 1943 try--; 1944 if (!try) 1945 break; 1946 } 1947 } 1948 } 1949 kvm_vcpu_set_in_spin_loop(me, false); 1950 1951 /* Ensure vcpu is not eligible during next spinloop */ 1952 kvm_vcpu_set_dy_eligible(me, false); 1953 } 1954 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); 1955 1956 static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1957 { 1958 struct kvm_vcpu *vcpu = vma->vm_file->private_data; 1959 struct page *page; 1960 1961 if (vmf->pgoff == 0) 1962 page = virt_to_page(vcpu->run); 1963 #ifdef CONFIG_X86 1964 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 1965 page = virt_to_page(vcpu->arch.pio_data); 1966 #endif 1967 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1968 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 1969 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 1970 #endif 1971 else 1972 return kvm_arch_vcpu_fault(vcpu, vmf); 1973 get_page(page); 1974 vmf->page = page; 1975 return 0; 1976 } 1977 1978 static const struct vm_operations_struct kvm_vcpu_vm_ops = { 1979 .fault = kvm_vcpu_fault, 1980 }; 1981 1982 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 1983 { 1984 vma->vm_ops = &kvm_vcpu_vm_ops; 1985 return 0; 1986 } 1987 1988 static int kvm_vcpu_release(struct inode *inode, struct file *filp) 1989 { 1990 struct kvm_vcpu *vcpu = filp->private_data; 1991 1992 kvm_put_kvm(vcpu->kvm); 1993 return 0; 1994 } 1995 1996 static struct file_operations kvm_vcpu_fops = { 1997 .release = kvm_vcpu_release, 1998 .unlocked_ioctl = kvm_vcpu_ioctl, 1999 #ifdef CONFIG_KVM_COMPAT 2000 .compat_ioctl = kvm_vcpu_compat_ioctl, 2001 #endif 2002 .mmap = kvm_vcpu_mmap, 2003 .llseek = noop_llseek, 2004 }; 2005 2006 /* 2007 * Allocates an inode for the vcpu. 
2008 */ 2009 static int create_vcpu_fd(struct kvm_vcpu *vcpu) 2010 { 2011 return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); 2012 } 2013 2014 /* 2015 * Creates some virtual cpus. Good luck creating more than one. 2016 */ 2017 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) 2018 { 2019 int r; 2020 struct kvm_vcpu *vcpu, *v; 2021 2022 if (id >= KVM_MAX_VCPUS) 2023 return -EINVAL; 2024 2025 vcpu = kvm_arch_vcpu_create(kvm, id); 2026 if (IS_ERR(vcpu)) 2027 return PTR_ERR(vcpu); 2028 2029 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 2030 2031 r = kvm_arch_vcpu_setup(vcpu); 2032 if (r) 2033 goto vcpu_destroy; 2034 2035 mutex_lock(&kvm->lock); 2036 if (!kvm_vcpu_compatible(vcpu)) { 2037 r = -EINVAL; 2038 goto unlock_vcpu_destroy; 2039 } 2040 if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { 2041 r = -EINVAL; 2042 goto unlock_vcpu_destroy; 2043 } 2044 2045 kvm_for_each_vcpu(r, v, kvm) 2046 if (v->vcpu_id == id) { 2047 r = -EEXIST; 2048 goto unlock_vcpu_destroy; 2049 } 2050 2051 BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); 2052 2053 /* Now it's all set up, let userspace reach it */ 2054 kvm_get_kvm(kvm); 2055 r = create_vcpu_fd(vcpu); 2056 if (r < 0) { 2057 kvm_put_kvm(kvm); 2058 goto unlock_vcpu_destroy; 2059 } 2060 2061 kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; 2062 smp_wmb(); 2063 atomic_inc(&kvm->online_vcpus); 2064 2065 mutex_unlock(&kvm->lock); 2066 kvm_arch_vcpu_postcreate(vcpu); 2067 return r; 2068 2069 unlock_vcpu_destroy: 2070 mutex_unlock(&kvm->lock); 2071 vcpu_destroy: 2072 kvm_arch_vcpu_destroy(vcpu); 2073 return r; 2074 } 2075 2076 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 2077 { 2078 if (sigset) { 2079 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 2080 vcpu->sigset_active = 1; 2081 vcpu->sigset = *sigset; 2082 } else 2083 vcpu->sigset_active = 0; 2084 return 0; 2085 } 2086 2087 static long kvm_vcpu_ioctl(struct file *filp, 2088 unsigned int ioctl, unsigned long arg) 2089 { 2090 struct kvm_vcpu *vcpu = filp->private_data; 2091 void __user *argp = (void __user *)arg; 2092 int r; 2093 struct kvm_fpu *fpu = NULL; 2094 struct kvm_sregs *kvm_sregs = NULL; 2095 2096 if (vcpu->kvm->mm != current->mm) 2097 return -EIO; 2098 2099 if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) 2100 return -EINVAL; 2101 2102 #if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS) 2103 /* 2104 * Special cases: vcpu ioctls that are asynchronous to vcpu execution, 2105 * so vcpu_load() would break it. 2106 */ 2107 if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_S390_IRQ || ioctl == KVM_INTERRUPT) 2108 return kvm_arch_vcpu_ioctl(filp, ioctl, arg); 2109 #endif 2110 2111 2112 r = vcpu_load(vcpu); 2113 if (r) 2114 return r; 2115 switch (ioctl) { 2116 case KVM_RUN: 2117 r = -EINVAL; 2118 if (arg) 2119 goto out; 2120 if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { 2121 /* The thread running this VCPU changed. 
*/ 2122 struct pid *oldpid = vcpu->pid; 2123 struct pid *newpid = get_task_pid(current, PIDTYPE_PID); 2124 2125 rcu_assign_pointer(vcpu->pid, newpid); 2126 if (oldpid) 2127 synchronize_rcu(); 2128 put_pid(oldpid); 2129 } 2130 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 2131 trace_kvm_userspace_exit(vcpu->run->exit_reason, r); 2132 break; 2133 case KVM_GET_REGS: { 2134 struct kvm_regs *kvm_regs; 2135 2136 r = -ENOMEM; 2137 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 2138 if (!kvm_regs) 2139 goto out; 2140 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 2141 if (r) 2142 goto out_free1; 2143 r = -EFAULT; 2144 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 2145 goto out_free1; 2146 r = 0; 2147 out_free1: 2148 kfree(kvm_regs); 2149 break; 2150 } 2151 case KVM_SET_REGS: { 2152 struct kvm_regs *kvm_regs; 2153 2154 r = -ENOMEM; 2155 kvm_regs = memdup_user(argp, sizeof(*kvm_regs)); 2156 if (IS_ERR(kvm_regs)) { 2157 r = PTR_ERR(kvm_regs); 2158 goto out; 2159 } 2160 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 2161 kfree(kvm_regs); 2162 break; 2163 } 2164 case KVM_GET_SREGS: { 2165 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 2166 r = -ENOMEM; 2167 if (!kvm_sregs) 2168 goto out; 2169 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 2170 if (r) 2171 goto out; 2172 r = -EFAULT; 2173 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 2174 goto out; 2175 r = 0; 2176 break; 2177 } 2178 case KVM_SET_SREGS: { 2179 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); 2180 if (IS_ERR(kvm_sregs)) { 2181 r = PTR_ERR(kvm_sregs); 2182 kvm_sregs = NULL; 2183 goto out; 2184 } 2185 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 2186 break; 2187 } 2188 case KVM_GET_MP_STATE: { 2189 struct kvm_mp_state mp_state; 2190 2191 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 2192 if (r) 2193 goto out; 2194 r = -EFAULT; 2195 if (copy_to_user(argp, &mp_state, sizeof(mp_state))) 2196 goto out; 2197 r = 0; 2198 break; 2199 } 2200 case KVM_SET_MP_STATE: { 2201 struct kvm_mp_state mp_state; 2202 2203 r = -EFAULT; 2204 if (copy_from_user(&mp_state, argp, sizeof(mp_state))) 2205 goto out; 2206 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 2207 break; 2208 } 2209 case KVM_TRANSLATE: { 2210 struct kvm_translation tr; 2211 2212 r = -EFAULT; 2213 if (copy_from_user(&tr, argp, sizeof(tr))) 2214 goto out; 2215 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 2216 if (r) 2217 goto out; 2218 r = -EFAULT; 2219 if (copy_to_user(argp, &tr, sizeof(tr))) 2220 goto out; 2221 r = 0; 2222 break; 2223 } 2224 case KVM_SET_GUEST_DEBUG: { 2225 struct kvm_guest_debug dbg; 2226 2227 r = -EFAULT; 2228 if (copy_from_user(&dbg, argp, sizeof(dbg))) 2229 goto out; 2230 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 2231 break; 2232 } 2233 case KVM_SET_SIGNAL_MASK: { 2234 struct kvm_signal_mask __user *sigmask_arg = argp; 2235 struct kvm_signal_mask kvm_sigmask; 2236 sigset_t sigset, *p; 2237 2238 p = NULL; 2239 if (argp) { 2240 r = -EFAULT; 2241 if (copy_from_user(&kvm_sigmask, argp, 2242 sizeof(kvm_sigmask))) 2243 goto out; 2244 r = -EINVAL; 2245 if (kvm_sigmask.len != sizeof(sigset)) 2246 goto out; 2247 r = -EFAULT; 2248 if (copy_from_user(&sigset, sigmask_arg->sigset, 2249 sizeof(sigset))) 2250 goto out; 2251 p = &sigset; 2252 } 2253 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); 2254 break; 2255 } 2256 case KVM_GET_FPU: { 2257 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 2258 r = -ENOMEM; 2259 if (!fpu) 2260 goto out; 2261 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); 2262 if 
(r) 2263 goto out; 2264 r = -EFAULT; 2265 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) 2266 goto out; 2267 r = 0; 2268 break; 2269 } 2270 case KVM_SET_FPU: { 2271 fpu = memdup_user(argp, sizeof(*fpu)); 2272 if (IS_ERR(fpu)) { 2273 r = PTR_ERR(fpu); 2274 fpu = NULL; 2275 goto out; 2276 } 2277 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 2278 break; 2279 } 2280 default: 2281 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 2282 } 2283 out: 2284 vcpu_put(vcpu); 2285 kfree(fpu); 2286 kfree(kvm_sregs); 2287 return r; 2288 } 2289 2290 #ifdef CONFIG_KVM_COMPAT 2291 static long kvm_vcpu_compat_ioctl(struct file *filp, 2292 unsigned int ioctl, unsigned long arg) 2293 { 2294 struct kvm_vcpu *vcpu = filp->private_data; 2295 void __user *argp = compat_ptr(arg); 2296 int r; 2297 2298 if (vcpu->kvm->mm != current->mm) 2299 return -EIO; 2300 2301 switch (ioctl) { 2302 case KVM_SET_SIGNAL_MASK: { 2303 struct kvm_signal_mask __user *sigmask_arg = argp; 2304 struct kvm_signal_mask kvm_sigmask; 2305 compat_sigset_t csigset; 2306 sigset_t sigset; 2307 2308 if (argp) { 2309 r = -EFAULT; 2310 if (copy_from_user(&kvm_sigmask, argp, 2311 sizeof(kvm_sigmask))) 2312 goto out; 2313 r = -EINVAL; 2314 if (kvm_sigmask.len != sizeof(csigset)) 2315 goto out; 2316 r = -EFAULT; 2317 if (copy_from_user(&csigset, sigmask_arg->sigset, 2318 sizeof(csigset))) 2319 goto out; 2320 sigset_from_compat(&sigset, &csigset); 2321 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 2322 } else 2323 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL); 2324 break; 2325 } 2326 default: 2327 r = kvm_vcpu_ioctl(filp, ioctl, arg); 2328 } 2329 2330 out: 2331 return r; 2332 } 2333 #endif 2334 2335 static int kvm_device_ioctl_attr(struct kvm_device *dev, 2336 int (*accessor)(struct kvm_device *dev, 2337 struct kvm_device_attr *attr), 2338 unsigned long arg) 2339 { 2340 struct kvm_device_attr attr; 2341 2342 if (!accessor) 2343 return -EPERM; 2344 2345 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) 2346 return -EFAULT; 2347 2348 return accessor(dev, &attr); 2349 } 2350 2351 static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, 2352 unsigned long arg) 2353 { 2354 struct kvm_device *dev = filp->private_data; 2355 2356 switch (ioctl) { 2357 case KVM_SET_DEVICE_ATTR: 2358 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg); 2359 case KVM_GET_DEVICE_ATTR: 2360 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg); 2361 case KVM_HAS_DEVICE_ATTR: 2362 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg); 2363 default: 2364 if (dev->ops->ioctl) 2365 return dev->ops->ioctl(dev, ioctl, arg); 2366 2367 return -ENOTTY; 2368 } 2369 } 2370 2371 static int kvm_device_release(struct inode *inode, struct file *filp) 2372 { 2373 struct kvm_device *dev = filp->private_data; 2374 struct kvm *kvm = dev->kvm; 2375 2376 kvm_put_kvm(kvm); 2377 return 0; 2378 } 2379 2380 static const struct file_operations kvm_device_fops = { 2381 .unlocked_ioctl = kvm_device_ioctl, 2382 #ifdef CONFIG_KVM_COMPAT 2383 .compat_ioctl = kvm_device_ioctl, 2384 #endif 2385 .release = kvm_device_release, 2386 }; 2387 2388 struct kvm_device *kvm_device_from_filp(struct file *filp) 2389 { 2390 if (filp->f_op != &kvm_device_fops) 2391 return NULL; 2392 2393 return filp->private_data; 2394 } 2395 2396 static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { 2397 #ifdef CONFIG_KVM_MPIC 2398 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, 2399 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, 2400 #endif 2401 2402 #ifdef CONFIG_KVM_XICS 2403 [KVM_DEV_TYPE_XICS] 
= &kvm_xics_ops, 2404 #endif 2405 }; 2406 2407 int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) 2408 { 2409 if (type >= ARRAY_SIZE(kvm_device_ops_table)) 2410 return -ENOSPC; 2411 2412 if (kvm_device_ops_table[type] != NULL) 2413 return -EEXIST; 2414 2415 kvm_device_ops_table[type] = ops; 2416 return 0; 2417 } 2418 2419 void kvm_unregister_device_ops(u32 type) 2420 { 2421 if (kvm_device_ops_table[type] != NULL) 2422 kvm_device_ops_table[type] = NULL; 2423 } 2424 2425 static int kvm_ioctl_create_device(struct kvm *kvm, 2426 struct kvm_create_device *cd) 2427 { 2428 struct kvm_device_ops *ops = NULL; 2429 struct kvm_device *dev; 2430 bool test = cd->flags & KVM_CREATE_DEVICE_TEST; 2431 int ret; 2432 2433 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) 2434 return -ENODEV; 2435 2436 ops = kvm_device_ops_table[cd->type]; 2437 if (ops == NULL) 2438 return -ENODEV; 2439 2440 if (test) 2441 return 0; 2442 2443 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 2444 if (!dev) 2445 return -ENOMEM; 2446 2447 dev->ops = ops; 2448 dev->kvm = kvm; 2449 2450 ret = ops->create(dev, cd->type); 2451 if (ret < 0) { 2452 kfree(dev); 2453 return ret; 2454 } 2455 2456 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); 2457 if (ret < 0) { 2458 ops->destroy(dev); 2459 return ret; 2460 } 2461 2462 list_add(&dev->vm_node, &kvm->devices); 2463 kvm_get_kvm(kvm); 2464 cd->fd = ret; 2465 return 0; 2466 } 2467 2468 static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) 2469 { 2470 switch (arg) { 2471 case KVM_CAP_USER_MEMORY: 2472 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 2473 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 2474 #ifdef CONFIG_KVM_APIC_ARCHITECTURE 2475 case KVM_CAP_SET_BOOT_CPU_ID: 2476 #endif 2477 case KVM_CAP_INTERNAL_ERROR_DATA: 2478 #ifdef CONFIG_HAVE_KVM_MSI 2479 case KVM_CAP_SIGNAL_MSI: 2480 #endif 2481 #ifdef CONFIG_HAVE_KVM_IRQFD 2482 case KVM_CAP_IRQFD: 2483 case KVM_CAP_IRQFD_RESAMPLE: 2484 #endif 2485 case KVM_CAP_CHECK_EXTENSION_VM: 2486 return 1; 2487 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 2488 case KVM_CAP_IRQ_ROUTING: 2489 return KVM_MAX_IRQ_ROUTES; 2490 #endif 2491 default: 2492 break; 2493 } 2494 return kvm_vm_ioctl_check_extension(kvm, arg); 2495 } 2496 2497 static long kvm_vm_ioctl(struct file *filp, 2498 unsigned int ioctl, unsigned long arg) 2499 { 2500 struct kvm *kvm = filp->private_data; 2501 void __user *argp = (void __user *)arg; 2502 int r; 2503 2504 if (kvm->mm != current->mm) 2505 return -EIO; 2506 switch (ioctl) { 2507 case KVM_CREATE_VCPU: 2508 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 2509 break; 2510 case KVM_SET_USER_MEMORY_REGION: { 2511 struct kvm_userspace_memory_region kvm_userspace_mem; 2512 2513 r = -EFAULT; 2514 if (copy_from_user(&kvm_userspace_mem, argp, 2515 sizeof(kvm_userspace_mem))) 2516 goto out; 2517 2518 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); 2519 break; 2520 } 2521 case KVM_GET_DIRTY_LOG: { 2522 struct kvm_dirty_log log; 2523 2524 r = -EFAULT; 2525 if (copy_from_user(&log, argp, sizeof(log))) 2526 goto out; 2527 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2528 break; 2529 } 2530 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2531 case KVM_REGISTER_COALESCED_MMIO: { 2532 struct kvm_coalesced_mmio_zone zone; 2533 2534 r = -EFAULT; 2535 if (copy_from_user(&zone, argp, sizeof(zone))) 2536 goto out; 2537 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 2538 break; 2539 } 2540 case KVM_UNREGISTER_COALESCED_MMIO: { 2541 struct kvm_coalesced_mmio_zone zone; 2542 2543 r = -EFAULT; 2544 if 
(copy_from_user(&zone, argp, sizeof(zone))) 2545 goto out; 2546 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 2547 break; 2548 } 2549 #endif 2550 case KVM_IRQFD: { 2551 struct kvm_irqfd data; 2552 2553 r = -EFAULT; 2554 if (copy_from_user(&data, argp, sizeof(data))) 2555 goto out; 2556 r = kvm_irqfd(kvm, &data); 2557 break; 2558 } 2559 case KVM_IOEVENTFD: { 2560 struct kvm_ioeventfd data; 2561 2562 r = -EFAULT; 2563 if (copy_from_user(&data, argp, sizeof(data))) 2564 goto out; 2565 r = kvm_ioeventfd(kvm, &data); 2566 break; 2567 } 2568 #ifdef CONFIG_KVM_APIC_ARCHITECTURE 2569 case KVM_SET_BOOT_CPU_ID: 2570 r = 0; 2571 mutex_lock(&kvm->lock); 2572 if (atomic_read(&kvm->online_vcpus) != 0) 2573 r = -EBUSY; 2574 else 2575 kvm->bsp_vcpu_id = arg; 2576 mutex_unlock(&kvm->lock); 2577 break; 2578 #endif 2579 #ifdef CONFIG_HAVE_KVM_MSI 2580 case KVM_SIGNAL_MSI: { 2581 struct kvm_msi msi; 2582 2583 r = -EFAULT; 2584 if (copy_from_user(&msi, argp, sizeof(msi))) 2585 goto out; 2586 r = kvm_send_userspace_msi(kvm, &msi); 2587 break; 2588 } 2589 #endif 2590 #ifdef __KVM_HAVE_IRQ_LINE 2591 case KVM_IRQ_LINE_STATUS: 2592 case KVM_IRQ_LINE: { 2593 struct kvm_irq_level irq_event; 2594 2595 r = -EFAULT; 2596 if (copy_from_user(&irq_event, argp, sizeof(irq_event))) 2597 goto out; 2598 2599 r = kvm_vm_ioctl_irq_line(kvm, &irq_event, 2600 ioctl == KVM_IRQ_LINE_STATUS); 2601 if (r) 2602 goto out; 2603 2604 r = -EFAULT; 2605 if (ioctl == KVM_IRQ_LINE_STATUS) { 2606 if (copy_to_user(argp, &irq_event, sizeof(irq_event))) 2607 goto out; 2608 } 2609 2610 r = 0; 2611 break; 2612 } 2613 #endif 2614 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 2615 case KVM_SET_GSI_ROUTING: { 2616 struct kvm_irq_routing routing; 2617 struct kvm_irq_routing __user *urouting; 2618 struct kvm_irq_routing_entry *entries; 2619 2620 r = -EFAULT; 2621 if (copy_from_user(&routing, argp, sizeof(routing))) 2622 goto out; 2623 r = -EINVAL; 2624 if (routing.nr >= KVM_MAX_IRQ_ROUTES) 2625 goto out; 2626 if (routing.flags) 2627 goto out; 2628 r = -ENOMEM; 2629 entries = vmalloc(routing.nr * sizeof(*entries)); 2630 if (!entries) 2631 goto out; 2632 r = -EFAULT; 2633 urouting = argp; 2634 if (copy_from_user(entries, urouting->entries, 2635 routing.nr * sizeof(*entries))) 2636 goto out_free_irq_routing; 2637 r = kvm_set_irq_routing(kvm, entries, routing.nr, 2638 routing.flags); 2639 out_free_irq_routing: 2640 vfree(entries); 2641 break; 2642 } 2643 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ 2644 case KVM_CREATE_DEVICE: { 2645 struct kvm_create_device cd; 2646 2647 r = -EFAULT; 2648 if (copy_from_user(&cd, argp, sizeof(cd))) 2649 goto out; 2650 2651 r = kvm_ioctl_create_device(kvm, &cd); 2652 if (r) 2653 goto out; 2654 2655 r = -EFAULT; 2656 if (copy_to_user(argp, &cd, sizeof(cd))) 2657 goto out; 2658 2659 r = 0; 2660 break; 2661 } 2662 case KVM_CHECK_EXTENSION: 2663 r = kvm_vm_ioctl_check_extension_generic(kvm, arg); 2664 break; 2665 default: 2666 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 2667 } 2668 out: 2669 return r; 2670 } 2671 2672 #ifdef CONFIG_KVM_COMPAT 2673 struct compat_kvm_dirty_log { 2674 __u32 slot; 2675 __u32 padding1; 2676 union { 2677 compat_uptr_t dirty_bitmap; /* one bit per page */ 2678 __u64 padding2; 2679 }; 2680 }; 2681 2682 static long kvm_vm_compat_ioctl(struct file *filp, 2683 unsigned int ioctl, unsigned long arg) 2684 { 2685 struct kvm *kvm = filp->private_data; 2686 int r; 2687 2688 if (kvm->mm != current->mm) 2689 return -EIO; 2690 switch (ioctl) { 2691 case KVM_GET_DIRTY_LOG: { 2692 struct compat_kvm_dirty_log compat_log; 
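		/*
		 * The compat layout differs only in the width of the dirty
		 * bitmap pointer; compat_ptr() widens it below before the
		 * request is handed to the native kvm_vm_ioctl_get_dirty_log().
		 */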
2693 struct kvm_dirty_log log; 2694 2695 r = -EFAULT; 2696 if (copy_from_user(&compat_log, (void __user *)arg, 2697 sizeof(compat_log))) 2698 goto out; 2699 log.slot = compat_log.slot; 2700 log.padding1 = compat_log.padding1; 2701 log.padding2 = compat_log.padding2; 2702 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); 2703 2704 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2705 break; 2706 } 2707 default: 2708 r = kvm_vm_ioctl(filp, ioctl, arg); 2709 } 2710 2711 out: 2712 return r; 2713 } 2714 #endif 2715 2716 static struct file_operations kvm_vm_fops = { 2717 .release = kvm_vm_release, 2718 .unlocked_ioctl = kvm_vm_ioctl, 2719 #ifdef CONFIG_KVM_COMPAT 2720 .compat_ioctl = kvm_vm_compat_ioctl, 2721 #endif 2722 .llseek = noop_llseek, 2723 }; 2724 2725 static int kvm_dev_ioctl_create_vm(unsigned long type) 2726 { 2727 int r; 2728 struct kvm *kvm; 2729 2730 kvm = kvm_create_vm(type); 2731 if (IS_ERR(kvm)) 2732 return PTR_ERR(kvm); 2733 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2734 r = kvm_coalesced_mmio_init(kvm); 2735 if (r < 0) { 2736 kvm_put_kvm(kvm); 2737 return r; 2738 } 2739 #endif 2740 r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR | O_CLOEXEC); 2741 if (r < 0) 2742 kvm_put_kvm(kvm); 2743 2744 return r; 2745 } 2746 2747 static long kvm_dev_ioctl(struct file *filp, 2748 unsigned int ioctl, unsigned long arg) 2749 { 2750 long r = -EINVAL; 2751 2752 switch (ioctl) { 2753 case KVM_GET_API_VERSION: 2754 if (arg) 2755 goto out; 2756 r = KVM_API_VERSION; 2757 break; 2758 case KVM_CREATE_VM: 2759 r = kvm_dev_ioctl_create_vm(arg); 2760 break; 2761 case KVM_CHECK_EXTENSION: 2762 r = kvm_vm_ioctl_check_extension_generic(NULL, arg); 2763 break; 2764 case KVM_GET_VCPU_MMAP_SIZE: 2765 if (arg) 2766 goto out; 2767 r = PAGE_SIZE; /* struct kvm_run */ 2768 #ifdef CONFIG_X86 2769 r += PAGE_SIZE; /* pio data page */ 2770 #endif 2771 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2772 r += PAGE_SIZE; /* coalesced mmio ring page */ 2773 #endif 2774 break; 2775 case KVM_TRACE_ENABLE: 2776 case KVM_TRACE_PAUSE: 2777 case KVM_TRACE_DISABLE: 2778 r = -EOPNOTSUPP; 2779 break; 2780 default: 2781 return kvm_arch_dev_ioctl(filp, ioctl, arg); 2782 } 2783 out: 2784 return r; 2785 } 2786 2787 static struct file_operations kvm_chardev_ops = { 2788 .unlocked_ioctl = kvm_dev_ioctl, 2789 .compat_ioctl = kvm_dev_ioctl, 2790 .llseek = noop_llseek, 2791 }; 2792 2793 static struct miscdevice kvm_dev = { 2794 KVM_MINOR, 2795 "kvm", 2796 &kvm_chardev_ops, 2797 }; 2798 2799 static void hardware_enable_nolock(void *junk) 2800 { 2801 int cpu = raw_smp_processor_id(); 2802 int r; 2803 2804 if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) 2805 return; 2806 2807 cpumask_set_cpu(cpu, cpus_hardware_enabled); 2808 2809 r = kvm_arch_hardware_enable(); 2810 2811 if (r) { 2812 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 2813 atomic_inc(&hardware_enable_failed); 2814 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu); 2815 } 2816 } 2817 2818 static void hardware_enable(void) 2819 { 2820 raw_spin_lock(&kvm_count_lock); 2821 if (kvm_usage_count) 2822 hardware_enable_nolock(NULL); 2823 raw_spin_unlock(&kvm_count_lock); 2824 } 2825 2826 static void hardware_disable_nolock(void *junk) 2827 { 2828 int cpu = raw_smp_processor_id(); 2829 2830 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) 2831 return; 2832 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 2833 kvm_arch_hardware_disable(); 2834 } 2835 2836 static void hardware_disable(void) 2837 { 2838 raw_spin_lock(&kvm_count_lock); 2839 if (kvm_usage_count) 2840 
hardware_disable_nolock(NULL); 2841 raw_spin_unlock(&kvm_count_lock); 2842 } 2843 2844 static void hardware_disable_all_nolock(void) 2845 { 2846 BUG_ON(!kvm_usage_count); 2847 2848 kvm_usage_count--; 2849 if (!kvm_usage_count) 2850 on_each_cpu(hardware_disable_nolock, NULL, 1); 2851 } 2852 2853 static void hardware_disable_all(void) 2854 { 2855 raw_spin_lock(&kvm_count_lock); 2856 hardware_disable_all_nolock(); 2857 raw_spin_unlock(&kvm_count_lock); 2858 } 2859 2860 static int hardware_enable_all(void) 2861 { 2862 int r = 0; 2863 2864 raw_spin_lock(&kvm_count_lock); 2865 2866 kvm_usage_count++; 2867 if (kvm_usage_count == 1) { 2868 atomic_set(&hardware_enable_failed, 0); 2869 on_each_cpu(hardware_enable_nolock, NULL, 1); 2870 2871 if (atomic_read(&hardware_enable_failed)) { 2872 hardware_disable_all_nolock(); 2873 r = -EBUSY; 2874 } 2875 } 2876 2877 raw_spin_unlock(&kvm_count_lock); 2878 2879 return r; 2880 } 2881 2882 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, 2883 void *v) 2884 { 2885 int cpu = (long)v; 2886 2887 val &= ~CPU_TASKS_FROZEN; 2888 switch (val) { 2889 case CPU_DYING: 2890 pr_info("kvm: disabling virtualization on CPU%d\n", 2891 cpu); 2892 hardware_disable(); 2893 break; 2894 case CPU_STARTING: 2895 pr_info("kvm: enabling virtualization on CPU%d\n", 2896 cpu); 2897 hardware_enable(); 2898 break; 2899 } 2900 return NOTIFY_OK; 2901 } 2902 2903 static int kvm_reboot(struct notifier_block *notifier, unsigned long val, 2904 void *v) 2905 { 2906 /* 2907 * Some (well, at least mine) BIOSes hang on reboot if 2908 * in vmx root mode. 2909 * 2910 * And Intel TXT required VMX off for all cpu when system shutdown. 2911 */ 2912 pr_info("kvm: exiting hardware virtualization\n"); 2913 kvm_rebooting = true; 2914 on_each_cpu(hardware_disable_nolock, NULL, 1); 2915 return NOTIFY_OK; 2916 } 2917 2918 static struct notifier_block kvm_reboot_notifier = { 2919 .notifier_call = kvm_reboot, 2920 .priority = 0, 2921 }; 2922 2923 static void kvm_io_bus_destroy(struct kvm_io_bus *bus) 2924 { 2925 int i; 2926 2927 for (i = 0; i < bus->dev_count; i++) { 2928 struct kvm_io_device *pos = bus->range[i].dev; 2929 2930 kvm_iodevice_destructor(pos); 2931 } 2932 kfree(bus); 2933 } 2934 2935 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1, 2936 const struct kvm_io_range *r2) 2937 { 2938 if (r1->addr < r2->addr) 2939 return -1; 2940 if (r1->addr + r1->len > r2->addr + r2->len) 2941 return 1; 2942 return 0; 2943 } 2944 2945 static int kvm_io_bus_sort_cmp(const void *p1, const void *p2) 2946 { 2947 return kvm_io_bus_cmp(p1, p2); 2948 } 2949 2950 static int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev, 2951 gpa_t addr, int len) 2952 { 2953 bus->range[bus->dev_count++] = (struct kvm_io_range) { 2954 .addr = addr, 2955 .len = len, 2956 .dev = dev, 2957 }; 2958 2959 sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range), 2960 kvm_io_bus_sort_cmp, NULL); 2961 2962 return 0; 2963 } 2964 2965 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, 2966 gpa_t addr, int len) 2967 { 2968 struct kvm_io_range *range, key; 2969 int off; 2970 2971 key = (struct kvm_io_range) { 2972 .addr = addr, 2973 .len = len, 2974 }; 2975 2976 range = bsearch(&key, bus->range, bus->dev_count, 2977 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp); 2978 if (range == NULL) 2979 return -ENOENT; 2980 2981 off = range - bus->range; 2982 2983 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0) 2984 off--; 2985 2986 return off; 2987 } 2988 2989 static 
int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 2990 struct kvm_io_range *range, const void *val) 2991 { 2992 int idx; 2993 2994 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 2995 if (idx < 0) 2996 return -EOPNOTSUPP; 2997 2998 while (idx < bus->dev_count && 2999 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 3000 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr, 3001 range->len, val)) 3002 return idx; 3003 idx++; 3004 } 3005 3006 return -EOPNOTSUPP; 3007 } 3008 3009 /* kvm_io_bus_write - called under kvm->slots_lock */ 3010 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 3011 int len, const void *val) 3012 { 3013 struct kvm_io_bus *bus; 3014 struct kvm_io_range range; 3015 int r; 3016 3017 range = (struct kvm_io_range) { 3018 .addr = addr, 3019 .len = len, 3020 }; 3021 3022 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 3023 r = __kvm_io_bus_write(vcpu, bus, &range, val); 3024 return r < 0 ? r : 0; 3025 } 3026 3027 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */ 3028 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, 3029 gpa_t addr, int len, const void *val, long cookie) 3030 { 3031 struct kvm_io_bus *bus; 3032 struct kvm_io_range range; 3033 3034 range = (struct kvm_io_range) { 3035 .addr = addr, 3036 .len = len, 3037 }; 3038 3039 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 3040 3041 /* First try the device referenced by cookie. */ 3042 if ((cookie >= 0) && (cookie < bus->dev_count) && 3043 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0)) 3044 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len, 3045 val)) 3046 return cookie; 3047 3048 /* 3049 * cookie contained garbage; fall back to search and return the 3050 * correct cookie value. 3051 */ 3052 return __kvm_io_bus_write(vcpu, bus, &range, val); 3053 } 3054 3055 static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 3056 struct kvm_io_range *range, void *val) 3057 { 3058 int idx; 3059 3060 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 3061 if (idx < 0) 3062 return -EOPNOTSUPP; 3063 3064 while (idx < bus->dev_count && 3065 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 3066 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr, 3067 range->len, val)) 3068 return idx; 3069 idx++; 3070 } 3071 3072 return -EOPNOTSUPP; 3073 } 3074 EXPORT_SYMBOL_GPL(kvm_io_bus_write); 3075 3076 /* kvm_io_bus_read - called under kvm->slots_lock */ 3077 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 3078 int len, void *val) 3079 { 3080 struct kvm_io_bus *bus; 3081 struct kvm_io_range range; 3082 int r; 3083 3084 range = (struct kvm_io_range) { 3085 .addr = addr, 3086 .len = len, 3087 }; 3088 3089 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 3090 r = __kvm_io_bus_read(vcpu, bus, &range, val); 3091 return r < 0 ? r : 0; 3092 } 3093 3094 3095 /* Caller must hold slots_lock. 
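 * Registration is copy-update: the bus is duplicated with room for the new
 * range, kvm_io_bus_insert_dev() keeps the copy sorted by address, the copy
 * is published with rcu_assign_pointer(), and the old bus is freed only
 * after synchronize_srcu_expedited().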
*/ 3096 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 3097 int len, struct kvm_io_device *dev) 3098 { 3099 struct kvm_io_bus *new_bus, *bus; 3100 3101 bus = kvm->buses[bus_idx]; 3102 /* exclude ioeventfd which is limited by maximum fd */ 3103 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) 3104 return -ENOSPC; 3105 3106 new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) * 3107 sizeof(struct kvm_io_range)), GFP_KERNEL); 3108 if (!new_bus) 3109 return -ENOMEM; 3110 memcpy(new_bus, bus, sizeof(*bus) + (bus->dev_count * 3111 sizeof(struct kvm_io_range))); 3112 kvm_io_bus_insert_dev(new_bus, dev, addr, len); 3113 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 3114 synchronize_srcu_expedited(&kvm->srcu); 3115 kfree(bus); 3116 3117 return 0; 3118 } 3119 3120 /* Caller must hold slots_lock. */ 3121 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, 3122 struct kvm_io_device *dev) 3123 { 3124 int i, r; 3125 struct kvm_io_bus *new_bus, *bus; 3126 3127 bus = kvm->buses[bus_idx]; 3128 r = -ENOENT; 3129 for (i = 0; i < bus->dev_count; i++) 3130 if (bus->range[i].dev == dev) { 3131 r = 0; 3132 break; 3133 } 3134 3135 if (r) 3136 return r; 3137 3138 new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) * 3139 sizeof(struct kvm_io_range)), GFP_KERNEL); 3140 if (!new_bus) 3141 return -ENOMEM; 3142 3143 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); 3144 new_bus->dev_count--; 3145 memcpy(new_bus->range + i, bus->range + i + 1, 3146 (new_bus->dev_count - i) * sizeof(struct kvm_io_range)); 3147 3148 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 3149 synchronize_srcu_expedited(&kvm->srcu); 3150 kfree(bus); 3151 return r; 3152 } 3153 3154 static struct notifier_block kvm_cpu_notifier = { 3155 .notifier_call = kvm_cpu_hotplug, 3156 }; 3157 3158 static int vm_stat_get(void *_offset, u64 *val) 3159 { 3160 unsigned offset = (long)_offset; 3161 struct kvm *kvm; 3162 3163 *val = 0; 3164 spin_lock(&kvm_lock); 3165 list_for_each_entry(kvm, &vm_list, vm_list) 3166 *val += *(u32 *)((void *)kvm + offset); 3167 spin_unlock(&kvm_lock); 3168 return 0; 3169 } 3170 3171 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n"); 3172 3173 static int vcpu_stat_get(void *_offset, u64 *val) 3174 { 3175 unsigned offset = (long)_offset; 3176 struct kvm *kvm; 3177 struct kvm_vcpu *vcpu; 3178 int i; 3179 3180 *val = 0; 3181 spin_lock(&kvm_lock); 3182 list_for_each_entry(kvm, &vm_list, vm_list) 3183 kvm_for_each_vcpu(i, vcpu, kvm) 3184 *val += *(u32 *)((void *)vcpu + offset); 3185 3186 spin_unlock(&kvm_lock); 3187 return 0; 3188 } 3189 3190 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n"); 3191 3192 static const struct file_operations *stat_fops[] = { 3193 [KVM_STAT_VCPU] = &vcpu_stat_fops, 3194 [KVM_STAT_VM] = &vm_stat_fops, 3195 }; 3196 3197 static int kvm_init_debug(void) 3198 { 3199 int r = -EEXIST; 3200 struct kvm_stats_debugfs_item *p; 3201 3202 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); 3203 if (kvm_debugfs_dir == NULL) 3204 goto out; 3205 3206 for (p = debugfs_entries; p->name; ++p) { 3207 p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir, 3208 (void *)(long)p->offset, 3209 stat_fops[p->kind]); 3210 if (p->dentry == NULL) 3211 goto out_dir; 3212 } 3213 3214 return 0; 3215 3216 out_dir: 3217 debugfs_remove_recursive(kvm_debugfs_dir); 3218 out: 3219 return r; 3220 } 3221 3222 static void kvm_exit_debug(void) 3223 { 3224 struct kvm_stats_debugfs_item *p; 3225 3226 for (p = 
debugfs_entries; p->name; ++p) 3227 debugfs_remove(p->dentry); 3228 debugfs_remove(kvm_debugfs_dir); 3229 } 3230 3231 static int kvm_suspend(void) 3232 { 3233 if (kvm_usage_count) 3234 hardware_disable_nolock(NULL); 3235 return 0; 3236 } 3237 3238 static void kvm_resume(void) 3239 { 3240 if (kvm_usage_count) { 3241 WARN_ON(raw_spin_is_locked(&kvm_count_lock)); 3242 hardware_enable_nolock(NULL); 3243 } 3244 } 3245 3246 static struct syscore_ops kvm_syscore_ops = { 3247 .suspend = kvm_suspend, 3248 .resume = kvm_resume, 3249 }; 3250 3251 static inline 3252 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) 3253 { 3254 return container_of(pn, struct kvm_vcpu, preempt_notifier); 3255 } 3256 3257 static void kvm_sched_in(struct preempt_notifier *pn, int cpu) 3258 { 3259 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 3260 3261 if (vcpu->preempted) 3262 vcpu->preempted = false; 3263 3264 kvm_arch_sched_in(vcpu, cpu); 3265 3266 kvm_arch_vcpu_load(vcpu, cpu); 3267 } 3268 3269 static void kvm_sched_out(struct preempt_notifier *pn, 3270 struct task_struct *next) 3271 { 3272 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 3273 3274 if (current->state == TASK_RUNNING) 3275 vcpu->preempted = true; 3276 kvm_arch_vcpu_put(vcpu); 3277 } 3278 3279 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, 3280 struct module *module) 3281 { 3282 int r; 3283 int cpu; 3284 3285 r = kvm_arch_init(opaque); 3286 if (r) 3287 goto out_fail; 3288 3289 /* 3290 * kvm_arch_init makes sure there's at most one caller 3291 * for architectures that support multiple implementations, 3292 * like intel and amd on x86. 3293 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating 3294 * conflicts in case kvm is already setup for another implementation. 3295 */ 3296 r = kvm_irqfd_init(); 3297 if (r) 3298 goto out_irqfd; 3299 3300 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { 3301 r = -ENOMEM; 3302 goto out_free_0; 3303 } 3304 3305 r = kvm_arch_hardware_setup(); 3306 if (r < 0) 3307 goto out_free_0a; 3308 3309 for_each_online_cpu(cpu) { 3310 smp_call_function_single(cpu, 3311 kvm_arch_check_processor_compat, 3312 &r, 1); 3313 if (r < 0) 3314 goto out_free_1; 3315 } 3316 3317 r = register_cpu_notifier(&kvm_cpu_notifier); 3318 if (r) 3319 goto out_free_2; 3320 register_reboot_notifier(&kvm_reboot_notifier); 3321 3322 /* A kmem cache lets us meet the alignment requirements of fx_save. 
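 * If the arch passes vcpu_align == 0, __alignof__(struct kvm_vcpu) is used
 * instead.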
*/ 3323 if (!vcpu_align) 3324 vcpu_align = __alignof__(struct kvm_vcpu); 3325 kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align, 3326 0, NULL); 3327 if (!kvm_vcpu_cache) { 3328 r = -ENOMEM; 3329 goto out_free_3; 3330 } 3331 3332 r = kvm_async_pf_init(); 3333 if (r) 3334 goto out_free; 3335 3336 kvm_chardev_ops.owner = module; 3337 kvm_vm_fops.owner = module; 3338 kvm_vcpu_fops.owner = module; 3339 3340 r = misc_register(&kvm_dev); 3341 if (r) { 3342 pr_err("kvm: misc device register failed\n"); 3343 goto out_unreg; 3344 } 3345 3346 register_syscore_ops(&kvm_syscore_ops); 3347 3348 kvm_preempt_ops.sched_in = kvm_sched_in; 3349 kvm_preempt_ops.sched_out = kvm_sched_out; 3350 3351 r = kvm_init_debug(); 3352 if (r) { 3353 pr_err("kvm: create debugfs files failed\n"); 3354 goto out_undebugfs; 3355 } 3356 3357 r = kvm_vfio_ops_init(); 3358 WARN_ON(r); 3359 3360 return 0; 3361 3362 out_undebugfs: 3363 unregister_syscore_ops(&kvm_syscore_ops); 3364 misc_deregister(&kvm_dev); 3365 out_unreg: 3366 kvm_async_pf_deinit(); 3367 out_free: 3368 kmem_cache_destroy(kvm_vcpu_cache); 3369 out_free_3: 3370 unregister_reboot_notifier(&kvm_reboot_notifier); 3371 unregister_cpu_notifier(&kvm_cpu_notifier); 3372 out_free_2: 3373 out_free_1: 3374 kvm_arch_hardware_unsetup(); 3375 out_free_0a: 3376 free_cpumask_var(cpus_hardware_enabled); 3377 out_free_0: 3378 kvm_irqfd_exit(); 3379 out_irqfd: 3380 kvm_arch_exit(); 3381 out_fail: 3382 return r; 3383 } 3384 EXPORT_SYMBOL_GPL(kvm_init); 3385 3386 void kvm_exit(void) 3387 { 3388 kvm_exit_debug(); 3389 misc_deregister(&kvm_dev); 3390 kmem_cache_destroy(kvm_vcpu_cache); 3391 kvm_async_pf_deinit(); 3392 unregister_syscore_ops(&kvm_syscore_ops); 3393 unregister_reboot_notifier(&kvm_reboot_notifier); 3394 unregister_cpu_notifier(&kvm_cpu_notifier); 3395 on_each_cpu(hardware_disable_nolock, NULL, 1); 3396 kvm_arch_hardware_unsetup(); 3397 kvm_arch_exit(); 3398 kvm_irqfd_exit(); 3399 free_cpumask_var(cpus_hardware_enabled); 3400 kvm_vfio_ops_exit(); 3401 } 3402 EXPORT_SYMBOL_GPL(kvm_exit); 3403
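/*
 * Minimal userspace sketch (illustrative only, kept in a comment so it is
 * not built with this file) of how the descriptors implemented above are
 * typically consumed: /dev/kvm (kvm_dev_ioctl), the VM fd (kvm_vm_ioctl)
 * and the vcpu fd (kvm_vcpu_ioctl, kvm_vcpu_mmap).  Error handling, guest
 * memory setup via KVM_SET_USER_MEMORY_REGION and arch register setup are
 * omitted, so KVM_RUN will exit almost immediately; the point is only the
 * fd and mmap plumbing.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/mman.h>
 *	#include <linux/kvm.h>
 *
 *	int main(void)
 *	{
 *		int kvm, vm, vcpu;
 *		long mmap_size;
 *		struct kvm_run *run;
 *
 *		kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
 *		if (kvm < 0 || ioctl(kvm, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
 *			return 1;
 *
 *		vm = ioctl(kvm, KVM_CREATE_VM, 0);
 *		vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);
 *
 *		mmap_size = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, 0);
 *		run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
 *			   MAP_SHARED, vcpu, 0);
 *		if (run == MAP_FAILED)
 *			return 1;
 *
 *		ioctl(vcpu, KVM_RUN, 0);
 *		printf("exit_reason = %u\n", run->exit_reason);
 *		return 0;
 *	}
 */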