/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "iodev.h"

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>

#include <asm/processor.h>
#include <asm/io.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>

#include "coalesced_mmio.h"
#include "async_pf.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/*
 * Ordering of locks:
 *
 *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

DEFINE_RAW_SPINLOCK(kvm_lock);
LIST_HEAD(vm_list);

static cpumask_var_t cpus_hardware_enabled;
static int kvm_usage_count = 0;
static atomic_t hardware_enable_failed;

struct kmem_cache *kvm_vcpu_cache;
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);

static __read_mostly struct preempt_ops kvm_preempt_ops;

struct dentry *kvm_debugfs_dir;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
#ifdef CONFIG_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				  unsigned long arg);
#endif
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);

static bool largepages_enabled = true;

static struct page *hwpoison_page;
static pfn_t hwpoison_pfn;

struct page *fault_page;
pfn_t fault_pfn;

inline int kvm_is_mmio_pfn(pfn_t pfn)
{
	if (pfn_valid(pfn)) {
		int reserved;
		struct page *tail = pfn_to_page(pfn);
		struct page *head = compound_trans_head(tail);
		reserved = PageReserved(head);
		if (head != tail) {
			/*
			 * "head" is not a dangling pointer
			 * (compound_trans_head takes care of that)
			 * but the hugepage may have been split
			 * from under us (and we may not hold a
			 * reference count on the head page so it can
			 * be reused before we run PageReferenced), so
			 * we have to check PageTail before returning
			 * what we just read.
			 */
			smp_rmb();
			if (PageTail(tail))
				return reserved;
		}
		return PageReserved(tail);
	}

	return true;
}

/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu;

	mutex_lock(&vcpu->mutex);
	if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
		/* The thread running this VCPU changed. */
		struct pid *oldpid = vcpu->pid;
		struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
		rcu_assign_pointer(vcpu->pid, newpid);
		synchronize_rcu();
		put_pid(oldpid);
	}
	cpu = get_cpu();
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
}

void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	preempt_enable();
	mutex_unlock(&vcpu->mutex);
}

static void ack_flush(void *_completed)
{
}

static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
	int i, cpu, me;
	cpumask_var_t cpus;
	bool called = true;
	struct kvm_vcpu *vcpu;

	zalloc_cpumask_var(&cpus, GFP_ATOMIC);

	me = get_cpu();
	kvm_for_each_vcpu(i, vcpu, kvm) {
		kvm_make_request(req, vcpu);
		cpu = vcpu->cpu;

		/* Set ->requests bit before we read ->mode */
		smp_mb();

		if (cpus != NULL && cpu != -1 && cpu != me &&
		    kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
			cpumask_set_cpu(cpu, cpus);
	}
	if (unlikely(cpus == NULL))
		smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
	else if (!cpumask_empty(cpus))
		smp_call_function_many(cpus, ack_flush, NULL, 1);
	else
		called = false;
	put_cpu();
	free_cpumask_var(cpus);
	return called;
}

void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	long dirty_count = kvm->tlbs_dirty;

	smp_mb();
	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		++kvm->stat.remote_tlb_flush;
	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
}

void kvm_reload_remote_mmus(struct kvm *kvm)
{
	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
}

int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	struct page *page;
	int r;

	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
	vcpu->pid = NULL;
	init_waitqueue_head(&vcpu->wq);
	kvm_async_pf_vcpu_init(vcpu);

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		r = -ENOMEM;
		goto fail;
	}
	vcpu->run = page_address(page);

	r = kvm_arch_vcpu_init(vcpu);
	if (r < 0)
		goto fail_free_run;
	return 0;

fail_free_run:
	free_page((unsigned long)vcpu->run);
fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_init);

void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
{
	put_pid(vcpu->pid);
	kvm_arch_vcpu_uninit(vcpu);
	free_page((unsigned long)vcpu->run);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
	return container_of(mn, struct kvm, mmu_notifier);
}

static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
					     struct mm_struct *mm,
					     unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush, idx;

	/*
	 * When ->invalidate_page runs, the linux pte has been zapped
	 * already but the page is still allocated until
	 * ->invalidate_page returns.  So if we increase the sequence
	 * here the kvm page fault will notice if the spte can't be
	 * established because the page is going to be freed.  If
	 * instead the kvm page fault establishes the spte before
	 * ->invalidate_page runs, kvm_unmap_hva will release it
	 * before returning.
	 *
	 * The sequence increase only needs to be seen at spin_unlock
	 * time, and not at spin_lock time.
	 *
	 * Increasing the sequence after the spin_unlock would be
	 * unsafe because the kvm page fault could then establish the
	 * pte after kvm_unmap_hva returned, without noticing the page
	 * is going to be freed.
	 */
	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);

	kvm->mmu_notifier_seq++;
	need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
	/* we have to flush the tlb before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long address,
					pte_t pte)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	kvm->mmu_notifier_seq++;
	kvm_set_spte_hva(kvm, address, pte);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
						    struct mm_struct *mm,
						    unsigned long start,
						    unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush = 0, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
	kvm->mmu_notifier_count++;
	for (; start < end; start += PAGE_SIZE)
		need_tlb_flush |= kvm_unmap_hva(kvm, start);
	need_tlb_flush |= kvm->tlbs_dirty;
	/* we have to flush the tlb before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
						  struct mm_struct *mm,
						  unsigned long start,
						  unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);

	spin_lock(&kvm->mmu_lock);
	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
	kvm->mmu_notifier_seq++;
	smp_wmb();
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease, which is ensured by the smp_wmb above
	 * in conjunction with the smp_rmb in mmu_notifier_retry().
	 */
	kvm->mmu_notifier_count--;
	spin_unlock(&kvm->mmu_lock);

	BUG_ON(kvm->mmu_notifier_count < 0);
}

static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);

	young = kvm_age_hva(kvm, address);
	if (young)
		kvm_flush_remote_tlbs(kvm);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return young;
}

static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	young = kvm_test_age_hva(kvm, address);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return young;
}

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	kvm_arch_flush_shadow(kvm);
	srcu_read_unlock(&kvm->srcu, idx);
}

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
	.test_young		= kvm_mmu_notifier_test_young,
	.change_pte		= kvm_mmu_notifier_change_pte,
	.release		= kvm_mmu_notifier_release,
};

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}

#else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	return 0;
}

#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */

static void kvm_init_memslots_id(struct kvm *kvm)
{
	int i;
	struct kvm_memslots *slots = kvm->memslots;

	for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
		slots->id_to_index[i] = slots->memslots[i].id = i;
}

static struct kvm *kvm_create_vm(unsigned long type)
{
	int r, i;
	struct kvm *kvm = kvm_arch_alloc_vm();

	if (!kvm)
		return ERR_PTR(-ENOMEM);

	r = kvm_arch_init_vm(kvm, type);
	if (r)
		goto out_err_nodisable;

	r = hardware_enable_all();
	if (r)
		goto out_err_nodisable;

#ifdef CONFIG_HAVE_KVM_IRQCHIP
	INIT_HLIST_HEAD(&kvm->mask_notifier_list);
	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif

	r = -ENOMEM;
	kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
	if (!kvm->memslots)
		goto out_err_nosrcu;
	kvm_init_memslots_id(kvm);
	if (init_srcu_struct(&kvm->srcu))
		goto out_err_nosrcu;
	for (i = 0; i < KVM_NR_BUSES; i++) {
		kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
					GFP_KERNEL);
		if (!kvm->buses[i])
			goto out_err;
	}

	spin_lock_init(&kvm->mmu_lock);
	kvm->mm = current->mm;
	atomic_inc(&kvm->mm->mm_count);
	kvm_eventfd_init(kvm);
	mutex_init(&kvm->lock);
	mutex_init(&kvm->irq_lock);
	mutex_init(&kvm->slots_lock);
	atomic_set(&kvm->users_count, 1);

	r = kvm_init_mmu_notifier(kvm);
	if (r)
		goto out_err;

	raw_spin_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	raw_spin_unlock(&kvm_lock);

	return kvm;

out_err:
	cleanup_srcu_struct(&kvm->srcu);
out_err_nosrcu:
	hardware_disable_all();
out_err_nodisable:
	for (i = 0; i < KVM_NR_BUSES; i++)
		kfree(kvm->buses[i]);
	kfree(kvm->memslots);
	kvm_arch_free_vm(kvm);
	return ERR_PTR(r);
}

static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	if (!memslot->dirty_bitmap)
		return;

	if (2 * kvm_dirty_bitmap_bytes(memslot) > PAGE_SIZE)
		vfree(memslot->dirty_bitmap);
	else
		kfree(memslot->dirty_bitmap);

	memslot->dirty_bitmap = NULL;
}

/*
 * Free any memory in @free but not in @dont.
 */
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
				  struct kvm_memory_slot *dont)
{
	if (!dont || free->rmap != dont->rmap)
		vfree(free->rmap);

	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
		kvm_destroy_dirty_bitmap(free);

	kvm_arch_free_memslot(free, dont);

	free->npages = 0;
	free->rmap = NULL;
}

void kvm_free_physmem(struct kvm *kvm)
{
	struct kvm_memslots *slots = kvm->memslots;
	struct kvm_memory_slot *memslot;

	kvm_for_each_memslot(memslot, slots)
		kvm_free_physmem_slot(memslot, NULL);

	kfree(kvm->memslots);
}

static void kvm_destroy_vm(struct kvm *kvm)
{
	int i;
	struct mm_struct *mm = kvm->mm;

	kvm_arch_sync_events(kvm);
	raw_spin_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	raw_spin_unlock(&kvm_lock);
	kvm_free_irq_routing(kvm);
	for (i = 0; i < KVM_NR_BUSES; i++)
		kvm_io_bus_destroy(kvm->buses[i]);
	kvm_coalesced_mmio_free(kvm);
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
#else
	kvm_arch_flush_shadow(kvm);
#endif
	kvm_arch_destroy_vm(kvm);
	kvm_free_physmem(kvm);
	cleanup_srcu_struct(&kvm->srcu);
	kvm_arch_free_vm(kvm);
	hardware_disable_all();
	mmdrop(mm);
}

void kvm_get_kvm(struct kvm *kvm)
{
	atomic_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);

void kvm_put_kvm(struct kvm *kvm)
{
	if (atomic_dec_and_test(&kvm->users_count))
		kvm_destroy_vm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);

static int kvm_vm_release(struct inode *inode, struct file *filp)
{
	struct kvm *kvm = filp->private_data;

	kvm_irqfd_release(kvm);

	kvm_put_kvm(kvm);
	return 0;
}

/*
 * Allocation size is twice as large as the actual dirty bitmap size.
 * See x86's kvm_vm_ioctl_get_dirty_log() for why this is needed.
 */
static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
{
#ifndef CONFIG_S390
	unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);

	if (dirty_bytes > PAGE_SIZE)
		memslot->dirty_bitmap = vzalloc(dirty_bytes);
	else
		memslot->dirty_bitmap = kzalloc(dirty_bytes, GFP_KERNEL);

	if (!memslot->dirty_bitmap)
		return -ENOMEM;

#endif /* !CONFIG_S390 */
	return 0;
}

static int cmp_memslot(const void *slot1, const void *slot2)
{
	struct kvm_memory_slot *s1, *s2;

	s1 = (struct kvm_memory_slot *)slot1;
	s2 = (struct kvm_memory_slot *)slot2;

	if (s1->npages < s2->npages)
		return 1;
	if (s1->npages > s2->npages)
		return -1;

	return 0;
}

/*
 * Sort the memslots based on their size, so that the larger slots
 * get a better fit.
 */
static void sort_memslots(struct kvm_memslots *slots)
{
	int i;

	sort(slots->memslots, KVM_MEM_SLOTS_NUM,
	     sizeof(struct kvm_memory_slot), cmp_memslot, NULL);

	for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
		slots->id_to_index[slots->memslots[i].id] = i;
}

void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new)
{
	if (new) {
		int id = new->id;
		struct kvm_memory_slot *old = id_to_memslot(slots, id);
		unsigned long npages = old->npages;

		*old = *new;
		if (new->npages != npages)
			sort_memslots(slots);
	}

	slots->generation++;
}

/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
 *
 * Must be called holding mmap_sem for write.
 */
int __kvm_set_memory_region(struct kvm *kvm,
			    struct kvm_userspace_memory_region *mem,
			    int user_alloc)
{
	int r;
	gfn_t base_gfn;
	unsigned long npages;
	unsigned long i;
	struct kvm_memory_slot *memslot;
	struct kvm_memory_slot old, new;
	struct kvm_memslots *slots, *old_memslots;

	r = -EINVAL;
	/* General sanity checks */
	if (mem->memory_size & (PAGE_SIZE - 1))
		goto out;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		goto out;
	/* We can read the guest memory with __xxx_user() later on. */
	if (user_alloc &&
	    ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
	     !access_ok(VERIFY_WRITE,
			(void __user *)(unsigned long)mem->userspace_addr,
			mem->memory_size)))
		goto out;
	if (mem->slot >= KVM_MEM_SLOTS_NUM)
		goto out;
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
		goto out;

	memslot = id_to_memslot(kvm->memslots, mem->slot);
	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
	npages = mem->memory_size >> PAGE_SHIFT;

	r = -EINVAL;
	if (npages > KVM_MEM_MAX_NR_PAGES)
		goto out;

	if (!npages)
		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;

	new = old = *memslot;

	new.id = mem->slot;
	new.base_gfn = base_gfn;
	new.npages = npages;
	new.flags = mem->flags;

	/* Disallow changing a memory slot's size. */
	r = -EINVAL;
	if (npages && old.npages && npages != old.npages)
		goto out_free;

	/* Check for overlaps */
	r = -EEXIST;
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *s = &kvm->memslots->memslots[i];

		if (s == memslot || !s->npages)
			continue;
		if (!((base_gfn + npages <= s->base_gfn) ||
		      (base_gfn >= s->base_gfn + s->npages)))
			goto out_free;
	}

	/* Free page dirty bitmap if unneeded */
	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
		new.dirty_bitmap = NULL;

	r = -ENOMEM;

	/* Allocate if a slot is being created */
	if (npages && !old.npages) {
		new.user_alloc = user_alloc;
		new.userspace_addr = mem->userspace_addr;
#ifndef CONFIG_S390
		new.rmap = vzalloc(npages * sizeof(*new.rmap));
		if (!new.rmap)
			goto out_free;
#endif /* not defined CONFIG_S390 */
		if (kvm_arch_create_memslot(&new, npages))
			goto out_free;
	}

	/* Allocate page dirty bitmap if needed */
	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
		if (kvm_create_dirty_bitmap(&new) < 0)
			goto out_free;
		/* destroy any largepage mappings for dirty tracking */
	}

	if (!npages) {
		struct kvm_memory_slot *slot;

		r = -ENOMEM;
		slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
				GFP_KERNEL);
		if (!slots)
			goto out_free;
		slot = id_to_memslot(slots, mem->slot);
		slot->flags |= KVM_MEMSLOT_INVALID;

		update_memslots(slots, NULL);

		old_memslots = kvm->memslots;
		rcu_assign_pointer(kvm->memslots, slots);
		synchronize_srcu_expedited(&kvm->srcu);
		/* From this point no new shadow pages pointing to a deleted
		 * memslot will be created.
		 *
		 * validation of sp->gfn happens in:
		 * 	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)
		 * 	- kvm_is_visible_gfn (mmu_check_roots)
		 */
		kvm_arch_flush_shadow(kvm);
		kfree(old_memslots);
	}

	r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc);
	if (r)
		goto out_free;

	/* map/unmap the pages in iommu page table */
	if (npages) {
		r = kvm_iommu_map_pages(kvm, &new);
		if (r)
			goto out_free;
	} else
		kvm_iommu_unmap_pages(kvm, &old);

	r = -ENOMEM;
	slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
			GFP_KERNEL);
	if (!slots)
		goto out_free;

	/* actual memory is freed via old in kvm_free_physmem_slot below */
	if (!npages) {
		new.rmap = NULL;
		new.dirty_bitmap = NULL;
		memset(&new.arch, 0, sizeof(new.arch));
	}

	update_memslots(slots, &new);
	old_memslots = kvm->memslots;
	rcu_assign_pointer(kvm->memslots, slots);
	synchronize_srcu_expedited(&kvm->srcu);

	kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);

	/*
	 * If the new memory slot is created, we need to clear all
	 * mmio sptes.
	 */
	if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
		kvm_arch_flush_shadow(kvm);

	kvm_free_physmem_slot(&old, &new);
	kfree(old_memslots);

	return 0;

out_free:
	kvm_free_physmem_slot(&new, &old);
out:
	return r;

}
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);

int kvm_set_memory_region(struct kvm *kvm,
			  struct kvm_userspace_memory_region *mem,
			  int user_alloc)
{
	int r;

	mutex_lock(&kvm->slots_lock);
	r = __kvm_set_memory_region(kvm, mem, user_alloc);
	mutex_unlock(&kvm->slots_lock);
	return r;
}
EXPORT_SYMBOL_GPL(kvm_set_memory_region);

int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
				   struct
				   kvm_userspace_memory_region *mem,
				   int user_alloc)
{
	if (mem->slot >= KVM_MEMORY_SLOTS)
		return -EINVAL;
	return kvm_set_memory_region(kvm, mem, user_alloc);
}

int kvm_get_dirty_log(struct kvm *kvm,
		      struct kvm_dirty_log *log, int *is_dirty)
{
	struct kvm_memory_slot *memslot;
	int r, i;
	unsigned long n;
	unsigned long any = 0;

	r = -EINVAL;
	if (log->slot >= KVM_MEMORY_SLOTS)
		goto out;

	memslot = id_to_memslot(kvm->memslots, log->slot);
	r = -ENOENT;
	if (!memslot->dirty_bitmap)
		goto out;

	n = kvm_dirty_bitmap_bytes(memslot);

	for (i = 0; !any && i < n/sizeof(long); ++i)
		any = memslot->dirty_bitmap[i];

	r = -EFAULT;
	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
		goto out;

	if (any)
		*is_dirty = 1;

	r = 0;
out:
	return r;
}

bool kvm_largepages_enabled(void)
{
	return largepages_enabled;
}

void kvm_disable_largepages(void)
{
	largepages_enabled = false;
}
EXPORT_SYMBOL_GPL(kvm_disable_largepages);

int is_error_page(struct page *page)
{
	return page == bad_page || page == hwpoison_page || page == fault_page;
}
EXPORT_SYMBOL_GPL(is_error_page);

int is_error_pfn(pfn_t pfn)
{
	return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
}
EXPORT_SYMBOL_GPL(is_error_pfn);

int is_hwpoison_pfn(pfn_t pfn)
{
	return pfn == hwpoison_pfn;
}
EXPORT_SYMBOL_GPL(is_hwpoison_pfn);

int is_fault_pfn(pfn_t pfn)
{
	return pfn == fault_pfn;
}
EXPORT_SYMBOL_GPL(is_fault_pfn);

int is_noslot_pfn(pfn_t pfn)
{
	return pfn == bad_pfn;
}
EXPORT_SYMBOL_GPL(is_noslot_pfn);

int is_invalid_pfn(pfn_t pfn)
{
	return pfn == hwpoison_pfn || pfn == fault_pfn;
}
EXPORT_SYMBOL_GPL(is_invalid_pfn);

static inline unsigned long bad_hva(void)
{
	return PAGE_OFFSET;
}

int kvm_is_error_hva(unsigned long addr)
{
	return addr == bad_hva();
}
EXPORT_SYMBOL_GPL(kvm_is_error_hva);

struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
}
EXPORT_SYMBOL_GPL(gfn_to_memslot);

int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);

	if (!memslot || memslot->id >= KVM_MEMORY_SLOTS ||
	    memslot->flags & KVM_MEMSLOT_INVALID)
		return 0;

	return 1;
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);

unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
{
	struct vm_area_struct *vma;
	unsigned long addr, size;

	size = PAGE_SIZE;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return PAGE_SIZE;

	down_read(&current->mm->mmap_sem);
	vma = find_vma(current->mm, addr);
	if (!vma)
		goto out;

	size = vma_kernel_pagesize(vma);

out:
	up_read(&current->mm->mmap_sem);

	return size;
}

static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
				     gfn_t *nr_pages)
{
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return bad_hva();

	if (nr_pages)
		*nr_pages = slot->npages - (gfn - slot->base_gfn);

	return gfn_to_hva_memslot(slot, gfn);
}

unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);

static pfn_t get_fault_pfn(void)
{
	get_page(fault_page);
	return fault_pfn;
}

int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
			 unsigned long start, int write, struct page **page)
{
	int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET;

	if (write)
		flags |= FOLL_WRITE;

	return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
}

static inline int check_user_page_hwpoison(unsigned long addr)
{
	int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;

	rc = __get_user_pages(current, current->mm, addr, 1,
			      flags, NULL, NULL, NULL);
	return rc == -EHWPOISON;
}

static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
			bool *async, bool write_fault, bool *writable)
{
	struct page *page[1];
	int npages = 0;
	pfn_t pfn;

	/* we can do it either atomically or asynchronously, not both */
	BUG_ON(atomic && async);

	BUG_ON(!write_fault && !writable);

	if (writable)
		*writable = true;

	if (atomic || async)
		npages = __get_user_pages_fast(addr, 1, 1, page);

	if (unlikely(npages != 1) && !atomic) {
		might_sleep();

		if (writable)
			*writable = write_fault;

		if (async) {
			down_read(&current->mm->mmap_sem);
			npages = get_user_page_nowait(current, current->mm,
						      addr, write_fault, page);
			up_read(&current->mm->mmap_sem);
		} else
			npages = get_user_pages_fast(addr, 1, write_fault,
						     page);

		/* map read fault as writable if possible */
		if (unlikely(!write_fault) && npages == 1) {
			struct page *wpage[1];

			npages = __get_user_pages_fast(addr, 1, 1, wpage);
			if (npages == 1) {
				*writable = true;
				put_page(page[0]);
				page[0] = wpage[0];
			}
			npages = 1;
		}
	}

	if (unlikely(npages != 1)) {
		struct vm_area_struct *vma;

		if (atomic)
			return get_fault_pfn();

		down_read(&current->mm->mmap_sem);
		if (npages == -EHWPOISON ||
			(!async && check_user_page_hwpoison(addr))) {
			up_read(&current->mm->mmap_sem);
			get_page(hwpoison_page);
			return page_to_pfn(hwpoison_page);
		}

		vma = find_vma_intersection(current->mm, addr, addr+1);

		if (vma == NULL)
			pfn = get_fault_pfn();
		else if ((vma->vm_flags & VM_PFNMAP)) {
			pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
				vma->vm_pgoff;
			BUG_ON(!kvm_is_mmio_pfn(pfn));
		} else {
			if (async && (vma->vm_flags & VM_WRITE))
				*async = true;
			pfn = get_fault_pfn();
		}
		up_read(&current->mm->mmap_sem);
	} else
		pfn = page_to_pfn(page[0]);

	return pfn;
}

pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
{
	return hva_to_pfn(kvm, addr, true, NULL, true, NULL);
}
EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);

static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
			  bool write_fault, bool *writable)
{
	unsigned long addr;

	if (async)
		*async = false;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr)) {
		get_page(bad_page);
		return page_to_pfn(bad_page);
	}

	return hva_to_pfn(kvm, addr, atomic, async, write_fault, writable);
}

pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
{
	return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);

pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
		       bool write_fault, bool *writable)
{
	return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_async);

pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
{
	return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn);

pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
		      bool *writable)
{
	return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);

pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
			 struct kvm_memory_slot *slot, gfn_t gfn)
{
	unsigned long addr = gfn_to_hva_memslot(slot, gfn);
	return hva_to_pfn(kvm, addr, false, NULL, true, NULL);
}

int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
			    int nr_pages)
{
	unsigned long addr;
	gfn_t entry;

	addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry);
	if (kvm_is_error_hva(addr))
		return -1;

	if (entry < nr_pages)
		return 0;

	return __get_user_pages_fast(addr, nr_pages, 1, pages);
}
EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);

struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
	pfn_t pfn;

	pfn = gfn_to_pfn(kvm, gfn);
	if (!kvm_is_mmio_pfn(pfn))
		return pfn_to_page(pfn);

	WARN_ON(kvm_is_mmio_pfn(pfn));

	get_page(bad_page);
	return bad_page;
}

EXPORT_SYMBOL_GPL(gfn_to_page);

void kvm_release_page_clean(struct page *page)
{
	kvm_release_pfn_clean(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);

void kvm_release_pfn_clean(pfn_t pfn)
{
	if (!kvm_is_mmio_pfn(pfn))
		put_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);

void kvm_release_page_dirty(struct page *page)
{
	kvm_release_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);

void kvm_release_pfn_dirty(pfn_t pfn)
{
	kvm_set_pfn_dirty(pfn);
	kvm_release_pfn_clean(pfn);
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);

void kvm_set_page_dirty(struct page *page)
{
	kvm_set_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_set_page_dirty);

void kvm_set_pfn_dirty(pfn_t pfn)
{
	if (!kvm_is_mmio_pfn(pfn)) {
		struct page *page = pfn_to_page(pfn);
		if (!PageReserved(page))
			SetPageDirty(page);
	}
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
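
/*
 * A minimal usage sketch for the pfn helpers above (illustration only, not a
 * helper defined by KVM; assumptions: the caller supplies a valid kvm, gfn,
 * data and len with len <= PAGE_SIZE, and the gfn is backed by a memslot):
 *
 *	pfn_t pfn = gfn_to_pfn(kvm, gfn);
 *
 *	if (!is_error_pfn(pfn)) {
 *		struct page *page = pfn_to_page(pfn);
 *		void *va = kmap(page);
 *
 *		memcpy(va, data, len);
 *		kunmap(page);
 *		kvm_release_pfn_dirty(pfn);	(marks the page dirty, then
 *						 drops the gfn_to_pfn reference)
 *	}
 */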

void kvm_set_pfn_accessed(pfn_t pfn)
{
	if (!kvm_is_mmio_pfn(pfn))
		mark_page_accessed(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);

void kvm_get_pfn(pfn_t pfn)
{
	if (!kvm_is_mmio_pfn(pfn))
		get_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_get_pfn);

static int next_segment(unsigned long len, int offset)
{
	if (len > PAGE_SIZE - offset)
		return PAGE_SIZE - offset;
	else
		return len;
}

int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
			int len)
{
	int r;
	unsigned long addr;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	r = __copy_from_user(data, (void __user *)addr + offset, len);
	if (r)
		return -EFAULT;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page);

int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		data += seg;
		++gfn;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest);

int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
			  unsigned long len)
{
	int r;
	unsigned long addr;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int offset = offset_in_page(gpa);

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	pagefault_disable();
	r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
	pagefault_enable();
	if (r)
		return -EFAULT;
	return 0;
}
EXPORT_SYMBOL(kvm_read_guest_atomic);

int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
			 int offset, int len)
{
	int r;
	unsigned long addr;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	r = __copy_to_user((void __user *)addr + offset, data, len);
	if (r)
		return -EFAULT;
	mark_page_dirty(kvm, gfn);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_guest_page);

int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
		    unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		data += seg;
		++gfn;
	}
	return 0;
}

int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			      gpa_t gpa)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	int offset = offset_in_page(gpa);
	gfn_t gfn = gpa >> PAGE_SHIFT;

	ghc->gpa = gpa;
	ghc->generation = slots->generation;
	ghc->memslot = gfn_to_memslot(kvm, gfn);
	ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
	if (!kvm_is_error_hva(ghc->hva))
		ghc->hva += offset;
	else
		return -EFAULT;

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);

int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			   void *data, unsigned long len)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	int r;

	if (slots->generation != ghc->generation)
		kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);

	if (kvm_is_error_hva(ghc->hva))
		return -EFAULT;

	r = __copy_to_user((void __user *)ghc->hva, data, len);
	if (r)
		return -EFAULT;
	mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_guest_cached);

int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			  void *data, unsigned long len)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	int r;

	if (slots->generation != ghc->generation)
		kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);

	if (kvm_is_error_hva(ghc->hva))
		return -EFAULT;

	r = __copy_from_user(data, (void __user *)ghc->hva, len);
	if (r)
		return -EFAULT;

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest_cached);

int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
{
	return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
				    offset, len);
}
EXPORT_SYMBOL_GPL(kvm_clear_guest_page);

int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		++gfn;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_clear_guest);

void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
			     gfn_t gfn)
{
	if (memslot && memslot->dirty_bitmap) {
		unsigned long rel_gfn = gfn - memslot->base_gfn;

		/* TODO: introduce set_bit_le() and use it */
		test_and_set_bit_le(rel_gfn, memslot->dirty_bitmap);
	}
}

void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *memslot;

	memslot = gfn_to_memslot(kvm, gfn);
	mark_page_dirty_in_slot(kvm, memslot, gfn);
}

/*
 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
 */
void kvm_vcpu_block(struct kvm_vcpu *vcpu)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);

		if (kvm_arch_vcpu_runnable(vcpu)) {
			kvm_make_request(KVM_REQ_UNHALT, vcpu);
			break;
		}
		if (kvm_cpu_has_pending_timer(vcpu))
			break;
		if (signal_pending(current))
			break;

		schedule();
	}

	finish_wait(&vcpu->wq, &wait);
}

#ifndef CONFIG_S390
/*
 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
 */
void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{
	int me;
	int cpu = vcpu->cpu;
	wait_queue_head_t *wqp;

	wqp = kvm_arch_vcpu_wq(vcpu);
	if (waitqueue_active(wqp)) {
		wake_up_interruptible(wqp);
		++vcpu->stat.halt_wakeup;
	}

	me = get_cpu();
	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
		if (kvm_arch_vcpu_should_kick(vcpu))
			smp_send_reschedule(cpu);
	put_cpu();
}
#endif /* !CONFIG_S390 */

void kvm_resched(struct kvm_vcpu *vcpu)
{
	if (!need_resched())
		return;
	cond_resched();
}
EXPORT_SYMBOL_GPL(kvm_resched);

bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
{
	struct pid *pid;
	struct task_struct *task = NULL;

	rcu_read_lock();
	pid = rcu_dereference(target->pid);
	if (pid)
		task = get_pid_task(target->pid, PIDTYPE_PID);
	rcu_read_unlock();
	if (!task)
		return false;
	if (task->flags & PF_VCPU) {
		put_task_struct(task);
		return false;
	}
	if (yield_to(task, 1)) {
		put_task_struct(task);
		return true;
	}
	put_task_struct(task);
	return false;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);

void kvm_vcpu_on_spin(struct kvm_vcpu *me)
{
	struct kvm *kvm = me->kvm;
	struct kvm_vcpu *vcpu;
	int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
	int yielded = 0;
	int pass;
	int i;

	/*
	 * We boost the priority of a VCPU that is runnable but not
	 * currently running, because it got preempted by something
	 * else and called schedule in __vcpu_run.  Hopefully that
	 * VCPU is holding the lock that we need and will release it.
	 * We approximate round-robin by starting at the last boosted VCPU.
	 */
	for (pass = 0; pass < 2 && !yielded; pass++) {
		kvm_for_each_vcpu(i, vcpu, kvm) {
			if (!pass && i < last_boosted_vcpu) {
				i = last_boosted_vcpu;
				continue;
			} else if (pass && i > last_boosted_vcpu)
				break;
			if (vcpu == me)
				continue;
			if (waitqueue_active(&vcpu->wq))
				continue;
			if (kvm_vcpu_yield_to(vcpu)) {
				kvm->last_boosted_vcpu = i;
				yielded = 1;
				break;
			}
		}
	}
}
EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);

static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct kvm_vcpu *vcpu = vma->vm_file->private_data;
	struct page *page;

	if (vmf->pgoff == 0)
		page = virt_to_page(vcpu->run);
#ifdef CONFIG_X86
	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
		page = virt_to_page(vcpu->arch.pio_data);
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
#endif
	else
		return kvm_arch_vcpu_fault(vcpu, vmf);
	get_page(page);
	vmf->page = page;
	return 0;
}

static const struct vm_operations_struct kvm_vcpu_vm_ops = {
	.fault = kvm_vcpu_fault,
};

static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vcpu_vm_ops;
	return 0;
}

static int kvm_vcpu_release(struct inode *inode, struct file *filp)
{
	struct kvm_vcpu *vcpu = filp->private_data;

	kvm_put_kvm(vcpu->kvm);
	return 0;
}

static struct file_operations kvm_vcpu_fops = {
	.release = kvm_vcpu_release,
	.unlocked_ioctl = kvm_vcpu_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = kvm_vcpu_compat_ioctl,
#endif
	.mmap = kvm_vcpu_mmap,
	.llseek = noop_llseek,
};

/*
 * Allocates an inode for the vcpu.
 */
static int create_vcpu_fd(struct kvm_vcpu *vcpu)
{
	return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
}

/*
 * Creates some virtual cpus.  Good luck creating more than one.
 */
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
{
	int r;
	struct kvm_vcpu *vcpu, *v;

	vcpu = kvm_arch_vcpu_create(kvm, id);
	if (IS_ERR(vcpu))
		return PTR_ERR(vcpu);

	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);

	r = kvm_arch_vcpu_setup(vcpu);
	if (r)
		goto vcpu_destroy;

	mutex_lock(&kvm->lock);
	if (!kvm_vcpu_compatible(vcpu)) {
		r = -EINVAL;
		goto unlock_vcpu_destroy;
	}
	if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
		r = -EINVAL;
		goto unlock_vcpu_destroy;
	}

	kvm_for_each_vcpu(r, v, kvm)
		if (v->vcpu_id == id) {
			r = -EEXIST;
			goto unlock_vcpu_destroy;
		}

	BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);

	/* Now it's all set up, let userspace reach it */
	kvm_get_kvm(kvm);
	r = create_vcpu_fd(vcpu);
	if (r < 0) {
		kvm_put_kvm(kvm);
		goto unlock_vcpu_destroy;
	}

	kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
	smp_wmb();
	atomic_inc(&kvm->online_vcpus);

	mutex_unlock(&kvm->lock);
	return r;

unlock_vcpu_destroy:
	mutex_unlock(&kvm->lock);
vcpu_destroy:
	kvm_arch_vcpu_destroy(vcpu);
	return r;
}

static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
{
	if (sigset) {
		sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
		vcpu->sigset_active = 1;
		vcpu->sigset = *sigset;
	} else
		vcpu->sigset_active = 0;
	return 0;
}

static long kvm_vcpu_ioctl(struct file *filp,
			   unsigned int ioctl, unsigned long arg)
{
	struct kvm_vcpu *vcpu = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;
	struct kvm_fpu *fpu = NULL;
	struct kvm_sregs *kvm_sregs = NULL;

	if (vcpu->kvm->mm != current->mm)
		return -EIO;

#if defined(CONFIG_S390) || defined(CONFIG_PPC)
	/*
	 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
	 * so vcpu_load() would break it.
	 */
	if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
		return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
#endif

	vcpu_load(vcpu);
	switch (ioctl) {
	case KVM_RUN:
		r = -EINVAL;
		if (arg)
			goto out;
		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
		trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
		break;
	case KVM_GET_REGS: {
		struct kvm_regs *kvm_regs;

		r = -ENOMEM;
		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
		if (!kvm_regs)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
		if (r)
			goto out_free1;
		r = -EFAULT;
		if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
			goto out_free1;
		r = 0;
out_free1:
		kfree(kvm_regs);
		break;
	}
	case KVM_SET_REGS: {
		struct kvm_regs *kvm_regs;

		r = -ENOMEM;
		kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
		if (IS_ERR(kvm_regs)) {
			r = PTR_ERR(kvm_regs);
			goto out;
		}
		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
		if (r)
			goto out_free2;
		r = 0;
out_free2:
		kfree(kvm_regs);
		break;
	}
	case KVM_GET_SREGS: {
		kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
		r = -ENOMEM;
		if (!kvm_sregs)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_SREGS: {
		kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
		if (IS_ERR(kvm_sregs)) {
			r = PTR_ERR(kvm_sregs);
			goto out;
		}
		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_GET_MP_STATE: {
		struct kvm_mp_state mp_state;

		r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &mp_state, sizeof mp_state))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_MP_STATE: {
		struct kvm_mp_state mp_state;

		r = -EFAULT;
		if (copy_from_user(&mp_state, argp, sizeof mp_state))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_TRANSLATE: {
		struct kvm_translation tr;

		r = -EFAULT;
		if (copy_from_user(&tr, argp, sizeof tr))
			goto out;
		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &tr, sizeof tr))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_GUEST_DEBUG: {
		struct kvm_guest_debug dbg;

		r = -EFAULT;
		if (copy_from_user(&dbg, argp, sizeof dbg))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_SIGNAL_MASK: {
		struct kvm_signal_mask __user *sigmask_arg = argp;
		struct kvm_signal_mask kvm_sigmask;
		sigset_t sigset, *p;

		p = NULL;
		if (argp) {
			r = -EFAULT;
			if (copy_from_user(&kvm_sigmask, argp,
					   sizeof kvm_sigmask))
				goto out;
			r = -EINVAL;
			if (kvm_sigmask.len != sizeof sigset)
				goto out;
			r = -EFAULT;
			if (copy_from_user(&sigset, sigmask_arg->sigset,
					   sizeof sigset))
				goto out;
			p = &sigset;
		}
		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
		break;
	}
	case KVM_GET_FPU: {
		fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
		r = -ENOMEM;
		if (!fpu)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_FPU: {
		fpu = memdup_user(argp, sizeof(*fpu));
		if (IS_ERR(fpu)) {
			r = PTR_ERR(fpu);
			goto out;
		}
		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
		if (r)
			goto out;
		r = 0;
		break;
	}
	default:
		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
	}
out:
	vcpu_put(vcpu);
	kfree(fpu);
	kfree(kvm_sregs);
	return r;
}

#ifdef CONFIG_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *filp,
				  unsigned int ioctl, unsigned long arg)
{
	struct kvm_vcpu *vcpu = filp->private_data;
	void __user *argp = compat_ptr(arg);
	int r;

	if (vcpu->kvm->mm != current->mm)
		return -EIO;

	switch (ioctl) {
	case KVM_SET_SIGNAL_MASK: {
		struct kvm_signal_mask __user *sigmask_arg = argp;
		struct kvm_signal_mask kvm_sigmask;
		compat_sigset_t csigset;
		sigset_t sigset;

		if (argp) {
			r = -EFAULT;
			if (copy_from_user(&kvm_sigmask, argp,
					   sizeof kvm_sigmask))
				goto out;
			r = -EINVAL;
			if (kvm_sigmask.len != sizeof csigset)
				goto out;
			r = -EFAULT;
			if (copy_from_user(&csigset, sigmask_arg->sigset,
					   sizeof csigset))
				goto out;
		}
		sigset_from_compat(&sigset, &csigset);
		r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
		break;
	}
	default:
		r = kvm_vcpu_ioctl(filp, ioctl, arg);
	}

out:
	return r;
}
#endif

static long kvm_vm_ioctl(struct file *filp,
			 unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;

	if (kvm->mm != current->mm)
		return -EIO;
	switch (ioctl) {
	case KVM_CREATE_VCPU:
		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
		if (r < 0)
			goto out;
		break;
	case KVM_SET_USER_MEMORY_REGION: {
		struct kvm_userspace_memory_region kvm_userspace_mem;

		r = -EFAULT;
		if (copy_from_user(&kvm_userspace_mem, argp,
				   sizeof kvm_userspace_mem))
			goto out;

		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
		if (r)
			goto out;
		break;
	}
	case KVM_GET_DIRTY_LOG: {
		struct kvm_dirty_log log;

		r = -EFAULT;
		if (copy_from_user(&log, argp, sizeof log))
			goto out;
		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
		if (r)
			goto out;
		break;
	}
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	case KVM_REGISTER_COALESCED_MMIO: {
		struct kvm_coalesced_mmio_zone zone;
		r = -EFAULT;
		if (copy_from_user(&zone, argp, sizeof zone))
			goto out;
		r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_UNREGISTER_COALESCED_MMIO: {
		struct kvm_coalesced_mmio_zone zone;
		r = -EFAULT;
		if (copy_from_user(&zone, argp, sizeof zone))
			goto out;
		r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
		if (r)
			goto out;
		r = 0;
		break;
	}
#endif
	case KVM_IRQFD: {
		struct kvm_irqfd data;

		r = -EFAULT;
		if (copy_from_user(&data, argp, sizeof data))
			goto out;
		r = kvm_irqfd(kvm, &data);
		break;
	}
	case KVM_IOEVENTFD: {
		struct kvm_ioeventfd data;

		r = -EFAULT;
		if (copy_from_user(&data, argp, sizeof data))
			goto out;
		r = kvm_ioeventfd(kvm, &data);
		break;
	}
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
	case KVM_SET_BOOT_CPU_ID:
		r = 0;
		mutex_lock(&kvm->lock);
		if (atomic_read(&kvm->online_vcpus) != 0)
			r = -EBUSY;
		else
			kvm->bsp_vcpu_id = arg;
		mutex_unlock(&kvm->lock);
		break;
#endif
#ifdef CONFIG_HAVE_KVM_MSI
	case KVM_SIGNAL_MSI: {
		struct kvm_msi msi;

		r = -EFAULT;
		if (copy_from_user(&msi, argp, sizeof msi))
			goto out;
		r = kvm_send_userspace_msi(kvm, &msi);
		break;
	}
#endif
	default:
		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
		if (r == -ENOTTY)
			r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
	}
out:
	return r;
}

#ifdef CONFIG_COMPAT
struct compat_kvm_dirty_log {
	__u32 slot;
	__u32 padding1;
	union {
		compat_uptr_t dirty_bitmap; /* one bit per page */
		__u64 padding2;
	};
};

static long kvm_vm_compat_ioctl(struct file *filp,
				unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	int r;

	if (kvm->mm != current->mm)
		return -EIO;
	switch (ioctl) {
	case KVM_GET_DIRTY_LOG: {
		struct compat_kvm_dirty_log compat_log;
		struct kvm_dirty_log log;

		r = -EFAULT;
		if (copy_from_user(&compat_log, (void __user *)arg,
				   sizeof(compat_log)))
			goto out;
		log.slot = compat_log.slot;
		log.padding1 = compat_log.padding1;
		log.padding2 = compat_log.padding2;
		log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);

		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
		if (r)
			goto out;
		break;
	}
	default:
		r = kvm_vm_ioctl(filp, ioctl, arg);
	}

out:
	return r;
}
#endif

static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page[1];
	unsigned long addr;
	int npages;
	gfn_t gfn = vmf->pgoff;
	struct kvm *kvm = vma->vm_file->private_data;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return VM_FAULT_SIGBUS;

	npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
				NULL);
	if (unlikely(npages != 1))
		return VM_FAULT_SIGBUS;

	vmf->page = page[0];
	return 0;
}

static const struct vm_operations_struct kvm_vm_vm_ops = {
	.fault = kvm_vm_fault,
};

static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vm_vm_ops;
	return 0;
}

static struct file_operations kvm_vm_fops = {
	.release = kvm_vm_release,
	.unlocked_ioctl = kvm_vm_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = kvm_vm_compat_ioctl,
#endif
	.mmap = kvm_vm_mmap,
	.llseek = noop_llseek,
};

static int kvm_dev_ioctl_create_vm(unsigned long type)
{
	int r;
	struct kvm *kvm;

	kvm = kvm_create_vm(type);
	if (IS_ERR(kvm))
		return PTR_ERR(kvm);
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	r = kvm_coalesced_mmio_init(kvm);
	if (r < 0) {
		kvm_put_kvm(kvm);
		return r;
	}
#endif
	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
	if (r < 0)
		kvm_put_kvm(kvm);

	return r;
}

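/*
 * Minimal userspace sketch of how the ioctls dispatched in this file fit
 * together (assumptions: <fcntl.h>, <sys/ioctl.h>, <sys/mman.h> and
 * <linux/kvm.h> are included, memslot setup is omitted, and error checking
 * is reduced to the minimum):
 *
 *	int kvm = open("/dev/kvm", O_RDWR);
 *	if (ioctl(kvm, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
 *		return -1;
 *	int vm = ioctl(kvm, KVM_CREATE_VM, 0);      (kvm_dev_ioctl_create_vm)
 *	int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);   (kvm_vm_ioctl_create_vcpu)
 *	long sz = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, 0);
 *	struct kvm_run *run = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu, 0);
 *	ioctl(vcpu, KVM_RUN, 0);                    (kvm_vcpu_ioctl, KVM_RUN)
 */
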
static long kvm_dev_ioctl_check_extension_generic(long arg)
{
	switch (arg) {
	case KVM_CAP_USER_MEMORY:
	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
	case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
	case KVM_CAP_SET_BOOT_CPU_ID:
#endif
	case KVM_CAP_INTERNAL_ERROR_DATA:
#ifdef CONFIG_HAVE_KVM_MSI
	case KVM_CAP_SIGNAL_MSI:
#endif
		return 1;
#ifdef CONFIG_HAVE_KVM_IRQCHIP
	case KVM_CAP_IRQ_ROUTING:
		return KVM_MAX_IRQ_ROUTES;
#endif
	default:
		break;
	}
	return kvm_dev_ioctl_check_extension(arg);
}

static long kvm_dev_ioctl(struct file *filp,
			  unsigned int ioctl, unsigned long arg)
{
	long r = -EINVAL;

	switch (ioctl) {
	case KVM_GET_API_VERSION:
		r = -EINVAL;
		if (arg)
			goto out;
		r = KVM_API_VERSION;
		break;
	case KVM_CREATE_VM:
		r = kvm_dev_ioctl_create_vm(arg);
		break;
	case KVM_CHECK_EXTENSION:
		r = kvm_dev_ioctl_check_extension_generic(arg);
		break;
	case KVM_GET_VCPU_MMAP_SIZE:
		r = -EINVAL;
		if (arg)
			goto out;
		r = PAGE_SIZE;     /* struct kvm_run */
#ifdef CONFIG_X86
		r += PAGE_SIZE;    /* pio data page */
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
		r += PAGE_SIZE;    /* coalesced mmio ring page */
#endif
		break;
	case KVM_TRACE_ENABLE:
	case KVM_TRACE_PAUSE:
	case KVM_TRACE_DISABLE:
		r = -EOPNOTSUPP;
		break;
	default:
		return kvm_arch_dev_ioctl(filp, ioctl, arg);
	}
out:
	return r;
}

static struct file_operations kvm_chardev_ops = {
	.unlocked_ioctl = kvm_dev_ioctl,
	.compat_ioctl = kvm_dev_ioctl,
	.llseek = noop_llseek,
};

static struct miscdevice kvm_dev = {
	KVM_MINOR,
	"kvm",
	&kvm_chardev_ops,
};

static void hardware_enable_nolock(void *junk)
{
	int cpu = raw_smp_processor_id();
	int r;

	if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
		return;

	cpumask_set_cpu(cpu, cpus_hardware_enabled);

	r = kvm_arch_hardware_enable(NULL);

	if (r) {
		cpumask_clear_cpu(cpu, cpus_hardware_enabled);
		atomic_inc(&hardware_enable_failed);
		printk(KERN_INFO "kvm: enabling virtualization on "
				 "CPU%d failed\n", cpu);
	}
}

static void hardware_enable(void *junk)
{
	raw_spin_lock(&kvm_lock);
	hardware_enable_nolock(junk);
	raw_spin_unlock(&kvm_lock);
}

static void hardware_disable_nolock(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
		return;
	cpumask_clear_cpu(cpu, cpus_hardware_enabled);
	kvm_arch_hardware_disable(NULL);
}

static void hardware_disable(void *junk)
{
	raw_spin_lock(&kvm_lock);
	hardware_disable_nolock(junk);
	raw_spin_unlock(&kvm_lock);
}

static void hardware_disable_all_nolock(void)
{
	BUG_ON(!kvm_usage_count);

	kvm_usage_count--;
	if (!kvm_usage_count)
		on_each_cpu(hardware_disable_nolock, NULL, 1);
}

static void hardware_disable_all(void)
{
	raw_spin_lock(&kvm_lock);
	hardware_disable_all_nolock();
	raw_spin_unlock(&kvm_lock);
}

static int hardware_enable_all(void)
{
	int r = 0;

	raw_spin_lock(&kvm_lock);

	kvm_usage_count++;
	if (kvm_usage_count == 1) {
		atomic_set(&hardware_enable_failed, 0);
		on_each_cpu(hardware_enable_nolock, NULL, 1);

		if (atomic_read(&hardware_enable_failed)) {
			hardware_disable_all_nolock();
			r = -EBUSY;
		}
	}

	raw_spin_unlock(&kvm_lock);

	return r;
}

static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
			   void *v)
{
	int cpu = (long)v;

	if (!kvm_usage_count)
		return NOTIFY_OK;

	val &= ~CPU_TASKS_FROZEN;
	switch (val) {
	case CPU_DYING:
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
		hardware_disable(NULL);
		break;
	case CPU_STARTING:
		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
		       cpu);
		hardware_enable(NULL);
		break;
	}
	return NOTIFY_OK;
}


asmlinkage void kvm_spurious_fault(void)
{
	/* Fault while not rebooting.  We want the trace. */
	BUG();
}
EXPORT_SYMBOL_GPL(kvm_spurious_fault);

static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
		      void *v)
{
	/*
	 * Some BIOSes (well, at least mine) hang on reboot if the CPU is
	 * still in VMX root mode.
	 *
	 * Intel TXT also requires VMX to be off on all CPUs when the
	 * system shuts down.
	 */
	printk(KERN_INFO "kvm: exiting hardware virtualization\n");
	kvm_rebooting = true;
	on_each_cpu(hardware_disable_nolock, NULL, 1);
	return NOTIFY_OK;
}

static struct notifier_block kvm_reboot_notifier = {
	.notifier_call = kvm_reboot,
	.priority = 0,
};

static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
	int i;

	for (i = 0; i < bus->dev_count; i++) {
		struct kvm_io_device *pos = bus->range[i].dev;

		kvm_iodevice_destructor(pos);
	}
	kfree(bus);
}

int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
{
	const struct kvm_io_range *r1 = p1;
	const struct kvm_io_range *r2 = p2;

	if (r1->addr < r2->addr)
		return -1;
	if (r1->addr + r1->len > r2->addr + r2->len)
		return 1;
	return 0;
}

int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
			  gpa_t addr, int len)
{
	bus->range[bus->dev_count++] = (struct kvm_io_range) {
		.addr = addr,
		.len = len,
		.dev = dev,
	};

	sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range),
	     kvm_io_bus_sort_cmp, NULL);

	return 0;
}

int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
			     gpa_t addr, int len)
{
	struct kvm_io_range *range, key;
	int off;

	key = (struct kvm_io_range) {
		.addr = addr,
		.len = len,
	};

	range = bsearch(&key, bus->range, bus->dev_count,
			sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
	if (range == NULL)
		return -ENOENT;

	off = range - bus->range;

	while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0)
		off--;

	return off;
}

/* kvm_io_bus_write - called under kvm->slots_lock */
int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
		     int len, const void *val)
{
	int idx;
	struct kvm_io_bus *bus;
	struct kvm_io_range range;

	range = (struct kvm_io_range) {
		.addr = addr,
		.len = len,
	};

	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
	idx = kvm_io_bus_get_first_dev(bus, addr, len);
	if (idx < 0)
		return -EOPNOTSUPP;

	while (idx < bus->dev_count &&
	       kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
		if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val))
			return 0;
		idx++;
	}

	return -EOPNOTSUPP;
}

/* kvm_io_bus_read - called under kvm->slots_lock */
int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
		    int len, void *val)
{
	int idx;
	struct kvm_io_bus *bus;
	struct kvm_io_range range;

	range = (struct kvm_io_range) {
		.addr = addr,
		.len = len,
	};

	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
	idx = kvm_io_bus_get_first_dev(bus, addr, len);
	if (idx < 0)
		return -EOPNOTSUPP;

	while (idx < bus->dev_count &&
	       kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
		if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val))
			return 0;
		idx++;
	}

	return -EOPNOTSUPP;
}

/* Caller must hold slots_lock. */
int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
			    int len, struct kvm_io_device *dev)
{
	struct kvm_io_bus *new_bus, *bus;

	bus = kvm->buses[bus_idx];
	if (bus->dev_count > NR_IOBUS_DEVS - 1)
		return -ENOSPC;

	new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) *
			  sizeof(struct kvm_io_range)), GFP_KERNEL);
	if (!new_bus)
		return -ENOMEM;
	memcpy(new_bus, bus, sizeof(*bus) + (bus->dev_count *
	       sizeof(struct kvm_io_range)));
	kvm_io_bus_insert_dev(new_bus, dev, addr, len);
	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
	synchronize_srcu_expedited(&kvm->srcu);
	kfree(bus);

	return 0;
}

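/*
 * Illustrative sketch (assumptions, not code from this file): an in-kernel
 * device attaches to a bus by initializing a struct kvm_io_device with its
 * kvm_io_device_ops and registering it while holding slots_lock, e.g.:
 *
 *	static const struct kvm_io_device_ops my_dev_ops = {
 *		.read  = my_dev_read,
 *		.write = my_dev_write,
 *	};
 *
 *	kvm_iodevice_init(&my_dev->dev, &my_dev_ops);
 *	mutex_lock(&kvm->slots_lock);
 *	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, addr, len,
 *				      &my_dev->dev);
 *	mutex_unlock(&kvm->slots_lock);
 *
 * "my_dev", "my_dev_ops" and the read/write callbacks are hypothetical
 * names; the in-tree users of this API (ioeventfd, coalesced MMIO) follow
 * the same pattern.  Guest accesses are then routed to the device through
 * kvm_io_bus_read()/kvm_io_bus_write() above.
 */
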
/* Caller must hold slots_lock. */
int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
			      struct kvm_io_device *dev)
{
	int i, r;
	struct kvm_io_bus *new_bus, *bus;

	bus = kvm->buses[bus_idx];
	r = -ENOENT;
	for (i = 0; i < bus->dev_count; i++)
		if (bus->range[i].dev == dev) {
			r = 0;
			break;
		}

	if (r)
		return r;

	new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) *
			  sizeof(struct kvm_io_range)), GFP_KERNEL);
	if (!new_bus)
		return -ENOMEM;

	memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
	new_bus->dev_count--;
	memcpy(new_bus->range + i, bus->range + i + 1,
	       (new_bus->dev_count - i) * sizeof(struct kvm_io_range));

	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
	synchronize_srcu_expedited(&kvm->srcu);
	kfree(bus);
	return r;
}

static struct notifier_block kvm_cpu_notifier = {
	.notifier_call = kvm_cpu_hotplug,
};

static int vm_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;

	*val = 0;
	raw_spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		*val += *(u32 *)((void *)kvm + offset);
	raw_spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");

static int vcpu_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	int i;

	*val = 0;
	raw_spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		kvm_for_each_vcpu(i, vcpu, kvm)
			*val += *(u32 *)((void *)vcpu + offset);

	raw_spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");

static const struct file_operations *stat_fops[] = {
	[KVM_STAT_VCPU] = &vcpu_stat_fops,
	[KVM_STAT_VM]   = &vm_stat_fops,
};

static int kvm_init_debug(void)
{
	int r = -EFAULT;
	struct kvm_stats_debugfs_item *p;

	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
	if (kvm_debugfs_dir == NULL)
		goto out;

	for (p = debugfs_entries; p->name; ++p) {
		p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
						(void *)(long)p->offset,
						stat_fops[p->kind]);
		if (p->dentry == NULL)
			goto out_dir;
	}

	return 0;

out_dir:
	debugfs_remove_recursive(kvm_debugfs_dir);
out:
	return r;
}

static void kvm_exit_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	for (p = debugfs_entries; p->name; ++p)
		debugfs_remove(p->dentry);
	debugfs_remove(kvm_debugfs_dir);
}

static int kvm_suspend(void)
{
	if (kvm_usage_count)
		hardware_disable_nolock(NULL);
	return 0;
}

static void kvm_resume(void)
{
	if (kvm_usage_count) {
		WARN_ON(raw_spin_is_locked(&kvm_lock));
		hardware_enable_nolock(NULL);
	}
}

static struct syscore_ops kvm_syscore_ops = {
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

struct page *bad_page;
pfn_t bad_pfn;

static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
	return container_of(pn, struct kvm_vcpu, preempt_notifier);
}

static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
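	/*
	 * Preempt-notifier callback: the task backing this vCPU is being
	 * scheduled back in on @cpu, so restore its architecture state.
	 */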
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
			  struct task_struct *next)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_put(vcpu);
}

int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
	     struct module *module)
{
	int r;
	int cpu;

	r = kvm_arch_init(opaque);
	if (r)
		goto out_fail;

	bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (bad_page == NULL) {
		r = -ENOMEM;
		goto out;
	}

	bad_pfn = page_to_pfn(bad_page);

	hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (hwpoison_page == NULL) {
		r = -ENOMEM;
		goto out_free_0;
	}

	hwpoison_pfn = page_to_pfn(hwpoison_page);

	fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (fault_page == NULL) {
		r = -ENOMEM;
		goto out_free_0;
	}

	fault_pfn = page_to_pfn(fault_page);

	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
		r = -ENOMEM;
		goto out_free_0;
	}

	r = kvm_arch_hardware_setup();
	if (r < 0)
		goto out_free_0a;

	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu,
				kvm_arch_check_processor_compat,
				&r, 1);
		if (r < 0)
			goto out_free_1;
	}

	r = register_cpu_notifier(&kvm_cpu_notifier);
	if (r)
		goto out_free_2;
	register_reboot_notifier(&kvm_reboot_notifier);

	/* A kmem cache lets us meet the alignment requirements of fx_save. */
	if (!vcpu_align)
		vcpu_align = __alignof__(struct kvm_vcpu);
	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
					   0, NULL);
	if (!kvm_vcpu_cache) {
		r = -ENOMEM;
		goto out_free_3;
	}

	r = kvm_async_pf_init();
	if (r)
		goto out_free;

	kvm_chardev_ops.owner = module;
	kvm_vm_fops.owner = module;
	kvm_vcpu_fops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
		printk(KERN_ERR "kvm: misc device register failed\n");
		goto out_unreg;
	}

	register_syscore_ops(&kvm_syscore_ops);

	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

	r = kvm_init_debug();
	if (r) {
		printk(KERN_ERR "kvm: create debugfs files failed\n");
		goto out_undebugfs;
	}

	return 0;

out_undebugfs:
	unregister_syscore_ops(&kvm_syscore_ops);
out_unreg:
	kvm_async_pf_deinit();
out_free:
	kmem_cache_destroy(kvm_vcpu_cache);
out_free_3:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_2:
out_free_1:
	kvm_arch_hardware_unsetup();
out_free_0a:
	free_cpumask_var(cpus_hardware_enabled);
out_free_0:
	if (fault_page)
		__free_page(fault_page);
	if (hwpoison_page)
		__free_page(hwpoison_page);
	__free_page(bad_page);
out:
	kvm_arch_exit();
out_fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

void kvm_exit(void)
{
	kvm_exit_debug();
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	kvm_async_pf_deinit();
	unregister_syscore_ops(&kvm_syscore_ops);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
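	/* Turn hardware virtualization off on every CPU before arch teardown. */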
	on_each_cpu(hardware_disable_nolock, NULL, 1);
	kvm_arch_hardware_unsetup();
	kvm_arch_exit();
	free_cpumask_var(cpus_hardware_enabled);
	__free_page(fault_page);
	__free_page(hwpoison_page);
	__free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);
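
/*
 * Illustrative sketch (assumption, not code from this file): kvm_init() and
 * kvm_exit() are the entry points an architecture module is expected to
 * call from its own module_init()/module_exit(), roughly:
 *
 *	static int __init my_arch_kvm_init(void)
 *	{
 *		return kvm_init(&my_arch_ops, sizeof(struct my_arch_vcpu),
 *				__alignof__(struct my_arch_vcpu), THIS_MODULE);
 *	}
 *
 *	static void __exit my_arch_kvm_exit(void)
 *	{
 *		kvm_exit();
 *	}
 *
 *	module_init(my_arch_kvm_init);
 *	module_exit(my_arch_kvm_exit);
 *
 * "my_arch_ops" and "struct my_arch_vcpu" are hypothetical placeholders for
 * whatever the architecture passes as the opaque pointer and vCPU type
 * (kvm-intel, for example, passes its VMX ops and struct vcpu_vmx).
 */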