/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "iodev.h"

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/sysdev.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>

#include <asm/processor.h>
#include <asm/io.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm-generic/bitops/le.h>

#include "coalesced_mmio.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/*
 * Ordering of locks:
 *
 *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

DEFINE_SPINLOCK(kvm_lock);
LIST_HEAD(vm_list);

static cpumask_var_t cpus_hardware_enabled;
static int kvm_usage_count;
static atomic_t hardware_enable_failed;

struct kmem_cache *kvm_vcpu_cache;
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);

static __read_mostly struct preempt_ops kvm_preempt_ops;

struct dentry *kvm_debugfs_dir;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

static bool kvm_rebooting;

static bool largepages_enabled = true;

static struct page *hwpoison_page;
static pfn_t hwpoison_pfn;

static struct page *fault_page;
static pfn_t fault_pfn;

inline int kvm_is_mmio_pfn(pfn_t pfn)
{
	if (pfn_valid(pfn)) {
		struct page *page = compound_head(pfn_to_page(pfn));
		return PageReserved(page);
	}

	return true;
}

/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu;

	mutex_lock(&vcpu->mutex);
	cpu = get_cpu();
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
}

void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	preempt_enable();
	mutex_unlock(&vcpu->mutex);
}

static void ack_flush(void *_completed)
{
}

static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
	int i, cpu, me;
	cpumask_var_t cpus;
	bool called = true;
	struct kvm_vcpu *vcpu;

	zalloc_cpumask_var(&cpus, GFP_ATOMIC);

	raw_spin_lock(&kvm->requests_lock);
	me = smp_processor_id();
	kvm_for_each_vcpu(i, vcpu, kvm) {
		if (kvm_make_check_request(req, vcpu))
			continue;
		cpu = vcpu->cpu;
		if (cpus != NULL && cpu != -1 && cpu != me)
			cpumask_set_cpu(cpu, cpus);
	}
	if (unlikely(cpus == NULL))
		smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
	else if (!cpumask_empty(cpus))
		smp_call_function_many(cpus, ack_flush, NULL, 1);
	else
		called = false;
	raw_spin_unlock(&kvm->requests_lock);
	free_cpumask_var(cpus);
	return called;
}

void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		++kvm->stat.remote_tlb_flush;
}

void kvm_reload_remote_mmus(struct kvm *kvm)
{
	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
}

int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	struct page *page;
	int r;

	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
	init_waitqueue_head(&vcpu->wq);

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		r = -ENOMEM;
		goto fail;
	}
	vcpu->run = page_address(page);

	r = kvm_arch_vcpu_init(vcpu);
	if (r < 0)
		goto fail_free_run;
	return 0;

fail_free_run:
	free_page((unsigned long)vcpu->run);
fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_init);

void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
{
	kvm_arch_vcpu_uninit(vcpu);
	free_page((unsigned long)vcpu->run);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
	return container_of(mn, struct kvm, mmu_notifier);
}

static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
					     struct mm_struct *mm,
					     unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush, idx;

	/*
	 * When ->invalidate_page runs, the linux pte has been zapped
	 * already but the page is still allocated until
	 * ->invalidate_page returns.  So if we increase the sequence
	 * here the kvm page fault will notice if the spte can't be
	 * established because the page is going to be freed.  If
	 * instead the kvm page fault establishes the spte before
	 * ->invalidate_page runs, kvm_unmap_hva will release it
	 * before returning.
	 *
	 * The sequence increase only needs to be seen at spin_unlock
	 * time, and not at spin_lock time.
	 *
	 * Increasing the sequence after the spin_unlock would be
	 * unsafe because the kvm page fault could then establish the
	 * pte after kvm_unmap_hva returned, without noticing the page
	 * is going to be freed.
	 */
	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	kvm->mmu_notifier_seq++;
	need_tlb_flush = kvm_unmap_hva(kvm, address);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	/* we have to flush the tlb before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);
}

static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long address,
					pte_t pte)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	kvm->mmu_notifier_seq++;
	kvm_set_spte_hva(kvm, address, pte);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
						    struct mm_struct *mm,
						    unsigned long start,
						    unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush = 0, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
	kvm->mmu_notifier_count++;
	for (; start < end; start += PAGE_SIZE)
		need_tlb_flush |= kvm_unmap_hva(kvm, start);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	/* we have to flush the tlb before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);
}

static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
						  struct mm_struct *mm,
						  unsigned long start,
						  unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);

	spin_lock(&kvm->mmu_lock);
	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
	kvm->mmu_notifier_seq++;
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease but both values are read by the kvm
	 * page fault under the mmu_lock spinlock so we don't need to
	 * add a smp_wmb() here in between the two.
	 */
	kvm->mmu_notifier_count--;
	spin_unlock(&kvm->mmu_lock);

	BUG_ON(kvm->mmu_notifier_count < 0);
}

static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	young = kvm_age_hva(kvm, address);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	if (young)
		kvm_flush_remote_tlbs(kvm);

	return young;
}

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	kvm_arch_flush_shadow(kvm);
	srcu_read_unlock(&kvm->srcu, idx);
}

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
	.change_pte		= kvm_mmu_notifier_change_pte,
	.release		= kvm_mmu_notifier_release,
};

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}

#else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	return 0;
}

#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */

static struct kvm *kvm_create_vm(void)
{
	int r = 0, i;
	struct kvm *kvm = kvm_arch_create_vm();

	if (IS_ERR(kvm))
		goto out;

	r = hardware_enable_all();
	if (r)
		goto out_err_nodisable;

#ifdef CONFIG_HAVE_KVM_IRQCHIP
	INIT_HLIST_HEAD(&kvm->mask_notifier_list);
	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif

	r = -ENOMEM;
	kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
	if (!kvm->memslots)
		goto out_err;
	if (init_srcu_struct(&kvm->srcu))
		goto out_err;
	for (i = 0; i < KVM_NR_BUSES; i++) {
		kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
					GFP_KERNEL);
		if (!kvm->buses[i]) {
			cleanup_srcu_struct(&kvm->srcu);
			goto out_err;
		}
	}

	r = kvm_init_mmu_notifier(kvm);
	if (r) {
		cleanup_srcu_struct(&kvm->srcu);
		goto out_err;
	}

	kvm->mm = current->mm;
	atomic_inc(&kvm->mm->mm_count);
	spin_lock_init(&kvm->mmu_lock);
	raw_spin_lock_init(&kvm->requests_lock);
	kvm_eventfd_init(kvm);
	mutex_init(&kvm->lock);
	mutex_init(&kvm->irq_lock);
	mutex_init(&kvm->slots_lock);
	atomic_set(&kvm->users_count, 1);
	spin_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	spin_unlock(&kvm_lock);
out:
	return kvm;

out_err:
	hardware_disable_all();
out_err_nodisable:
	for (i = 0; i < KVM_NR_BUSES; i++)
		kfree(kvm->buses[i]);
	kfree(kvm->memslots);
	kfree(kvm);
	return ERR_PTR(r);
}

/*
 * Free any memory in @free but not in @dont.
 */
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
				  struct kvm_memory_slot *dont)
{
	int i;

	if (!dont || free->rmap != dont->rmap)
		vfree(free->rmap);

	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
		vfree(free->dirty_bitmap);

	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
		if (!dont || free->lpage_info[i] != dont->lpage_info[i]) {
			vfree(free->lpage_info[i]);
			free->lpage_info[i] = NULL;
		}
	}

	free->npages = 0;
	free->dirty_bitmap = NULL;
	free->rmap = NULL;
}

void kvm_free_physmem(struct kvm *kvm)
{
	int i;
	struct kvm_memslots *slots = kvm->memslots;

	for (i = 0; i < slots->nmemslots; ++i)
		kvm_free_physmem_slot(&slots->memslots[i], NULL);

	kfree(kvm->memslots);
}

static void kvm_destroy_vm(struct kvm *kvm)
{
	int i;
	struct mm_struct *mm = kvm->mm;

	kvm_arch_sync_events(kvm);
	spin_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	spin_unlock(&kvm_lock);
	kvm_free_irq_routing(kvm);
	for (i = 0; i < KVM_NR_BUSES; i++)
		kvm_io_bus_destroy(kvm->buses[i]);
	kvm_coalesced_mmio_free(kvm);
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
#else
	kvm_arch_flush_shadow(kvm);
#endif
	kvm_arch_destroy_vm(kvm);
	hardware_disable_all();
	mmdrop(mm);
}

void kvm_get_kvm(struct kvm *kvm)
{
	atomic_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);

void kvm_put_kvm(struct kvm *kvm)
{
	if (atomic_dec_and_test(&kvm->users_count))
		kvm_destroy_vm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);

static int kvm_vm_release(struct inode *inode, struct file *filp)
{
	struct kvm *kvm = filp->private_data;

	kvm_irqfd_release(kvm);

	kvm_put_kvm(kvm);
	return 0;
}

/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
 *
 * Must be called holding mmap_sem for write.
 */
int __kvm_set_memory_region(struct kvm *kvm,
			    struct kvm_userspace_memory_region *mem,
			    int user_alloc)
{
	int r, flush_shadow = 0;
	gfn_t base_gfn;
	unsigned long npages;
	unsigned long i;
	struct kvm_memory_slot *memslot;
	struct kvm_memory_slot old, new;
	struct kvm_memslots *slots, *old_memslots;

	r = -EINVAL;
	/* General sanity checks */
	if (mem->memory_size & (PAGE_SIZE - 1))
		goto out;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		goto out;
	if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1)))
		goto out;
	if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
		goto out;
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
		goto out;

	memslot = &kvm->memslots->memslots[mem->slot];
	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
	npages = mem->memory_size >> PAGE_SHIFT;

	r = -EINVAL;
	if (npages > KVM_MEM_MAX_NR_PAGES)
		goto out;

	if (!npages)
		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;

	new = old = *memslot;

	new.id = mem->slot;
	new.base_gfn = base_gfn;
	new.npages = npages;
	new.flags = mem->flags;

	/* Disallow changing a memory slot's size. */
	r = -EINVAL;
	if (npages && old.npages && npages != old.npages)
		goto out_free;

	/* Check for overlaps */
	r = -EEXIST;
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *s = &kvm->memslots->memslots[i];

		if (s == memslot || !s->npages)
			continue;
		if (!((base_gfn + npages <= s->base_gfn) ||
		      (base_gfn >= s->base_gfn + s->npages)))
			goto out_free;
	}

	/* Free page dirty bitmap if unneeded */
	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
		new.dirty_bitmap = NULL;

	r = -ENOMEM;

	/* Allocate if a slot is being created */
#ifndef CONFIG_S390
	if (npages && !new.rmap) {
		new.rmap = vmalloc(npages * sizeof(*new.rmap));

		if (!new.rmap)
			goto out_free;

		memset(new.rmap, 0, npages * sizeof(*new.rmap));

		new.user_alloc = user_alloc;
		new.userspace_addr = mem->userspace_addr;
	}
	if (!npages)
		goto skip_lpage;

	for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) {
		unsigned long ugfn;
		unsigned long j;
		int lpages;
		int level = i + 2;

		/* Avoid unused variable warning if no large pages */
		(void)level;

		if (new.lpage_info[i])
			continue;

		lpages = 1 + ((base_gfn + npages - 1)
			     >> KVM_HPAGE_GFN_SHIFT(level));
		lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);

		new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i]));

		if (!new.lpage_info[i])
			goto out_free;

		memset(new.lpage_info[i], 0,
		       lpages * sizeof(*new.lpage_info[i]));

		if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
			new.lpage_info[i][0].write_count = 1;
		if ((base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
			new.lpage_info[i][lpages - 1].write_count = 1;
		ugfn = new.userspace_addr >> PAGE_SHIFT;
		/*
		 * If the gfn and userspace address are not aligned wrt each
		 * other, or if explicitly asked to, disable large page
		 * support for this slot
		 */
		if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
		    !largepages_enabled)
			for (j = 0; j < lpages; ++j)
				new.lpage_info[i][j].write_count = 1;
	}

skip_lpage:

	/* Allocate page dirty bitmap if needed */
	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
		unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(&new);

		new.dirty_bitmap = vmalloc(dirty_bytes);
		if (!new.dirty_bitmap)
			goto out_free;
		memset(new.dirty_bitmap, 0, dirty_bytes);
		/* destroy any largepage mappings for dirty tracking */
		if (old.npages)
			flush_shadow = 1;
	}
#else  /* not defined CONFIG_S390 */
	new.user_alloc = user_alloc;
	if (user_alloc)
		new.userspace_addr = mem->userspace_addr;
#endif /* not defined CONFIG_S390 */

	if (!npages) {
		r = -ENOMEM;
		slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
		if (!slots)
			goto out_free;
		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
		if (mem->slot >= slots->nmemslots)
			slots->nmemslots = mem->slot + 1;
		slots->memslots[mem->slot].flags |= KVM_MEMSLOT_INVALID;

		old_memslots = kvm->memslots;
		rcu_assign_pointer(kvm->memslots, slots);
		synchronize_srcu_expedited(&kvm->srcu);
		/* From this point no new shadow pages pointing to a deleted
		 * memslot will be created.
		 *
		 * validation of sp->gfn happens in:
		 * 	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)
		 * 	- kvm_is_visible_gfn (mmu_check_roots)
		 */
		kvm_arch_flush_shadow(kvm);
		kfree(old_memslots);
	}

	r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc);
	if (r)
		goto out_free;

	/* map the pages in iommu page table */
	if (npages) {
		r = kvm_iommu_map_pages(kvm, &new);
		if (r)
			goto out_free;
	}

	r = -ENOMEM;
	slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
	if (!slots)
		goto out_free;
	memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
	if (mem->slot >= slots->nmemslots)
		slots->nmemslots = mem->slot + 1;

	/* actual memory is freed via old in kvm_free_physmem_slot below */
	if (!npages) {
		new.rmap = NULL;
		new.dirty_bitmap = NULL;
		for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i)
			new.lpage_info[i] = NULL;
	}

	slots->memslots[mem->slot] = new;
	old_memslots = kvm->memslots;
	rcu_assign_pointer(kvm->memslots, slots);
	synchronize_srcu_expedited(&kvm->srcu);

	kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);

	kvm_free_physmem_slot(&old, &new);
	kfree(old_memslots);

	if (flush_shadow)
		kvm_arch_flush_shadow(kvm);

	return 0;

out_free:
	kvm_free_physmem_slot(&new, &old);
out:
	return r;
}
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);

int kvm_set_memory_region(struct kvm *kvm,
			  struct kvm_userspace_memory_region *mem,
			  int user_alloc)
{
	int r;

	mutex_lock(&kvm->slots_lock);
	r = __kvm_set_memory_region(kvm, mem, user_alloc);
	mutex_unlock(&kvm->slots_lock);
	return r;
}
EXPORT_SYMBOL_GPL(kvm_set_memory_region);

int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
				   struct kvm_userspace_memory_region *mem,
				   int user_alloc)
{
	if (mem->slot >= KVM_MEMORY_SLOTS)
		return -EINVAL;
	return kvm_set_memory_region(kvm, mem, user_alloc);
}

int kvm_get_dirty_log(struct kvm *kvm,
		      struct kvm_dirty_log *log, int *is_dirty)
{
	struct kvm_memory_slot *memslot;
	int r, i;
	unsigned long n;
	unsigned long any = 0;

	r = -EINVAL;
	if (log->slot >= KVM_MEMORY_SLOTS)
		goto out;

	memslot = &kvm->memslots->memslots[log->slot];
	r = -ENOENT;
	if (!memslot->dirty_bitmap)
		goto out;

	n = kvm_dirty_bitmap_bytes(memslot);

	for (i = 0; !any && i < n/sizeof(long); ++i)
		any = memslot->dirty_bitmap[i];

	r = -EFAULT;
	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
		goto out;

	if (any)
		*is_dirty = 1;

	r = 0;
out:
	return r;
}

void kvm_disable_largepages(void)
{
	largepages_enabled = false;
}
EXPORT_SYMBOL_GPL(kvm_disable_largepages);

int is_error_page(struct page *page)
{
	return page == bad_page || page == hwpoison_page || page == fault_page;
}
EXPORT_SYMBOL_GPL(is_error_page);

int is_error_pfn(pfn_t pfn)
{
	return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
}
EXPORT_SYMBOL_GPL(is_error_pfn);

int is_hwpoison_pfn(pfn_t pfn)
{
	return pfn == hwpoison_pfn;
}
EXPORT_SYMBOL_GPL(is_hwpoison_pfn);

int is_fault_pfn(pfn_t pfn)
{
	return pfn == fault_pfn;
}
EXPORT_SYMBOL_GPL(is_fault_pfn);

static inline unsigned long bad_hva(void)
{
	return PAGE_OFFSET;
}

int kvm_is_error_hva(unsigned long addr)
{
	return addr == bad_hva();
}
EXPORT_SYMBOL_GPL(kvm_is_error_hva);

struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	int i;
	struct kvm_memslots *slots = kvm_memslots(kvm);

	for (i = 0; i < slots->nmemslots; ++i) {
		struct kvm_memory_slot *memslot = &slots->memslots[i];

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages)
			return memslot;
	}
	return NULL;
}
EXPORT_SYMBOL_GPL(gfn_to_memslot);

int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
	int i;
	struct kvm_memslots *slots = kvm_memslots(kvm);

	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *memslot = &slots->memslots[i];

		if (memslot->flags & KVM_MEMSLOT_INVALID)
			continue;

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages)
			return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);

unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
{
	struct vm_area_struct *vma;
	unsigned long addr, size;

	size = PAGE_SIZE;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return PAGE_SIZE;

	down_read(&current->mm->mmap_sem);
	vma = find_vma(current->mm, addr);
	if (!vma)
		goto out;

	size = vma_kernel_pagesize(vma);

out:
	up_read(&current->mm->mmap_sem);

	return size;
}

int memslot_id(struct kvm *kvm, gfn_t gfn)
{
	int i;
	struct kvm_memslots *slots = kvm_memslots(kvm);
	struct kvm_memory_slot *memslot = NULL;

	for (i = 0; i < slots->nmemslots; ++i) {
		memslot = &slots->memslots[i];

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages)
			break;
	}

	return memslot - slots->memslots;
}

static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn,
				     gfn_t *nr_pages)
{
	struct kvm_memory_slot *slot;

	slot = gfn_to_memslot(kvm, gfn);
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return bad_hva();

	if (nr_pages)
		*nr_pages = slot->npages - (gfn - slot->base_gfn);

	return gfn_to_hva_memslot(slot, gfn);
}

unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
	return gfn_to_hva_many(kvm, gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);

static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic)
{
	struct page *page[1];
	int npages;
	pfn_t pfn;

	if (atomic)
		npages = __get_user_pages_fast(addr, 1, 1, page);
	else {
		might_sleep();
		npages = get_user_pages_fast(addr, 1, 1, page);
	}

	if (unlikely(npages != 1)) {
		struct vm_area_struct *vma;

		if (atomic)
			goto return_fault_page;

		down_read(&current->mm->mmap_sem);
		if (is_hwpoison_address(addr)) {
			up_read(&current->mm->mmap_sem);
			get_page(hwpoison_page);
			return page_to_pfn(hwpoison_page);
		}

		vma = find_vma(current->mm, addr);

		if (vma == NULL || addr < vma->vm_start ||
		    !(vma->vm_flags & VM_PFNMAP)) {
			up_read(&current->mm->mmap_sem);
return_fault_page:
			get_page(fault_page);
			return page_to_pfn(fault_page);
		}

		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
		up_read(&current->mm->mmap_sem);
		BUG_ON(!kvm_is_mmio_pfn(pfn));
	} else
		pfn = page_to_pfn(page[0]);

	return pfn;
}

pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
{
	return hva_to_pfn(kvm, addr, true);
}
EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);

static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic)
{
	unsigned long addr;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr)) {
		get_page(bad_page);
		return page_to_pfn(bad_page);
	}

	return hva_to_pfn(kvm, addr, atomic);
}

pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
{
	return __gfn_to_pfn(kvm, gfn, true);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);

pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
{
	return __gfn_to_pfn(kvm, gfn, false);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn);

pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
			 struct kvm_memory_slot *slot, gfn_t gfn)
{
	unsigned long addr = gfn_to_hva_memslot(slot, gfn);
	return hva_to_pfn(kvm, addr, false);
}

int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
			    int nr_pages)
{
	unsigned long addr;
	gfn_t entry;

	addr = gfn_to_hva_many(kvm, gfn, &entry);
	if (kvm_is_error_hva(addr))
		return -1;

	if (entry < nr_pages)
		return 0;

	return __get_user_pages_fast(addr, nr_pages, 1, pages);
}
EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);

struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
	pfn_t pfn;

	pfn = gfn_to_pfn(kvm, gfn);
	if (!kvm_is_mmio_pfn(pfn))
		return pfn_to_page(pfn);

	WARN_ON(kvm_is_mmio_pfn(pfn));

	get_page(bad_page);
	return bad_page;
}
EXPORT_SYMBOL_GPL(gfn_to_page);

void kvm_release_page_clean(struct page *page)
{
	kvm_release_pfn_clean(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);

void kvm_release_pfn_clean(pfn_t pfn)
{
	if (!kvm_is_mmio_pfn(pfn))
		put_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);

void kvm_release_page_dirty(struct page *page)
{
	kvm_release_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);

void kvm_release_pfn_dirty(pfn_t pfn)
{
	kvm_set_pfn_dirty(pfn);
	kvm_release_pfn_clean(pfn);
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);

void kvm_set_page_dirty(struct page *page)
{
	kvm_set_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_set_page_dirty);

void kvm_set_pfn_dirty(pfn_t pfn)
{
	if (!kvm_is_mmio_pfn(pfn)) {
		struct page *page = pfn_to_page(pfn);
		if (!PageReserved(page))
			SetPageDirty(page);
	}
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);

void kvm_set_pfn_accessed(pfn_t pfn)
{
	if (!kvm_is_mmio_pfn(pfn))
		mark_page_accessed(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);

void kvm_get_pfn(pfn_t pfn)
{
	if (!kvm_is_mmio_pfn(pfn))
		get_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_get_pfn);

static int next_segment(unsigned long len, int offset)
{
	if (len > PAGE_SIZE - offset)
		return PAGE_SIZE - offset;
	else
		return len;
}

int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
			int len)
{
	int r;
	unsigned long addr;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	r = copy_from_user(data, (void __user *)addr + offset, len);
	if (r)
		return -EFAULT;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page);

int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		data += seg;
		++gfn;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest);

int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
			  unsigned long len)
{
	int r;
	unsigned long addr;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int offset = offset_in_page(gpa);

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	pagefault_disable();
	r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
	pagefault_enable();
	if (r)
		return -EFAULT;
	return 0;
}
EXPORT_SYMBOL(kvm_read_guest_atomic);

int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
			 int offset, int len)
{
	int r;
	unsigned long addr;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	r = copy_to_user((void __user *)addr + offset, data, len);
	if (r)
		return -EFAULT;
	mark_page_dirty(kvm, gfn);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_guest_page);

int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
		    unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		data += seg;
		++gfn;
	}
	return 0;
}

int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
{
	return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_clear_guest_page);

int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		++gfn;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_clear_guest);

void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *memslot;

	memslot = gfn_to_memslot(kvm, gfn);
	if (memslot && memslot->dirty_bitmap) {
		unsigned long rel_gfn = gfn - memslot->base_gfn;

		generic___set_le_bit(rel_gfn, memslot->dirty_bitmap);
	}
}

/*
 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
 */
void kvm_vcpu_block(struct kvm_vcpu *vcpu)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);

		if (kvm_arch_vcpu_runnable(vcpu)) {
			kvm_make_request(KVM_REQ_UNHALT, vcpu);
			break;
		}
		if (kvm_cpu_has_pending_timer(vcpu))
			break;
		if (signal_pending(current))
			break;

		schedule();
	}

	finish_wait(&vcpu->wq, &wait);
}

void kvm_resched(struct kvm_vcpu *vcpu)
{
	if (!need_resched())
		return;
	cond_resched();
}
EXPORT_SYMBOL_GPL(kvm_resched);

void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu)
{
	ktime_t expires;
	DEFINE_WAIT(wait);

	prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);

	/* Sleep for 100 us, and hope lock-holder got scheduled */
	expires = ktime_add_ns(ktime_get(), 100000UL);
	schedule_hrtimeout(&expires, HRTIMER_MODE_ABS);

	finish_wait(&vcpu->wq, &wait);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);

static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct kvm_vcpu *vcpu = vma->vm_file->private_data;
	struct page *page;

	if (vmf->pgoff == 0)
		page = virt_to_page(vcpu->run);
#ifdef CONFIG_X86
	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
		page = virt_to_page(vcpu->arch.pio_data);
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
#endif
	else
		return VM_FAULT_SIGBUS;
	get_page(page);
	vmf->page = page;
	return 0;
}

static const struct vm_operations_struct kvm_vcpu_vm_ops = {
	.fault = kvm_vcpu_fault,
};

static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vcpu_vm_ops;
	return 0;
}

static int kvm_vcpu_release(struct inode *inode, struct file *filp)
{
	struct kvm_vcpu *vcpu = filp->private_data;

	kvm_put_kvm(vcpu->kvm);
	return 0;
}

static struct file_operations kvm_vcpu_fops = {
	.release        = kvm_vcpu_release,
	.unlocked_ioctl = kvm_vcpu_ioctl,
	.compat_ioctl   = kvm_vcpu_ioctl,
	.mmap           = kvm_vcpu_mmap,
	.llseek		= noop_llseek,
};

/*
 * Allocates an inode for the vcpu.
 */
static int create_vcpu_fd(struct kvm_vcpu *vcpu)
{
	return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
}

/*
 * Creates some virtual cpus.  Good luck creating more than one.
 */
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
{
	int r;
	struct kvm_vcpu *vcpu, *v;

	vcpu = kvm_arch_vcpu_create(kvm, id);
	if (IS_ERR(vcpu))
		return PTR_ERR(vcpu);

	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);

	r = kvm_arch_vcpu_setup(vcpu);
	if (r)
		return r;

	mutex_lock(&kvm->lock);
	if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
		r = -EINVAL;
		goto vcpu_destroy;
	}

	kvm_for_each_vcpu(r, v, kvm)
		if (v->vcpu_id == id) {
			r = -EEXIST;
			goto vcpu_destroy;
		}

	BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);

	/* Now it's all set up, let userspace reach it */
	kvm_get_kvm(kvm);
	r = create_vcpu_fd(vcpu);
	if (r < 0) {
		kvm_put_kvm(kvm);
		goto vcpu_destroy;
	}

	kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
	smp_wmb();
	atomic_inc(&kvm->online_vcpus);

#ifdef CONFIG_KVM_APIC_ARCHITECTURE
	if (kvm->bsp_vcpu_id == id)
		kvm->bsp_vcpu = vcpu;
#endif
	mutex_unlock(&kvm->lock);
	return r;

vcpu_destroy:
	mutex_unlock(&kvm->lock);
	kvm_arch_vcpu_destroy(vcpu);
	return r;
}

static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
{
	if (sigset) {
		sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
		vcpu->sigset_active = 1;
		vcpu->sigset = *sigset;
	} else
		vcpu->sigset_active = 0;
	return 0;
}

static long kvm_vcpu_ioctl(struct file *filp,
			   unsigned int ioctl, unsigned long arg)
{
	struct kvm_vcpu *vcpu = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;
	struct kvm_fpu *fpu = NULL;
	struct kvm_sregs *kvm_sregs = NULL;

	if (vcpu->kvm->mm != current->mm)
		return -EIO;

#if defined(CONFIG_S390) || defined(CONFIG_PPC)
	/*
	 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
	 * so vcpu_load() would break it.
	 */
	if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
		return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
#endif

	vcpu_load(vcpu);
	switch (ioctl) {
	case KVM_RUN:
		r = -EINVAL;
		if (arg)
			goto out;
		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
		break;
	case KVM_GET_REGS: {
		struct kvm_regs *kvm_regs;

		r = -ENOMEM;
		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
		if (!kvm_regs)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
		if (r)
			goto out_free1;
		r = -EFAULT;
		if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
			goto out_free1;
		r = 0;
out_free1:
		kfree(kvm_regs);
		break;
	}
	case KVM_SET_REGS: {
		struct kvm_regs *kvm_regs;

		r = -ENOMEM;
		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
		if (!kvm_regs)
			goto out;
		r = -EFAULT;
		if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs)))
			goto out_free2;
		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
		if (r)
			goto out_free2;
		r = 0;
out_free2:
		kfree(kvm_regs);
		break;
	}
	case KVM_GET_SREGS: {
		kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
		r = -ENOMEM;
		if (!kvm_sregs)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_SREGS: {
		kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
		r = -ENOMEM;
		if (!kvm_sregs)
			goto out;
		r = -EFAULT;
		if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs)))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_GET_MP_STATE: {
		struct kvm_mp_state mp_state;

		r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &mp_state, sizeof mp_state))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_MP_STATE: {
		struct kvm_mp_state mp_state;

		r = -EFAULT;
		if (copy_from_user(&mp_state, argp, sizeof mp_state))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_TRANSLATE: {
		struct kvm_translation tr;

		r = -EFAULT;
		if (copy_from_user(&tr, argp, sizeof tr))
			goto out;
		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &tr, sizeof tr))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_GUEST_DEBUG: {
		struct kvm_guest_debug dbg;

		r = -EFAULT;
		if (copy_from_user(&dbg, argp, sizeof dbg))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_SIGNAL_MASK: {
		struct kvm_signal_mask __user *sigmask_arg = argp;
		struct kvm_signal_mask kvm_sigmask;
		sigset_t sigset, *p;

		p = NULL;
		if (argp) {
			r = -EFAULT;
			if (copy_from_user(&kvm_sigmask, argp,
					   sizeof kvm_sigmask))
				goto out;
			r = -EINVAL;
			if (kvm_sigmask.len != sizeof sigset)
				goto out;
			r = -EFAULT;
			if (copy_from_user(&sigset, sigmask_arg->sigset,
					   sizeof sigset))
				goto out;
			p = &sigset;
		}
		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
		break;
	}
	case KVM_GET_FPU: {
		fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
		r = -ENOMEM;
		if (!fpu)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_FPU: {
		fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
		r = -ENOMEM;
		if (!fpu)
			goto out;
		r = -EFAULT;
		if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu)))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
		if (r)
			goto out;
		r = 0;
		break;
	}
	default:
		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
	}
out:
	vcpu_put(vcpu);
	kfree(fpu);
	kfree(kvm_sregs);
	return r;
}

static long kvm_vm_ioctl(struct file *filp,
			 unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;

	if (kvm->mm != current->mm)
		return -EIO;
	switch (ioctl) {
	case KVM_CREATE_VCPU:
		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
		if (r < 0)
			goto out;
		break;
	case KVM_SET_USER_MEMORY_REGION: {
		struct kvm_userspace_memory_region kvm_userspace_mem;

		r = -EFAULT;
		if (copy_from_user(&kvm_userspace_mem, argp,
				   sizeof kvm_userspace_mem))
			goto out;

		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
		if (r)
			goto out;
		break;
	}
	case KVM_GET_DIRTY_LOG: {
		struct kvm_dirty_log log;

		r = -EFAULT;
		if (copy_from_user(&log, argp, sizeof log))
			goto out;
		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
		if (r)
			goto out;
		break;
	}
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	case KVM_REGISTER_COALESCED_MMIO: {
		struct kvm_coalesced_mmio_zone zone;
		r = -EFAULT;
		if (copy_from_user(&zone, argp, sizeof zone))
			goto out;
		r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_UNREGISTER_COALESCED_MMIO: {
		struct kvm_coalesced_mmio_zone zone;
		r = -EFAULT;
		if (copy_from_user(&zone, argp, sizeof zone))
			goto out;
		r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
		if (r)
			goto out;
		r = 0;
		break;
	}
#endif
	case KVM_IRQFD: {
		struct kvm_irqfd data;

		r = -EFAULT;
		if (copy_from_user(&data, argp, sizeof data))
			goto out;
		r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
		break;
	}
	case KVM_IOEVENTFD: {
		struct kvm_ioeventfd data;

		r = -EFAULT;
		if (copy_from_user(&data, argp, sizeof data))
			goto out;
		r = kvm_ioeventfd(kvm, &data);
		break;
	}
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
	case KVM_SET_BOOT_CPU_ID:
		r = 0;
		mutex_lock(&kvm->lock);
		if (atomic_read(&kvm->online_vcpus) != 0)
			r = -EBUSY;
		else
			kvm->bsp_vcpu_id = arg;
		mutex_unlock(&kvm->lock);
		break;
#endif
	default:
		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
		if (r == -ENOTTY)
			r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
	}
out:
	return r;
}

#ifdef CONFIG_COMPAT
struct compat_kvm_dirty_log {
	__u32 slot;
	__u32 padding1;
	union {
		compat_uptr_t dirty_bitmap; /* one bit per page */
		__u64 padding2;
	};
};

static long kvm_vm_compat_ioctl(struct file *filp,
				unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	int r;

	if (kvm->mm != current->mm)
		return -EIO;
	switch (ioctl) {
	case KVM_GET_DIRTY_LOG: {
		struct compat_kvm_dirty_log compat_log;
		struct kvm_dirty_log log;

		r = -EFAULT;
		if (copy_from_user(&compat_log, (void __user *)arg,
				   sizeof(compat_log)))
			goto out;
		log.slot = compat_log.slot;
		log.padding1 = compat_log.padding1;
		log.padding2 = compat_log.padding2;
		log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);

		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
		if (r)
			goto out;
		break;
	}
	default:
		r = kvm_vm_ioctl(filp, ioctl, arg);
	}

out:
	return r;
}
#endif

static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page[1];
	unsigned long addr;
	int npages;
	gfn_t gfn = vmf->pgoff;
	struct kvm *kvm = vma->vm_file->private_data;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return VM_FAULT_SIGBUS;

	npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
				NULL);
	if (unlikely(npages != 1))
		return VM_FAULT_SIGBUS;

	vmf->page = page[0];
	return 0;
}

static const struct vm_operations_struct kvm_vm_vm_ops = {
	.fault = kvm_vm_fault,
};

static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vm_vm_ops;
	return 0;
}

static struct file_operations kvm_vm_fops = {
	.release        = kvm_vm_release,
	.unlocked_ioctl = kvm_vm_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = kvm_vm_compat_ioctl,
#endif
	.mmap           = kvm_vm_mmap,
	.llseek		= noop_llseek,
};

static int kvm_dev_ioctl_create_vm(void)
{
	int fd, r;
	struct kvm *kvm;

	kvm = kvm_create_vm();
	if (IS_ERR(kvm))
		return PTR_ERR(kvm);
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	r = kvm_coalesced_mmio_init(kvm);
	if (r < 0) {
		kvm_put_kvm(kvm);
		return r;
	}
#endif
	fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
	if (fd < 0)
		kvm_put_kvm(kvm);

	return fd;
}

static long kvm_dev_ioctl_check_extension_generic(long arg)
{
	switch (arg) {
	case KVM_CAP_USER_MEMORY:
	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
	case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
	case KVM_CAP_SET_BOOT_CPU_ID:
#endif
	case KVM_CAP_INTERNAL_ERROR_DATA:
		return 1;
#ifdef CONFIG_HAVE_KVM_IRQCHIP
	case KVM_CAP_IRQ_ROUTING:
		return KVM_MAX_IRQ_ROUTES;
#endif
	default:
		break;
	}
	return kvm_dev_ioctl_check_extension(arg);
}

static long kvm_dev_ioctl(struct file *filp,
			  unsigned int ioctl, unsigned long arg)
{
	long r = -EINVAL;

	switch (ioctl) {
	case KVM_GET_API_VERSION:
		r = -EINVAL;
		if (arg)
			goto out;
		r = KVM_API_VERSION;
		break;
	case KVM_CREATE_VM:
		r = -EINVAL;
		if (arg)
			goto out;
		r = kvm_dev_ioctl_create_vm();
		break;
	case KVM_CHECK_EXTENSION:
		r = kvm_dev_ioctl_check_extension_generic(arg);
		break;
	case KVM_GET_VCPU_MMAP_SIZE:
		r = -EINVAL;
		if (arg)
			goto out;
		r = PAGE_SIZE;     /* struct kvm_run */
#ifdef CONFIG_X86
		r += PAGE_SIZE;    /* pio data page */
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
		r += PAGE_SIZE;    /* coalesced mmio ring page */
#endif
		break;
	case KVM_TRACE_ENABLE:
	case KVM_TRACE_PAUSE:
	case KVM_TRACE_DISABLE:
		r = -EOPNOTSUPP;
		break;
	default:
		return kvm_arch_dev_ioctl(filp, ioctl, arg);
	}
out:
	return r;
}

static struct file_operations kvm_chardev_ops = {
	.unlocked_ioctl = kvm_dev_ioctl,
	.compat_ioctl   = kvm_dev_ioctl,
	.llseek		= noop_llseek,
};

static struct miscdevice kvm_dev = {
	KVM_MINOR,
	"kvm",
	&kvm_chardev_ops,
};

static void hardware_enable(void *junk)
{
	int cpu = raw_smp_processor_id();
	int r;

	if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
		return;

	cpumask_set_cpu(cpu, cpus_hardware_enabled);

	r = kvm_arch_hardware_enable(NULL);

	if (r) {
		cpumask_clear_cpu(cpu, cpus_hardware_enabled);
		atomic_inc(&hardware_enable_failed);
		printk(KERN_INFO "kvm: enabling virtualization on "
				 "CPU%d failed\n", cpu);
	}
}

static void hardware_disable(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
		return;
	cpumask_clear_cpu(cpu, cpus_hardware_enabled);
	kvm_arch_hardware_disable(NULL);
}

static void hardware_disable_all_nolock(void)
{
	BUG_ON(!kvm_usage_count);

	kvm_usage_count--;
	if (!kvm_usage_count)
		on_each_cpu(hardware_disable, NULL, 1);
}

static void hardware_disable_all(void)
{
	spin_lock(&kvm_lock);
	hardware_disable_all_nolock();
	spin_unlock(&kvm_lock);
}

static int hardware_enable_all(void)
{
	int r = 0;

	spin_lock(&kvm_lock);

	kvm_usage_count++;
	if (kvm_usage_count == 1) {
		atomic_set(&hardware_enable_failed, 0);
		on_each_cpu(hardware_enable, NULL, 1);

		if (atomic_read(&hardware_enable_failed)) {
			hardware_disable_all_nolock();
			r = -EBUSY;
		}
	}

	spin_unlock(&kvm_lock);

	return r;
}

static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
			   void *v)
{
	int cpu = (long)v;

	if (!kvm_usage_count)
		return NOTIFY_OK;

	val &= ~CPU_TASKS_FROZEN;
	switch (val) {
	case CPU_DYING:
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
		hardware_disable(NULL);
		break;
	case CPU_STARTING:
		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
		       cpu);
		spin_lock(&kvm_lock);
		hardware_enable(NULL);
		spin_unlock(&kvm_lock);
		break;
	}
	return NOTIFY_OK;
}


asmlinkage void kvm_handle_fault_on_reboot(void)
{
	if (kvm_rebooting) {
		/* spin while reset goes on */
		local_irq_enable();
		while (true)
			cpu_relax();
	}
	/* Fault while not rebooting.  We want the trace. */
	BUG();
}
EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot);

static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
		      void *v)
{
	/*
	 * Some (well, at least mine) BIOSes hang on reboot if
	 * in vmx root mode.
	 *
	 * And Intel TXT requires VMX to be off on all CPUs when the
	 * system shuts down.
	 */
	printk(KERN_INFO "kvm: exiting hardware virtualization\n");
	kvm_rebooting = true;
	on_each_cpu(hardware_disable, NULL, 1);
	return NOTIFY_OK;
}

static struct notifier_block kvm_reboot_notifier = {
	.notifier_call = kvm_reboot,
	.priority = 0,
};

static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
	int i;

	for (i = 0; i < bus->dev_count; i++) {
		struct kvm_io_device *pos = bus->devs[i];

		kvm_iodevice_destructor(pos);
	}
	kfree(bus);
}

/* kvm_io_bus_write - called under kvm->slots_lock */
int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
		     int len, const void *val)
{
	int i;
	struct kvm_io_bus *bus;

	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
	for (i = 0; i < bus->dev_count; i++)
		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
			return 0;
	return -EOPNOTSUPP;
}

/* kvm_io_bus_read - called under kvm->slots_lock */
int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
		    int len, void *val)
{
	int i;
	struct kvm_io_bus *bus;

	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
	for (i = 0; i < bus->dev_count; i++)
		if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
			return 0;
	return -EOPNOTSUPP;
}

/* Caller must hold slots_lock. */
int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx,
			    struct kvm_io_device *dev)
{
	struct kvm_io_bus *new_bus, *bus;

	bus = kvm->buses[bus_idx];
	if (bus->dev_count > NR_IOBUS_DEVS-1)
		return -ENOSPC;

	new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
	if (!new_bus)
		return -ENOMEM;
	memcpy(new_bus, bus, sizeof(struct kvm_io_bus));
	new_bus->devs[new_bus->dev_count++] = dev;
	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
	synchronize_srcu_expedited(&kvm->srcu);
	kfree(bus);

	return 0;
}

/* Caller must hold slots_lock. */
int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
			      struct kvm_io_device *dev)
{
	int i, r;
	struct kvm_io_bus *new_bus, *bus;

	new_bus = kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL);
	if (!new_bus)
		return -ENOMEM;

	bus = kvm->buses[bus_idx];
	memcpy(new_bus, bus, sizeof(struct kvm_io_bus));

	r = -ENOENT;
	for (i = 0; i < new_bus->dev_count; i++)
		if (new_bus->devs[i] == dev) {
			r = 0;
			new_bus->devs[i] = new_bus->devs[--new_bus->dev_count];
			break;
		}

	if (r) {
		kfree(new_bus);
		return r;
	}

	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
	synchronize_srcu_expedited(&kvm->srcu);
	kfree(bus);
	return r;
}

static struct notifier_block kvm_cpu_notifier = {
	.notifier_call = kvm_cpu_hotplug,
};

static int vm_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;

	*val = 0;
	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		*val += *(u32 *)((void *)kvm + offset);
	spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");

static int vcpu_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	int i;

	*val = 0;
	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		kvm_for_each_vcpu(i, vcpu, kvm)
			*val += *(u32 *)((void *)vcpu + offset);

	spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");

static const struct file_operations *stat_fops[] = {
	[KVM_STAT_VCPU] = &vcpu_stat_fops,
	[KVM_STAT_VM]   = &vm_stat_fops,
};

static void kvm_init_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
	for (p = debugfs_entries; p->name; ++p)
		p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
						(void *)(long)p->offset,
						stat_fops[p->kind]);
}

static void kvm_exit_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	for (p = debugfs_entries; p->name; ++p)
		debugfs_remove(p->dentry);
	debugfs_remove(kvm_debugfs_dir);
}

static int kvm_suspend(struct sys_device *dev, pm_message_t state)
{
	if (kvm_usage_count)
		hardware_disable(NULL);
	return 0;
}

static int kvm_resume(struct sys_device *dev)
{
	if (kvm_usage_count) {
		WARN_ON(spin_is_locked(&kvm_lock));
		hardware_enable(NULL);
	}
	return 0;
}

static struct sysdev_class kvm_sysdev_class = {
	.name = "kvm",
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

static struct sys_device kvm_sysdev = {
	.id = 0,
	.cls = &kvm_sysdev_class,
};

struct page *bad_page;
pfn_t bad_pfn;

static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
	return container_of(pn, struct kvm_vcpu, preempt_notifier);
}

static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
			  struct task_struct *next)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_put(vcpu);
}

int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
	     struct module *module)
{
	int r;
	int cpu;

	r = kvm_arch_init(opaque);
	if (r)
		goto out_fail;

	bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (bad_page == NULL) {
		r = -ENOMEM;
		goto out;
	}

	bad_pfn = page_to_pfn(bad_page);

	hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (hwpoison_page == NULL) {
		r = -ENOMEM;
		goto out_free_0;
	}

	hwpoison_pfn = page_to_pfn(hwpoison_page);

	fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (fault_page == NULL) {
		r = -ENOMEM;
		goto out_free_0;
	}

	fault_pfn = page_to_pfn(fault_page);

	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
		r = -ENOMEM;
		goto out_free_0;
	}

	r = kvm_arch_hardware_setup();
	if (r < 0)
		goto out_free_0a;

	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu,
				kvm_arch_check_processor_compat,
				&r, 1);
		if (r < 0)
			goto out_free_1;
	}

	r = register_cpu_notifier(&kvm_cpu_notifier);
	if (r)
		goto out_free_2;
	register_reboot_notifier(&kvm_reboot_notifier);

	r = sysdev_class_register(&kvm_sysdev_class);
	if (r)
		goto out_free_3;

	r = sysdev_register(&kvm_sysdev);
	if (r)
		goto out_free_4;

	/* A kmem cache lets us meet the alignment requirements of fx_save. */
	if (!vcpu_align)
		vcpu_align = __alignof__(struct kvm_vcpu);
	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
					   0, NULL);
	if (!kvm_vcpu_cache) {
		r = -ENOMEM;
		goto out_free_5;
	}

	kvm_chardev_ops.owner = module;
	kvm_vm_fops.owner = module;
	kvm_vcpu_fops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
		printk(KERN_ERR "kvm: misc device register failed\n");
		goto out_free;
	}

	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

	kvm_init_debug();

	return 0;

out_free:
	kmem_cache_destroy(kvm_vcpu_cache);
out_free_5:
	sysdev_unregister(&kvm_sysdev);
out_free_4:
	sysdev_class_unregister(&kvm_sysdev_class);
out_free_3:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_2:
out_free_1:
	kvm_arch_hardware_unsetup();
out_free_0a:
	free_cpumask_var(cpus_hardware_enabled);
out_free_0:
	if (fault_page)
		__free_page(fault_page);
	if (hwpoison_page)
		__free_page(hwpoison_page);
	__free_page(bad_page);
out:
	kvm_arch_exit();
out_fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

void kvm_exit(void)
{
	kvm_exit_debug();
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	sysdev_unregister(&kvm_sysdev);
	sysdev_class_unregister(&kvm_sysdev_class);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
	on_each_cpu(hardware_disable, NULL, 1);
	kvm_arch_hardware_unsetup();
	kvm_arch_exit();
	free_cpumask_var(cpus_hardware_enabled);
	__free_page(hwpoison_page);
	__free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);
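
/*
 * Illustrative userspace sketch (not part of this module): a minimal,
 * hedged example of how the /dev/kvm, VM and vcpu file descriptors
 * implemented above are typically driven from userspace.  The slot number
 * and memory size are arbitrary example values and all error handling is
 * omitted; the annotations name the handlers in this file that service
 * each call.
 *
 *	int kvm, vm, vcpu;
 *	long mmap_size;
 *	struct kvm_run *run;
 *	struct kvm_userspace_memory_region region = {
 *		.slot            = 0,
 *		.guest_phys_addr = 0,
 *		.memory_size     = 0x100000,
 *		.userspace_addr  = (__u64)mmap(NULL, 0x100000,
 *					       PROT_READ | PROT_WRITE,
 *					       MAP_SHARED | MAP_ANONYMOUS,
 *					       -1, 0),
 *	};
 *
 *	kvm  = open("/dev/kvm", O_RDWR);		(kvm_dev_ioctl)
 *	vm   = ioctl(kvm, KVM_CREATE_VM, 0);		(kvm_dev_ioctl_create_vm)
 *	ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region);	(kvm_vm_ioctl_set_memory_region)
 *	vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);		(kvm_vm_ioctl_create_vcpu)
 *	mmap_size = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, 0);
 *	run  = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
 *		    vcpu, 0);				(kvm_vcpu_mmap/kvm_vcpu_fault)
 *	ioctl(vcpu, KVM_RUN, 0);			(kvm_arch_vcpu_ioctl_run)
 */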