/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "iodev.h"

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/sysdev.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>

#include <asm/processor.h>
#include <asm/io.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>

#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
#include "coalesced_mmio.h"
#endif

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

DEFINE_SPINLOCK(kvm_lock);
LIST_HEAD(vm_list);

static cpumask_t cpus_hardware_enabled;

struct kmem_cache *kvm_vcpu_cache;
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);

static __read_mostly struct preempt_ops kvm_preempt_ops;

struct dentry *kvm_debugfs_dir;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);

bool kvm_rebooting;

static inline int valid_vcpu(int n)
{
	return likely(n >= 0 && n < KVM_MAX_VCPUS);
}

/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu;

	mutex_lock(&vcpu->mutex);
	cpu = get_cpu();
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
}

void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	preempt_enable();
	mutex_unlock(&vcpu->mutex);
}

static void ack_flush(void *_completed)
{
}

void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	int i, cpu, me;
	cpumask_t cpus;
	struct kvm_vcpu *vcpu;

	me = get_cpu();
	cpus_clear(cpus);
	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
		vcpu = kvm->vcpus[i];
		if (!vcpu)
			continue;
		if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
			continue;
		cpu = vcpu->cpu;
		if (cpu != -1 && cpu != me)
			cpu_set(cpu, cpus);
	}
	if (cpus_empty(cpus))
		goto out;
	++kvm->stat.remote_tlb_flush;
	smp_call_function_mask(cpus, ack_flush, NULL, 1);
out:
	put_cpu();
}

void kvm_reload_remote_mmus(struct kvm *kvm)
{
	int i, cpu, me;
	cpumask_t cpus;
	struct kvm_vcpu *vcpu;

	me = get_cpu();
	cpus_clear(cpus);
	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
		vcpu = kvm->vcpus[i];
		if (!vcpu)
			continue;
		if (test_and_set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
			continue;
		cpu = vcpu->cpu;
		if (cpu != -1 && cpu != me)
			cpu_set(cpu, cpus);
	}
	if (cpus_empty(cpus))
		goto out;
	smp_call_function_mask(cpus, ack_flush, NULL, 1);
out:
	put_cpu();
}
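
/*
 * Initialize vcpu state that is common to all architectures: the mutex,
 * the wait queue and the shared kvm_run page, then hand off to the
 * architecture-specific init code.
 */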
int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	struct page *page;
	int r;

	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
	init_waitqueue_head(&vcpu->wq);

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		r = -ENOMEM;
		goto fail;
	}
	vcpu->run = page_address(page);

	r = kvm_arch_vcpu_init(vcpu);
	if (r < 0)
		goto fail_free_run;
	return 0;

fail_free_run:
	free_page((unsigned long)vcpu->run);
fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_init);

void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
{
	kvm_arch_vcpu_uninit(vcpu);
	free_page((unsigned long)vcpu->run);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
	return container_of(mn, struct kvm, mmu_notifier);
}

static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
					     struct mm_struct *mm,
					     unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush;

	/*
	 * When ->invalidate_page runs, the linux pte has been zapped
	 * already but the page is still allocated until
	 * ->invalidate_page returns.  So if we increase the sequence
	 * here the kvm page fault will notice if the spte can't be
	 * established because the page is going to be freed.  If
	 * instead the kvm page fault establishes the spte before
	 * ->invalidate_page runs, kvm_unmap_hva will release it
	 * before returning.
	 *
	 * The sequence increase only needs to be seen at spin_unlock
	 * time, and not at spin_lock time.
	 *
	 * Increasing the sequence after the spin_unlock would be
	 * unsafe because the kvm page fault could then establish the
	 * pte after kvm_unmap_hva returned, without noticing the page
	 * is going to be freed.
	 */
	spin_lock(&kvm->mmu_lock);
	kvm->mmu_notifier_seq++;
	need_tlb_flush = kvm_unmap_hva(kvm, address);
	spin_unlock(&kvm->mmu_lock);

	/* we have to flush the tlb before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);
}

static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
						    struct mm_struct *mm,
						    unsigned long start,
						    unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush = 0;

	spin_lock(&kvm->mmu_lock);
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
	kvm->mmu_notifier_count++;
	for (; start < end; start += PAGE_SIZE)
		need_tlb_flush |= kvm_unmap_hva(kvm, start);
	spin_unlock(&kvm->mmu_lock);

	/* we have to flush the tlb before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);
}

static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
						  struct mm_struct *mm,
						  unsigned long start,
						  unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);

	spin_lock(&kvm->mmu_lock);
	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
	kvm->mmu_notifier_seq++;
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease, but both values are read by the kvm
	 * page fault under the mmu_lock spinlock so we don't need to
	 * add a smp_wmb() here in between the two.
	 */
	kvm->mmu_notifier_count--;
	spin_unlock(&kvm->mmu_lock);

	BUG_ON(kvm->mmu_notifier_count < 0);
}

static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young;

	spin_lock(&kvm->mmu_lock);
	young = kvm_age_hva(kvm, address);
	spin_unlock(&kvm->mmu_lock);

	if (young)
		kvm_flush_remote_tlbs(kvm);

	return young;
}

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
};
#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */

static struct kvm *kvm_create_vm(void)
{
	struct kvm *kvm = kvm_arch_create_vm();
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	struct page *page;
#endif

	if (IS_ERR(kvm))
		goto out;

#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		kfree(kvm);
		return ERR_PTR(-ENOMEM);
	}
	kvm->coalesced_mmio_ring =
			(struct kvm_coalesced_mmio_ring *)page_address(page);
#endif

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	{
		int err;
		kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
		err = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
		if (err) {
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
			put_page(page);
#endif
			kfree(kvm);
			return ERR_PTR(err);
		}
	}
#endif

	kvm->mm = current->mm;
	atomic_inc(&kvm->mm->mm_count);
	spin_lock_init(&kvm->mmu_lock);
	kvm_io_bus_init(&kvm->pio_bus);
	mutex_init(&kvm->lock);
	kvm_io_bus_init(&kvm->mmio_bus);
	init_rwsem(&kvm->slots_lock);
	atomic_set(&kvm->users_count, 1);
	spin_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	spin_unlock(&kvm_lock);
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	kvm_coalesced_mmio_init(kvm);
#endif
out:
	return kvm;
}

/*
 * Free any memory in @free but not in @dont.
 */
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
				  struct kvm_memory_slot *dont)
{
	if (!dont || free->rmap != dont->rmap)
		vfree(free->rmap);

	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
		vfree(free->dirty_bitmap);

	if (!dont || free->lpage_info != dont->lpage_info)
		vfree(free->lpage_info);

	free->npages = 0;
	free->dirty_bitmap = NULL;
	free->rmap = NULL;
	free->lpage_info = NULL;
}

void kvm_free_physmem(struct kvm *kvm)
{
	int i;

	for (i = 0; i < kvm->nmemslots; ++i)
		kvm_free_physmem_slot(&kvm->memslots[i], NULL);
}

static void kvm_destroy_vm(struct kvm *kvm)
{
	struct mm_struct *mm = kvm->mm;

	spin_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	spin_unlock(&kvm_lock);
	kvm_io_bus_destroy(&kvm->pio_bus);
	kvm_io_bus_destroy(&kvm->mmio_bus);
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	if (kvm->coalesced_mmio_ring != NULL)
		free_page((unsigned long)kvm->coalesced_mmio_ring);
#endif
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
#endif
	kvm_arch_destroy_vm(kvm);
	mmdrop(mm);
}

void kvm_get_kvm(struct kvm *kvm)
{
	atomic_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);

void kvm_put_kvm(struct kvm *kvm)
{
	if (atomic_dec_and_test(&kvm->users_count))
		kvm_destroy_vm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);

static int kvm_vm_release(struct inode *inode, struct file *filp)
{
	struct kvm *kvm = filp->private_data;

	kvm_put_kvm(kvm);
	return 0;
}

/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
 *
 * Must be called holding mmap_sem for write.
 */
int __kvm_set_memory_region(struct kvm *kvm,
			    struct kvm_userspace_memory_region *mem,
			    int user_alloc)
{
	int r;
	gfn_t base_gfn;
	unsigned long npages;
	unsigned long i;
	struct kvm_memory_slot *memslot;
	struct kvm_memory_slot old, new;

	r = -EINVAL;
	/* General sanity checks */
	if (mem->memory_size & (PAGE_SIZE - 1))
		goto out;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		goto out;
	if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
		goto out;
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
		goto out;

	memslot = &kvm->memslots[mem->slot];
	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
	npages = mem->memory_size >> PAGE_SHIFT;

	if (!npages)
		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;

	new = old = *memslot;

	new.base_gfn = base_gfn;
	new.npages = npages;
	new.flags = mem->flags;

	/* Disallow changing a memory slot's size. */
	r = -EINVAL;
	if (npages && old.npages && npages != old.npages)
		goto out_free;

	/* Check for overlaps */
	r = -EEXIST;
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *s = &kvm->memslots[i];

		if (s == memslot)
			continue;
		if (!((base_gfn + npages <= s->base_gfn) ||
		      (base_gfn >= s->base_gfn + s->npages)))
			goto out_free;
	}

	/* Free page dirty bitmap if unneeded */
	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
		new.dirty_bitmap = NULL;

	r = -ENOMEM;

	/* Allocate if a slot is being created */
#ifndef CONFIG_S390
	if (npages && !new.rmap) {
		new.rmap = vmalloc(npages * sizeof(struct page *));

		if (!new.rmap)
			goto out_free;

		memset(new.rmap, 0, npages * sizeof(*new.rmap));

		new.user_alloc = user_alloc;
		/*
		 * hva_to_rmmap() serializes with the mmu_lock and to be
		 * safe it has to ignore memslots with !user_alloc &&
		 * !userspace_addr.
		 */
		if (user_alloc)
			new.userspace_addr = mem->userspace_addr;
		else
			new.userspace_addr = 0;
	}
	if (npages && !new.lpage_info) {
		int largepages = npages / KVM_PAGES_PER_HPAGE;
		if (npages % KVM_PAGES_PER_HPAGE)
			largepages++;
		if (base_gfn % KVM_PAGES_PER_HPAGE)
			largepages++;

		new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info));

		if (!new.lpage_info)
			goto out_free;

		memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info));

		if (base_gfn % KVM_PAGES_PER_HPAGE)
			new.lpage_info[0].write_count = 1;
		if ((base_gfn + npages) % KVM_PAGES_PER_HPAGE)
			new.lpage_info[largepages - 1].write_count = 1;
	}

	/* Allocate page dirty bitmap if needed */
	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
		unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;

		new.dirty_bitmap = vmalloc(dirty_bytes);
		if (!new.dirty_bitmap)
			goto out_free;
		memset(new.dirty_bitmap, 0, dirty_bytes);
	}
#endif /* not defined CONFIG_S390 */

	if (!npages)
		kvm_arch_flush_shadow(kvm);

	spin_lock(&kvm->mmu_lock);
	if (mem->slot >= kvm->nmemslots)
		kvm->nmemslots = mem->slot + 1;

	*memslot = new;
	spin_unlock(&kvm->mmu_lock);

	r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
	if (r) {
		spin_lock(&kvm->mmu_lock);
		*memslot = old;
		spin_unlock(&kvm->mmu_lock);
		goto out_free;
	}

	kvm_free_physmem_slot(&old, &new);
	return 0;

out_free:
	kvm_free_physmem_slot(&new, &old);
out:
	return r;
}
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);

int kvm_set_memory_region(struct kvm *kvm,
			  struct kvm_userspace_memory_region *mem,
			  int user_alloc)
{
	int r;

	down_write(&kvm->slots_lock);
	r = __kvm_set_memory_region(kvm, mem, user_alloc);
	up_write(&kvm->slots_lock);
	return r;
}
EXPORT_SYMBOL_GPL(kvm_set_memory_region);

int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
				   struct kvm_userspace_memory_region *mem,
				   int user_alloc)
{
	if (mem->slot >= KVM_MEMORY_SLOTS)
		return -EINVAL;
	return kvm_set_memory_region(kvm, mem, user_alloc);
}

int kvm_get_dirty_log(struct kvm *kvm,
		      struct kvm_dirty_log *log, int *is_dirty)
{
	struct kvm_memory_slot *memslot;
	int r, i;
	int n;
	unsigned long any = 0;

	r = -EINVAL;
	if (log->slot >= KVM_MEMORY_SLOTS)
		goto out;

	memslot = &kvm->memslots[log->slot];
	r = -ENOENT;
	if (!memslot->dirty_bitmap)
		goto out;

	n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;

	for (i = 0; !any && i < n / sizeof(long); ++i)
		any = memslot->dirty_bitmap[i];

	r = -EFAULT;
	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
		goto out;

	if (any)
		*is_dirty = 1;

	r = 0;
out:
	return r;
}

int is_error_page(struct page *page)
{
	return page == bad_page;
}
EXPORT_SYMBOL_GPL(is_error_page);

int is_error_pfn(pfn_t pfn)
{
	return pfn == bad_pfn;
}
EXPORT_SYMBOL_GPL(is_error_pfn);

static inline unsigned long bad_hva(void)
{
	return PAGE_OFFSET;
}

int kvm_is_error_hva(unsigned long addr)
{
	return addr == bad_hva();
}
EXPORT_SYMBOL_GPL(kvm_is_error_hva);

static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	int i;

	for (i = 0; i < kvm->nmemslots; ++i) {
		struct kvm_memory_slot *memslot = &kvm->memslots[i];

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages)
			return memslot;
	}
	return NULL;
}

struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	gfn = unalias_gfn(kvm, gfn);
	return __gfn_to_memslot(kvm, gfn);
}

int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
	int i;

	gfn = unalias_gfn(kvm, gfn);
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *memslot = &kvm->memslots[i];

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages)
			return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);

unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;

	gfn = unalias_gfn(kvm, gfn);
	slot = __gfn_to_memslot(kvm, gfn);
	if (!slot)
		return bad_hva();
	return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);

/*
 * Requires current->mm->mmap_sem to be held
 */
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
{
	struct page *page[1];
	unsigned long addr;
	int npages;
	pfn_t pfn;

	might_sleep();

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr)) {
		get_page(bad_page);
		return page_to_pfn(bad_page);
	}

	npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page,
				NULL);

	if (unlikely(npages != 1)) {
		struct vm_area_struct *vma;

		vma = find_vma(current->mm, addr);
		if (vma == NULL || addr < vma->vm_start ||
		    !(vma->vm_flags & VM_PFNMAP)) {
			get_page(bad_page);
			return page_to_pfn(bad_page);
		}

		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
		BUG_ON(pfn_valid(pfn));
	} else
		pfn = page_to_pfn(page[0]);

	return pfn;
}
EXPORT_SYMBOL_GPL(gfn_to_pfn);

struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
	pfn_t pfn;

	pfn = gfn_to_pfn(kvm, gfn);
	if (pfn_valid(pfn))
		return pfn_to_page(pfn);

	WARN_ON(!pfn_valid(pfn));

	get_page(bad_page);
	return bad_page;
}
EXPORT_SYMBOL_GPL(gfn_to_page);

void kvm_release_page_clean(struct page *page)
{
	kvm_release_pfn_clean(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);

void kvm_release_pfn_clean(pfn_t pfn)
{
	if (pfn_valid(pfn))
		put_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
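
/*
 * Like kvm_release_page_clean(), but marks the page dirty before the
 * reference taken by gfn_to_page()/gfn_to_pfn() is dropped.
 */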
void kvm_release_page_dirty(struct page *page)
{
	kvm_release_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);

void kvm_release_pfn_dirty(pfn_t pfn)
{
	kvm_set_pfn_dirty(pfn);
	kvm_release_pfn_clean(pfn);
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);

void kvm_set_page_dirty(struct page *page)
{
	kvm_set_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_set_page_dirty);

void kvm_set_pfn_dirty(pfn_t pfn)
{
	if (pfn_valid(pfn)) {
		struct page *page = pfn_to_page(pfn);
		if (!PageReserved(page))
			SetPageDirty(page);
	}
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);

void kvm_set_pfn_accessed(pfn_t pfn)
{
	if (pfn_valid(pfn))
		mark_page_accessed(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);

void kvm_get_pfn(pfn_t pfn)
{
	if (pfn_valid(pfn))
		get_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_get_pfn);

static int next_segment(unsigned long len, int offset)
{
	if (len > PAGE_SIZE - offset)
		return PAGE_SIZE - offset;
	else
		return len;
}

int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
			int len)
{
	int r;
	unsigned long addr;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	r = copy_from_user(data, (void __user *)addr + offset, len);
	if (r)
		return -EFAULT;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page);

int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		data += seg;
		++gfn;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest);

int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
			  unsigned long len)
{
	int r;
	unsigned long addr;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int offset = offset_in_page(gpa);

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	pagefault_disable();
	r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
	pagefault_enable();
	if (r)
		return -EFAULT;
	return 0;
}
EXPORT_SYMBOL(kvm_read_guest_atomic);

int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
			 int offset, int len)
{
	int r;
	unsigned long addr;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	r = copy_to_user((void __user *)addr + offset, data, len);
	if (r)
		return -EFAULT;
	mark_page_dirty(kvm, gfn);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_guest_page);

int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
		    unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		data += seg;
		++gfn;
	}
	return 0;
}

int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
{
	return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
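
/*
 * Zero a range of guest physical memory, splitting the range into
 * page-sized chunks the same way kvm_read_guest()/kvm_write_guest() do.
 */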
int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		++gfn;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_clear_guest);

void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *memslot;

	gfn = unalias_gfn(kvm, gfn);
	memslot = __gfn_to_memslot(kvm, gfn);
	if (memslot && memslot->dirty_bitmap) {
		unsigned long rel_gfn = gfn - memslot->base_gfn;

		/* avoid RMW */
		if (!test_bit(rel_gfn, memslot->dirty_bitmap))
			set_bit(rel_gfn, memslot->dirty_bitmap);
	}
}

/*
 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
 */
void kvm_vcpu_block(struct kvm_vcpu *vcpu)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);

		if (kvm_cpu_has_interrupt(vcpu))
			break;
		if (kvm_cpu_has_pending_timer(vcpu))
			break;
		if (kvm_arch_vcpu_runnable(vcpu))
			break;
		if (signal_pending(current))
			break;

		vcpu_put(vcpu);
		schedule();
		vcpu_load(vcpu);
	}

	finish_wait(&vcpu->wq, &wait);
}

void kvm_resched(struct kvm_vcpu *vcpu)
{
	if (!need_resched())
		return;
	cond_resched();
}
EXPORT_SYMBOL_GPL(kvm_resched);

static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct kvm_vcpu *vcpu = vma->vm_file->private_data;
	struct page *page;

	if (vmf->pgoff == 0)
		page = virt_to_page(vcpu->run);
#ifdef CONFIG_X86
	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
		page = virt_to_page(vcpu->arch.pio_data);
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
#endif
	else
		return VM_FAULT_SIGBUS;
	get_page(page);
	vmf->page = page;
	return 0;
}

static struct vm_operations_struct kvm_vcpu_vm_ops = {
	.fault = kvm_vcpu_fault,
};

static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vcpu_vm_ops;
	return 0;
}

static int kvm_vcpu_release(struct inode *inode, struct file *filp)
{
	struct kvm_vcpu *vcpu = filp->private_data;

	kvm_put_kvm(vcpu->kvm);
	return 0;
}

static const struct file_operations kvm_vcpu_fops = {
	.release	= kvm_vcpu_release,
	.unlocked_ioctl	= kvm_vcpu_ioctl,
	.compat_ioctl	= kvm_vcpu_ioctl,
	.mmap		= kvm_vcpu_mmap,
};

/*
 * Allocates an inode for the vcpu.
 */
static int create_vcpu_fd(struct kvm_vcpu *vcpu)
{
	int fd = anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0);
	if (fd < 0)
		kvm_put_kvm(vcpu->kvm);
	return fd;
}

/*
 * Creates some virtual cpus.  Good luck creating more than one.
 */
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
{
	int r;
	struct kvm_vcpu *vcpu;

	if (!valid_vcpu(n))
		return -EINVAL;

	vcpu = kvm_arch_vcpu_create(kvm, n);
	if (IS_ERR(vcpu))
		return PTR_ERR(vcpu);

	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);

	r = kvm_arch_vcpu_setup(vcpu);
	if (r)
		goto vcpu_destroy;

	mutex_lock(&kvm->lock);
	if (kvm->vcpus[n]) {
		r = -EEXIST;
		mutex_unlock(&kvm->lock);
		goto vcpu_destroy;
	}
	kvm->vcpus[n] = vcpu;
	mutex_unlock(&kvm->lock);

	/* Now it's all set up, let userspace reach it */
	kvm_get_kvm(kvm);
	r = create_vcpu_fd(vcpu);
	if (r < 0)
		goto unlink;
	return r;

unlink:
	mutex_lock(&kvm->lock);
	kvm->vcpus[n] = NULL;
	mutex_unlock(&kvm->lock);
vcpu_destroy:
	kvm_arch_vcpu_destroy(vcpu);
	return r;
}

static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
{
	if (sigset) {
		sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
		vcpu->sigset_active = 1;
		vcpu->sigset = *sigset;
	} else
		vcpu->sigset_active = 0;
	return 0;
}

static long kvm_vcpu_ioctl(struct file *filp,
			   unsigned int ioctl, unsigned long arg)
{
	struct kvm_vcpu *vcpu = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;

	if (vcpu->kvm->mm != current->mm)
		return -EIO;
	switch (ioctl) {
	case KVM_RUN:
		r = -EINVAL;
		if (arg)
			goto out;
		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
		break;
	case KVM_GET_REGS: {
		struct kvm_regs *kvm_regs;

		r = -ENOMEM;
		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
		if (!kvm_regs)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
		if (r)
			goto out_free1;
		r = -EFAULT;
		if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
			goto out_free1;
		r = 0;
out_free1:
		kfree(kvm_regs);
		break;
	}
	case KVM_SET_REGS: {
		struct kvm_regs *kvm_regs;

		r = -ENOMEM;
		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
		if (!kvm_regs)
			goto out;
		r = -EFAULT;
		if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs)))
			goto out_free2;
		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
		if (r)
			goto out_free2;
		r = 0;
out_free2:
		kfree(kvm_regs);
		break;
	}
	case KVM_GET_SREGS: {
		struct kvm_sregs kvm_sregs;

		memset(&kvm_sregs, 0, sizeof kvm_sregs);
		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_SREGS: {
		struct kvm_sregs kvm_sregs;

		r = -EFAULT;
		if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_GET_MP_STATE: {
		struct kvm_mp_state mp_state;

		r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &mp_state, sizeof mp_state))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_MP_STATE: {
		struct kvm_mp_state mp_state;

		r = -EFAULT;
		if (copy_from_user(&mp_state, argp, sizeof mp_state))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_TRANSLATE: {
		struct kvm_translation tr;

		r = -EFAULT;
		if (copy_from_user(&tr, argp, sizeof tr))
			goto out;
		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &tr, sizeof tr))
			goto out;
		r = 0;
		break;
	}
	case KVM_DEBUG_GUEST: {
		struct kvm_debug_guest dbg;

		r = -EFAULT;
		if (copy_from_user(&dbg, argp, sizeof dbg))
			goto out;
		r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_SIGNAL_MASK: {
		struct kvm_signal_mask __user *sigmask_arg = argp;
		struct kvm_signal_mask kvm_sigmask;
		sigset_t sigset, *p;

		p = NULL;
		if (argp) {
			r = -EFAULT;
			if (copy_from_user(&kvm_sigmask, argp,
					   sizeof kvm_sigmask))
				goto out;
			r = -EINVAL;
			if (kvm_sigmask.len != sizeof sigset)
				goto out;
			r = -EFAULT;
			if (copy_from_user(&sigset, sigmask_arg->sigset,
					   sizeof sigset))
				goto out;
			p = &sigset;
		}
		/* pass p, which stays NULL when no mask was supplied */
		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
		break;
	}
	case KVM_GET_FPU: {
		struct kvm_fpu fpu;

		memset(&fpu, 0, sizeof fpu);
		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &fpu, sizeof fpu))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_FPU: {
		struct kvm_fpu fpu;

		r = -EFAULT;
		if (copy_from_user(&fpu, argp, sizeof fpu))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu);
		if (r)
			goto out;
		r = 0;
		break;
	}
	default:
		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
	}
out:
	return r;
}

static long kvm_vm_ioctl(struct file *filp,
			 unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;

	if (kvm->mm != current->mm)
		return -EIO;
	switch (ioctl) {
	case KVM_CREATE_VCPU:
		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
		if (r < 0)
			goto out;
		break;
	case KVM_SET_USER_MEMORY_REGION: {
		struct kvm_userspace_memory_region kvm_userspace_mem;

		r = -EFAULT;
		if (copy_from_user(&kvm_userspace_mem, argp,
				   sizeof kvm_userspace_mem))
			goto out;

		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
		if (r)
			goto out;
		break;
	}
	case KVM_GET_DIRTY_LOG: {
		struct kvm_dirty_log log;

		r = -EFAULT;
		if (copy_from_user(&log, argp, sizeof log))
			goto out;
		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
		if (r)
			goto out;
		break;
	}
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	case KVM_REGISTER_COALESCED_MMIO: {
		struct kvm_coalesced_mmio_zone zone;
		r = -EFAULT;
		if (copy_from_user(&zone, argp, sizeof zone))
			goto out;
		r = -ENXIO;
		r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_UNREGISTER_COALESCED_MMIO: {
		struct kvm_coalesced_mmio_zone zone;
		r = -EFAULT;
		if (copy_from_user(&zone, argp, sizeof zone))
			goto out;
		r = -ENXIO;
		r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
		if (r)
			goto out;
		r = 0;
		break;
	}
#endif
	default:
		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
	}
out:
	return r;
}
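
/*
 * Fault handler for mmap() on a VM fd: the faulting page offset is treated
 * as a guest frame number and the backing page is handed to the VM.
 */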
static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct kvm *kvm = vma->vm_file->private_data;
	struct page *page;

	if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
		return VM_FAULT_SIGBUS;
	page = gfn_to_page(kvm, vmf->pgoff);
	if (is_error_page(page)) {
		kvm_release_page_clean(page);
		return VM_FAULT_SIGBUS;
	}
	vmf->page = page;
	return 0;
}

static struct vm_operations_struct kvm_vm_vm_ops = {
	.fault = kvm_vm_fault,
};

static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vm_vm_ops;
	return 0;
}

static const struct file_operations kvm_vm_fops = {
	.release	= kvm_vm_release,
	.unlocked_ioctl	= kvm_vm_ioctl,
	.compat_ioctl	= kvm_vm_ioctl,
	.mmap		= kvm_vm_mmap,
};

static int kvm_dev_ioctl_create_vm(void)
{
	int fd;
	struct kvm *kvm;

	kvm = kvm_create_vm();
	if (IS_ERR(kvm))
		return PTR_ERR(kvm);
	fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, 0);
	if (fd < 0)
		kvm_put_kvm(kvm);

	return fd;
}

static long kvm_dev_ioctl(struct file *filp,
			  unsigned int ioctl, unsigned long arg)
{
	long r = -EINVAL;

	switch (ioctl) {
	case KVM_GET_API_VERSION:
		r = -EINVAL;
		if (arg)
			goto out;
		r = KVM_API_VERSION;
		break;
	case KVM_CREATE_VM:
		r = -EINVAL;
		if (arg)
			goto out;
		r = kvm_dev_ioctl_create_vm();
		break;
	case KVM_CHECK_EXTENSION:
		r = kvm_dev_ioctl_check_extension(arg);
		break;
	case KVM_GET_VCPU_MMAP_SIZE:
		r = -EINVAL;
		if (arg)
			goto out;
		r = PAGE_SIZE;		/* struct kvm_run */
#ifdef CONFIG_X86
		r += PAGE_SIZE;		/* pio data page */
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
		r += PAGE_SIZE;		/* coalesced mmio ring page */
#endif
		break;
	case KVM_TRACE_ENABLE:
	case KVM_TRACE_PAUSE:
	case KVM_TRACE_DISABLE:
		r = kvm_trace_ioctl(ioctl, arg);
		break;
	default:
		return kvm_arch_dev_ioctl(filp, ioctl, arg);
	}
out:
	return r;
}

static struct file_operations kvm_chardev_ops = {
	.unlocked_ioctl	= kvm_dev_ioctl,
	.compat_ioctl	= kvm_dev_ioctl,
};

static struct miscdevice kvm_dev = {
	KVM_MINOR,
	"kvm",
	&kvm_chardev_ops,
};

static void hardware_enable(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (cpu_isset(cpu, cpus_hardware_enabled))
		return;
	cpu_set(cpu, cpus_hardware_enabled);
	kvm_arch_hardware_enable(NULL);
}

static void hardware_disable(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (!cpu_isset(cpu, cpus_hardware_enabled))
		return;
	cpu_clear(cpu, cpus_hardware_enabled);
	kvm_arch_hardware_disable(NULL);
}

static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
			   void *v)
{
	int cpu = (long)v;

	val &= ~CPU_TASKS_FROZEN;
	switch (val) {
	case CPU_DYING:
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
		hardware_disable(NULL);
		break;
	case CPU_UP_CANCELED:
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
		smp_call_function_single(cpu, hardware_disable, NULL, 1);
		break;
	case CPU_ONLINE:
		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
		       cpu);
		smp_call_function_single(cpu, hardware_enable, NULL, 1);
		break;
	}
	return NOTIFY_OK;
}

asmlinkage void kvm_handle_fault_on_reboot(void)
{
	if (kvm_rebooting)
		/* spin while reset goes on */
		while (true)
			;
	/* Fault while not rebooting.  We want the trace. */
	BUG();
}
EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot);

static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
		      void *v)
{
	if (val == SYS_RESTART) {
		/*
		 * Some (well, at least mine) BIOSes hang on reboot if
		 * in vmx root mode.
		 */
		printk(KERN_INFO "kvm: exiting hardware virtualization\n");
		kvm_rebooting = true;
		on_each_cpu(hardware_disable, NULL, 1);
	}
	return NOTIFY_OK;
}

static struct notifier_block kvm_reboot_notifier = {
	.notifier_call = kvm_reboot,
	.priority = 0,
};

void kvm_io_bus_init(struct kvm_io_bus *bus)
{
	memset(bus, 0, sizeof(*bus));
}

void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
	int i;

	for (i = 0; i < bus->dev_count; i++) {
		struct kvm_io_device *pos = bus->devs[i];

		kvm_iodevice_destructor(pos);
	}
}

struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus,
					  gpa_t addr, int len, int is_write)
{
	int i;

	for (i = 0; i < bus->dev_count; i++) {
		struct kvm_io_device *pos = bus->devs[i];

		if (pos->in_range(pos, addr, len, is_write))
			return pos;
	}

	return NULL;
}

void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
{
	BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));

	bus->devs[bus->dev_count++] = dev;
}

static struct notifier_block kvm_cpu_notifier = {
	.notifier_call = kvm_cpu_hotplug,
	.priority = 20, /* must be > scheduler priority */
};

static int vm_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;

	*val = 0;
	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		*val += *(u32 *)((void *)kvm + offset);
	spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");

static int vcpu_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	int i;

	*val = 0;
	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
			vcpu = kvm->vcpus[i];
			if (vcpu)
				*val += *(u32 *)((void *)vcpu + offset);
		}
	spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");

static struct file_operations *stat_fops[] = {
	[KVM_STAT_VCPU] = &vcpu_stat_fops,
	[KVM_STAT_VM]   = &vm_stat_fops,
};

static void kvm_init_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
	for (p = debugfs_entries; p->name; ++p)
		p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
						(void *)(long)p->offset,
						stat_fops[p->kind]);
}
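
/*
 * Remove the debugfs entries created by kvm_init_debug(), including the
 * top-level "kvm" directory.
 */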
static void kvm_exit_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	for (p = debugfs_entries; p->name; ++p)
		debugfs_remove(p->dentry);
	debugfs_remove(kvm_debugfs_dir);
}

static int kvm_suspend(struct sys_device *dev, pm_message_t state)
{
	hardware_disable(NULL);
	return 0;
}

static int kvm_resume(struct sys_device *dev)
{
	hardware_enable(NULL);
	return 0;
}

static struct sysdev_class kvm_sysdev_class = {
	.name = "kvm",
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

static struct sys_device kvm_sysdev = {
	.id = 0,
	.cls = &kvm_sysdev_class,
};

struct page *bad_page;
pfn_t bad_pfn;

static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
	return container_of(pn, struct kvm_vcpu, preempt_notifier);
}

static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
			  struct task_struct *next)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_put(vcpu);
}

int kvm_init(void *opaque, unsigned int vcpu_size,
	     struct module *module)
{
	int r;
	int cpu;

	kvm_init_debug();

	r = kvm_arch_init(opaque);
	if (r)
		goto out_fail;

	bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (bad_page == NULL) {
		r = -ENOMEM;
		goto out;
	}

	bad_pfn = page_to_pfn(bad_page);

	r = kvm_arch_hardware_setup();
	if (r < 0)
		goto out_free_0;

	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu,
				kvm_arch_check_processor_compat,
				&r, 1);
		if (r < 0)
			goto out_free_1;
	}

	on_each_cpu(hardware_enable, NULL, 1);
	r = register_cpu_notifier(&kvm_cpu_notifier);
	if (r)
		goto out_free_2;
	register_reboot_notifier(&kvm_reboot_notifier);

	r = sysdev_class_register(&kvm_sysdev_class);
	if (r)
		goto out_free_3;

	r = sysdev_register(&kvm_sysdev);
	if (r)
		goto out_free_4;

	/* A kmem cache lets us meet the alignment requirements of fx_save. */
	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
					   __alignof__(struct kvm_vcpu),
					   0, NULL);
	if (!kvm_vcpu_cache) {
		r = -ENOMEM;
		goto out_free_5;
	}

	kvm_chardev_ops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
		printk(KERN_ERR "kvm: misc device register failed\n");
		goto out_free;
	}

	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

	return 0;

out_free:
	kmem_cache_destroy(kvm_vcpu_cache);
out_free_5:
	sysdev_unregister(&kvm_sysdev);
out_free_4:
	sysdev_class_unregister(&kvm_sysdev_class);
out_free_3:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_2:
	on_each_cpu(hardware_disable, NULL, 1);
out_free_1:
	kvm_arch_hardware_unsetup();
out_free_0:
	__free_page(bad_page);
out:
	kvm_arch_exit();
	kvm_exit_debug();
out_fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

void kvm_exit(void)
{
	kvm_trace_cleanup();
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	sysdev_unregister(&kvm_sysdev);
	sysdev_class_unregister(&kvm_sysdev_class);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
	on_each_cpu(hardware_disable, NULL, 1);
	kvm_arch_hardware_unsetup();
	kvm_arch_exit();
	kvm_exit_debug();
	__free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);