/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "iodev.h"

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>

#include <asm/processor.h>
#include <asm/io.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>

#include "coalesced_mmio.h"
#include "async_pf.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/*
 * Ordering of locks:
 *
 *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

DEFINE_RAW_SPINLOCK(kvm_lock);
LIST_HEAD(vm_list);

static cpumask_var_t cpus_hardware_enabled;
static int kvm_usage_count = 0;
static atomic_t hardware_enable_failed;

struct kmem_cache *kvm_vcpu_cache;
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);

static __read_mostly struct preempt_ops kvm_preempt_ops;

struct dentry *kvm_debugfs_dir;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
#ifdef CONFIG_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				  unsigned long arg);
#endif
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);

static bool largepages_enabled = true;

bool kvm_is_mmio_pfn(pfn_t pfn)
{
	if (pfn_valid(pfn)) {
		int reserved;
		struct page *tail = pfn_to_page(pfn);
		struct page *head = compound_trans_head(tail);
		reserved = PageReserved(head);
		if (head != tail) {
			/*
			 * "head" is not a dangling pointer
			 * (compound_trans_head takes care of that)
			 * but the hugepage may have been split
			 * from under us (and we may not hold a
			 * reference count on the head page so it can
			 * be reused before we run PageReferenced), so
			 * we have to check PageTail before returning
			 * what we just read.
			 */
			smp_rmb();
			if (PageTail(tail))
				return reserved;
		}
		return PageReserved(tail);
	}

	return true;
}

/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
int vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu;

	if (mutex_lock_killable(&vcpu->mutex))
		return -EINTR;
	if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
		/* The thread running this VCPU changed. */
		struct pid *oldpid = vcpu->pid;
		struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
		rcu_assign_pointer(vcpu->pid, newpid);
		synchronize_rcu();
		put_pid(oldpid);
	}
	cpu = get_cpu();
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
	return 0;
}

void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	preempt_enable();
	mutex_unlock(&vcpu->mutex);
}

static void ack_flush(void *_completed)
{
}

static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
	int i, cpu, me;
	cpumask_var_t cpus;
	bool called = true;
	struct kvm_vcpu *vcpu;

	zalloc_cpumask_var(&cpus, GFP_ATOMIC);

	me = get_cpu();
	kvm_for_each_vcpu(i, vcpu, kvm) {
		kvm_make_request(req, vcpu);
		cpu = vcpu->cpu;

		/* Set ->requests bit before we read ->mode */
		smp_mb();

		if (cpus != NULL && cpu != -1 && cpu != me &&
		    kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
			cpumask_set_cpu(cpu, cpus);
	}
	if (unlikely(cpus == NULL))
		smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
	else if (!cpumask_empty(cpus))
		smp_call_function_many(cpus, ack_flush, NULL, 1);
	else
		called = false;
	put_cpu();
	free_cpumask_var(cpus);
	return called;
}

void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	long dirty_count = kvm->tlbs_dirty;

	smp_mb();
	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		++kvm->stat.remote_tlb_flush;
	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
}

void kvm_reload_remote_mmus(struct kvm *kvm)
{
	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
}

void kvm_make_mclock_inprogress_request(struct kvm *kvm)
{
	make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
}

int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	struct page *page;
	int r;

	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
	vcpu->pid = NULL;
	init_waitqueue_head(&vcpu->wq);
	kvm_async_pf_vcpu_init(vcpu);

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		r = -ENOMEM;
		goto fail;
	}
	vcpu->run = page_address(page);

	kvm_vcpu_set_in_spin_loop(vcpu, false);
	kvm_vcpu_set_dy_eligible(vcpu, false);

	r = kvm_arch_vcpu_init(vcpu);
	if (r < 0)
		goto fail_free_run;
	return 0;

fail_free_run:
	free_page((unsigned long)vcpu->run);
fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_init);

void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
{
	put_pid(vcpu->pid);
	kvm_arch_vcpu_uninit(vcpu);
	free_page((unsigned long)vcpu->run);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
	return container_of(mn, struct kvm, mmu_notifier);
}

static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
					     struct mm_struct *mm,
					     unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush, idx;

	/*
	 * When ->invalidate_page runs, the linux pte has been zapped
	 * already but the page is still allocated until
	 * ->invalidate_page returns. So if we increase the sequence
	 * here the kvm page fault will notice if the spte can't be
	 * established because the page is going to be freed. If
	 * instead the kvm page fault establishes the spte before
	 * ->invalidate_page runs, kvm_unmap_hva will release it
	 * before returning.
	 *
	 * The sequence increase only needs to be seen at spin_unlock
	 * time, and not at spin_lock time.
	 *
	 * Increasing the sequence after the spin_unlock would be
	 * unsafe because the kvm page fault could then establish the
	 * pte after kvm_unmap_hva returned, without noticing the page
	 * is going to be freed.
	 */
	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);

	kvm->mmu_notifier_seq++;
	need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
	/* we have to flush the tlb before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long address,
					pte_t pte)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	kvm->mmu_notifier_seq++;
	kvm_set_spte_hva(kvm, address, pte);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
						    struct mm_struct *mm,
						    unsigned long start,
						    unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush = 0, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
	kvm->mmu_notifier_count++;
	need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
	need_tlb_flush |= kvm->tlbs_dirty;
	/* we have to flush the tlb before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
						  struct mm_struct *mm,
						  unsigned long start,
						  unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);

	spin_lock(&kvm->mmu_lock);
	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
	kvm->mmu_notifier_seq++;
	smp_wmb();
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease, which is ensured by the smp_wmb above
	 * in conjunction with the smp_rmb in mmu_notifier_retry().
	 */
	kvm->mmu_notifier_count--;
	spin_unlock(&kvm->mmu_lock);

	BUG_ON(kvm->mmu_notifier_count < 0);
}

static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);

	young = kvm_age_hva(kvm, address);
	if (young)
		kvm_flush_remote_tlbs(kvm);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return young;
}

static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	young = kvm_test_age_hva(kvm, address);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return young;
}

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	kvm_arch_flush_shadow_all(kvm);
	srcu_read_unlock(&kvm->srcu, idx);
}

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
	.test_young		= kvm_mmu_notifier_test_young,
	.change_pte		= kvm_mmu_notifier_change_pte,
	.release		= kvm_mmu_notifier_release,
};

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}

#else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	return 0;
}

#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */

static void kvm_init_memslots_id(struct kvm *kvm)
{
	int i;
	struct kvm_memslots *slots = kvm->memslots;

	for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
		slots->id_to_index[i] = slots->memslots[i].id = i;
}

static struct kvm *kvm_create_vm(unsigned long type)
{
	int r, i;
	struct kvm *kvm = kvm_arch_alloc_vm();

	if (!kvm)
		return ERR_PTR(-ENOMEM);

	r = kvm_arch_init_vm(kvm, type);
	if (r)
		goto out_err_nodisable;

	r = hardware_enable_all();
	if (r)
		goto out_err_nodisable;

#ifdef CONFIG_HAVE_KVM_IRQCHIP
	INIT_HLIST_HEAD(&kvm->mask_notifier_list);
	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif

	r = -ENOMEM;
	kvm->memslots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
	if (!kvm->memslots)
		goto out_err_nosrcu;
	kvm_init_memslots_id(kvm);
	if (init_srcu_struct(&kvm->srcu))
		goto out_err_nosrcu;
	for (i = 0; i < KVM_NR_BUSES; i++) {
		kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
					GFP_KERNEL);
		if (!kvm->buses[i])
			goto out_err;
	}

	spin_lock_init(&kvm->mmu_lock);
	kvm->mm = current->mm;
	atomic_inc(&kvm->mm->mm_count);
	kvm_eventfd_init(kvm);
	mutex_init(&kvm->lock);
	mutex_init(&kvm->irq_lock);
	mutex_init(&kvm->slots_lock);
	atomic_set(&kvm->users_count, 1);

	r = kvm_init_mmu_notifier(kvm);
	if (r)
		goto out_err;

	raw_spin_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	raw_spin_unlock(&kvm_lock);

	return kvm;

out_err:
	cleanup_srcu_struct(&kvm->srcu);
out_err_nosrcu:
	hardware_disable_all();
out_err_nodisable:
	for (i = 0; i < KVM_NR_BUSES; i++)
		kfree(kvm->buses[i]);
	kfree(kvm->memslots);
	kvm_arch_free_vm(kvm);
	return ERR_PTR(r);
}

/*
 * Avoid using vmalloc for a small buffer.
 * Should not be used when the size is statically known.
 */
void *kvm_kvzalloc(unsigned long size)
{
	if (size > PAGE_SIZE)
		return vzalloc(size);
	else
		return kzalloc(size, GFP_KERNEL);
}

void kvm_kvfree(const void *addr)
{
	if (is_vmalloc_addr(addr))
		vfree(addr);
	else
		kfree(addr);
}

static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	if (!memslot->dirty_bitmap)
		return;

	kvm_kvfree(memslot->dirty_bitmap);
	memslot->dirty_bitmap = NULL;
}

/*
 * Free any memory in @free but not in @dont.
 */
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
				  struct kvm_memory_slot *dont)
{
	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
		kvm_destroy_dirty_bitmap(free);

	kvm_arch_free_memslot(free, dont);

	free->npages = 0;
}

void kvm_free_physmem(struct kvm *kvm)
{
	struct kvm_memslots *slots = kvm->memslots;
	struct kvm_memory_slot *memslot;

	kvm_for_each_memslot(memslot, slots)
		kvm_free_physmem_slot(memslot, NULL);

	kfree(kvm->memslots);
}

static void kvm_destroy_vm(struct kvm *kvm)
{
	int i;
	struct mm_struct *mm = kvm->mm;

	kvm_arch_sync_events(kvm);
	raw_spin_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	raw_spin_unlock(&kvm_lock);
	kvm_free_irq_routing(kvm);
	for (i = 0; i < KVM_NR_BUSES; i++)
		kvm_io_bus_destroy(kvm->buses[i]);
	kvm_coalesced_mmio_free(kvm);
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
#else
	kvm_arch_flush_shadow_all(kvm);
#endif
	kvm_arch_destroy_vm(kvm);
	kvm_free_physmem(kvm);
	cleanup_srcu_struct(&kvm->srcu);
	kvm_arch_free_vm(kvm);
	hardware_disable_all();
	mmdrop(mm);
}

void kvm_get_kvm(struct kvm *kvm)
{
	atomic_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);

void kvm_put_kvm(struct kvm *kvm)
{
	if (atomic_dec_and_test(&kvm->users_count))
		kvm_destroy_vm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);


static int kvm_vm_release(struct inode *inode, struct file *filp)
{
	struct kvm *kvm = filp->private_data;

	kvm_irqfd_release(kvm);

	kvm_put_kvm(kvm);
	return 0;
}

/*
 * Allocation size is twice as large as the actual dirty bitmap size.
 * See x86's kvm_vm_ioctl_get_dirty_log() for why this is needed.
 */
static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
{
#ifndef CONFIG_S390
	unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);

	memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes);
	if (!memslot->dirty_bitmap)
		return -ENOMEM;

#endif /* !CONFIG_S390 */
	return 0;
}

static int cmp_memslot(const void *slot1, const void *slot2)
{
	struct kvm_memory_slot *s1, *s2;

	s1 = (struct kvm_memory_slot *)slot1;
	s2 = (struct kvm_memory_slot *)slot2;

	if (s1->npages < s2->npages)
		return 1;
	if (s1->npages > s2->npages)
		return -1;

	return 0;
}

/*
 * Sort the memslots based on their size, so that the larger slots
 * will get a better fit.
 */
static void sort_memslots(struct kvm_memslots *slots)
{
	int i;

	sort(slots->memslots, KVM_MEM_SLOTS_NUM,
	     sizeof(struct kvm_memory_slot), cmp_memslot, NULL);

	for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
		slots->id_to_index[slots->memslots[i].id] = i;
}

void update_memslots(struct kvm_memslots *slots, struct kvm_memory_slot *new)
{
	if (new) {
		int id = new->id;
		struct kvm_memory_slot *old = id_to_memslot(slots, id);
		unsigned long npages = old->npages;

		*old = *new;
		if (new->npages != npages)
			sort_memslots(slots);
	}

	slots->generation++;
}

static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
{
	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;

#ifdef KVM_CAP_READONLY_MEM
	valid_flags |= KVM_MEM_READONLY;
#endif

	if (mem->flags & ~valid_flags)
		return -EINVAL;

	return 0;
}

/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
 *
 * Must be called holding mmap_sem for write.
 */
int __kvm_set_memory_region(struct kvm *kvm,
			    struct kvm_userspace_memory_region *mem,
			    int user_alloc)
{
	int r;
	gfn_t base_gfn;
	unsigned long npages;
	struct kvm_memory_slot *memslot, *slot;
	struct kvm_memory_slot old, new;
	struct kvm_memslots *slots, *old_memslots;

	r = check_memory_region_flags(mem);
	if (r)
		goto out;

	r = -EINVAL;
	/* General sanity checks */
	if (mem->memory_size & (PAGE_SIZE - 1))
		goto out;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		goto out;
	/* We can read the guest memory with __xxx_user() later on. */
	if (user_alloc &&
	    ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
	     !access_ok(VERIFY_WRITE,
			(void __user *)(unsigned long)mem->userspace_addr,
			mem->memory_size)))
		goto out;
	if (mem->slot >= KVM_MEM_SLOTS_NUM)
		goto out;
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
		goto out;

	memslot = id_to_memslot(kvm->memslots, mem->slot);
	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
	npages = mem->memory_size >> PAGE_SHIFT;

	r = -EINVAL;
	if (npages > KVM_MEM_MAX_NR_PAGES)
		goto out;

	if (!npages)
		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;

	new = old = *memslot;

	new.id = mem->slot;
	new.base_gfn = base_gfn;
	new.npages = npages;
	new.flags = mem->flags;

	/* Disallow changing a memory slot's size. */
	r = -EINVAL;
	if (npages && old.npages && npages != old.npages)
		goto out_free;

	/* Check for overlaps */
	r = -EEXIST;
	kvm_for_each_memslot(slot, kvm->memslots) {
		if (slot->id >= KVM_MEMORY_SLOTS || slot == memslot)
			continue;
		if (!((base_gfn + npages <= slot->base_gfn) ||
		      (base_gfn >= slot->base_gfn + slot->npages)))
			goto out_free;
	}

	/* Free page dirty bitmap if unneeded */
	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
		new.dirty_bitmap = NULL;

	r = -ENOMEM;

	/* Allocate if a slot is being created */
	if (npages && !old.npages) {
		new.user_alloc = user_alloc;
		new.userspace_addr = mem->userspace_addr;

		if (kvm_arch_create_memslot(&new, npages))
			goto out_free;
	}

	/* Allocate page dirty bitmap if needed */
	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
		if (kvm_create_dirty_bitmap(&new) < 0)
			goto out_free;
		/* destroy any largepage mappings for dirty tracking */
	}

	if (!npages || base_gfn != old.base_gfn) {
		struct kvm_memory_slot *slot;

		r = -ENOMEM;
		slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
				GFP_KERNEL);
		if (!slots)
			goto out_free;
		slot = id_to_memslot(slots, mem->slot);
		slot->flags |= KVM_MEMSLOT_INVALID;

		update_memslots(slots, NULL);

		old_memslots = kvm->memslots;
		rcu_assign_pointer(kvm->memslots, slots);
		synchronize_srcu_expedited(&kvm->srcu);
		/* From this point no new shadow pages pointing to a deleted,
		 * or moved, memslot will be created.
		 *
		 * validation of sp->gfn happens in:
		 *	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)
		 *	- kvm_is_visible_gfn (mmu_check_roots)
		 */
		kvm_arch_flush_shadow_memslot(kvm, slot);
		kfree(old_memslots);
	}

	r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc);
	if (r)
		goto out_free;

	/* map/unmap the pages in iommu page table */
	if (npages) {
		r = kvm_iommu_map_pages(kvm, &new);
		if (r)
			goto out_free;
	} else
		kvm_iommu_unmap_pages(kvm, &old);

	r = -ENOMEM;
	slots = kmemdup(kvm->memslots, sizeof(struct kvm_memslots),
			GFP_KERNEL);
	if (!slots)
		goto out_free;

	/* actual memory is freed via old in kvm_free_physmem_slot below */
	if (!npages) {
		new.dirty_bitmap = NULL;
		memset(&new.arch, 0, sizeof(new.arch));
	}

	update_memslots(slots, &new);
	old_memslots = kvm->memslots;
	rcu_assign_pointer(kvm->memslots, slots);
	synchronize_srcu_expedited(&kvm->srcu);

	kvm_arch_commit_memory_region(kvm, mem, old, user_alloc);

	kvm_free_physmem_slot(&old, &new);
	kfree(old_memslots);

	return 0;

out_free:
	kvm_free_physmem_slot(&new, &old);
out:
	return r;

}
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);

int kvm_set_memory_region(struct kvm *kvm,
			  struct kvm_userspace_memory_region *mem,
			  int user_alloc)
{
	int r;

	mutex_lock(&kvm->slots_lock);
	r = __kvm_set_memory_region(kvm, mem, user_alloc);
	mutex_unlock(&kvm->slots_lock);
	return r;
}
EXPORT_SYMBOL_GPL(kvm_set_memory_region);

int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
				   struct kvm_userspace_memory_region *mem,
				   int user_alloc)
{
	if (mem->slot >= KVM_MEMORY_SLOTS)
		return -EINVAL;
	return kvm_set_memory_region(kvm, mem, user_alloc);
}

int kvm_get_dirty_log(struct kvm *kvm,
			struct kvm_dirty_log *log, int *is_dirty)
{
	struct kvm_memory_slot *memslot;
	int r, i;
	unsigned long n;
	unsigned long any = 0;

	r = -EINVAL;
	if (log->slot >= KVM_MEMORY_SLOTS)
		goto out;

	memslot = id_to_memslot(kvm->memslots, log->slot);
	r = -ENOENT;
	if (!memslot->dirty_bitmap)
		goto out;

	n = kvm_dirty_bitmap_bytes(memslot);

	for (i = 0; !any && i < n/sizeof(long); ++i)
		any = memslot->dirty_bitmap[i];

	r = -EFAULT;
	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
		goto out;

	if (any)
		*is_dirty = 1;

	r = 0;
out:
	return r;
}

bool kvm_largepages_enabled(void)
{
	return largepages_enabled;
}

void kvm_disable_largepages(void)
{
	largepages_enabled = false;
}
EXPORT_SYMBOL_GPL(kvm_disable_largepages);

struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
}
EXPORT_SYMBOL_GPL(gfn_to_memslot);

int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);

	if (!memslot || memslot->id >= KVM_MEMORY_SLOTS ||
	    memslot->flags & KVM_MEMSLOT_INVALID)
		return 0;

	return 1;
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);

unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
{
	struct vm_area_struct *vma;
	unsigned long addr, size;

	size = PAGE_SIZE;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return PAGE_SIZE;

	down_read(&current->mm->mmap_sem);
	vma = find_vma(current->mm, addr);
	if (!vma)
		goto out;

	size = vma_kernel_pagesize(vma);

out:
	up_read(&current->mm->mmap_sem);

	return size;
}

static bool memslot_is_readonly(struct kvm_memory_slot *slot)
{
	return slot->flags & KVM_MEM_READONLY;
}

static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
				       gfn_t *nr_pages, bool write)
{
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return KVM_HVA_ERR_BAD;

	if (memslot_is_readonly(slot) && write)
		return KVM_HVA_ERR_RO_BAD;

	if (nr_pages)
		*nr_pages = slot->npages - (gfn - slot->base_gfn);

	return __gfn_to_hva_memslot(slot, gfn);
}

static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
				     gfn_t *nr_pages)
{
	return __gfn_to_hva_many(slot, gfn, nr_pages, true);
}

unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
				 gfn_t gfn)
{
	return gfn_to_hva_many(slot, gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);

unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);

/*
 * The hva returned by this function is only allowed to be read.
 * It should pair with kvm_read_hva() or kvm_read_hva_atomic().
 */
static unsigned long gfn_to_hva_read(struct kvm *kvm, gfn_t gfn)
{
	return __gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL, false);
}

static int kvm_read_hva(void *data, void __user *hva, int len)
{
	return __copy_from_user(data, hva, len);
}

static int kvm_read_hva_atomic(void *data, void __user *hva, int len)
{
	return __copy_from_user_inatomic(data, hva, len);
}

int get_user_page_nowait(struct task_struct *tsk, struct mm_struct *mm,
	unsigned long start, int write, struct page **page)
{
	int flags = FOLL_TOUCH | FOLL_NOWAIT | FOLL_HWPOISON | FOLL_GET;

	if (write)
		flags |= FOLL_WRITE;

	return __get_user_pages(tsk, mm, start, 1, flags, page, NULL, NULL);
}

static inline int check_user_page_hwpoison(unsigned long addr)
{
	int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;

	rc = __get_user_pages(current, current->mm, addr, 1,
			      flags, NULL, NULL, NULL);
	return rc == -EHWPOISON;
}

/*
 * The atomic path to get the writable pfn which will be stored in @pfn,
 * true indicates success, otherwise false is returned.
 */
static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
			    bool write_fault, bool *writable, pfn_t *pfn)
{
	struct page *page[1];
	int npages;

	if (!(async || atomic))
		return false;

	/*
	 * Fast pin a writable pfn only if it is a write fault request
	 * or the caller allows to map a writable pfn for a read fault
	 * request.
	 */
	if (!(write_fault || writable))
		return false;

	npages = __get_user_pages_fast(addr, 1, 1, page);
	if (npages == 1) {
		*pfn = page_to_pfn(page[0]);

		if (writable)
			*writable = true;
		return true;
	}

	return false;
}

/*
 * The slow path to get the pfn of the specified host virtual address,
 * 1 indicates success, -errno is returned if error is detected.
 */
static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
			   bool *writable, pfn_t *pfn)
{
	struct page *page[1];
	int npages = 0;

	might_sleep();

	if (writable)
		*writable = write_fault;

	if (async) {
		down_read(&current->mm->mmap_sem);
		npages = get_user_page_nowait(current, current->mm,
					      addr, write_fault, page);
		up_read(&current->mm->mmap_sem);
	} else
		npages = get_user_pages_fast(addr, 1, write_fault,
					     page);
	if (npages != 1)
		return npages;

	/* map read fault as writable if possible */
	if (unlikely(!write_fault) && writable) {
		struct page *wpage[1];

		npages = __get_user_pages_fast(addr, 1, 1, wpage);
		if (npages == 1) {
			*writable = true;
			put_page(page[0]);
			page[0] = wpage[0];
		}

		npages = 1;
	}
	*pfn = page_to_pfn(page[0]);
	return npages;
}

static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
{
	if (unlikely(!(vma->vm_flags & VM_READ)))
		return false;

	if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
		return false;

	return true;
}

/*
 * Pin guest page in memory and return its pfn.
 * @addr: host virtual address which maps memory to the guest
 * @atomic: whether the pfn must be resolved atomically (without sleeping)
 * @async: whether this function needs to wait for IO to complete if the
 *         host page is not in memory
 * @write_fault: whether we should get a writable host page
 * @writable: whether it allows to map a writable host page for !@write_fault
 *
 * The function will map a writable host page for these two cases:
 * 1): @write_fault = true
 * 2): @write_fault = false && @writable, @writable will tell the caller
 *     whether the mapping is writable.
 */
static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
			bool write_fault, bool *writable)
{
	struct vm_area_struct *vma;
	pfn_t pfn = 0;
	int npages;

	/* we can do it either atomically or asynchronously, not both */
	BUG_ON(atomic && async);

	if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn))
		return pfn;

	if (atomic)
		return KVM_PFN_ERR_FAULT;

	npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
	if (npages == 1)
		return pfn;

	down_read(&current->mm->mmap_sem);
	if (npages == -EHWPOISON ||
	    (!async && check_user_page_hwpoison(addr))) {
		pfn = KVM_PFN_ERR_HWPOISON;
		goto exit;
	}

	vma = find_vma_intersection(current->mm, addr, addr + 1);

	if (vma == NULL)
		pfn = KVM_PFN_ERR_FAULT;
	else if ((vma->vm_flags & VM_PFNMAP)) {
		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
			vma->vm_pgoff;
		BUG_ON(!kvm_is_mmio_pfn(pfn));
	} else {
		if (async && vma_is_valid(vma, write_fault))
			*async = true;
		pfn = KVM_PFN_ERR_FAULT;
	}
exit:
	up_read(&current->mm->mmap_sem);
	return pfn;
}

static pfn_t
__gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, bool atomic,
		     bool *async, bool write_fault, bool *writable)
{
	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);

	if (addr == KVM_HVA_ERR_RO_BAD)
		return KVM_PFN_ERR_RO_FAULT;

	if (kvm_is_error_hva(addr))
		return KVM_PFN_NOSLOT;

	/* Do not map writable pfn in the readonly memslot. */
	if (writable && memslot_is_readonly(slot)) {
		*writable = false;
		writable = NULL;
	}

	return hva_to_pfn(addr, atomic, async, write_fault,
			  writable);
}

static pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic, bool *async,
			  bool write_fault, bool *writable)
{
	struct kvm_memory_slot *slot;

	if (async)
		*async = false;

	slot = gfn_to_memslot(kvm, gfn);

	return __gfn_to_pfn_memslot(slot, gfn, atomic, async, write_fault,
				    writable);
}

pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
{
	return __gfn_to_pfn(kvm, gfn, true, NULL, true, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);

pfn_t gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, bool *async,
		       bool write_fault, bool *writable)
{
	return __gfn_to_pfn(kvm, gfn, false, async, write_fault, writable);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_async);

pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
{
	return __gfn_to_pfn(kvm, gfn, false, NULL, true, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn);

pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
		      bool *writable)
{
	return __gfn_to_pfn(kvm, gfn, false, NULL, write_fault, writable);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);

pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
{
	return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL);
}

pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn)
{
	return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);

int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn, struct page **pages,
			    int nr_pages)
{
	unsigned long addr;
	gfn_t entry;

	addr = gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, &entry);
	if (kvm_is_error_hva(addr))
		return -1;

	if (entry < nr_pages)
		return 0;

	return __get_user_pages_fast(addr, nr_pages, 1, pages);
}
EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);

static struct page *kvm_pfn_to_page(pfn_t pfn)
{
	if (is_error_noslot_pfn(pfn))
		return KVM_ERR_PTR_BAD_PAGE;

	if (kvm_is_mmio_pfn(pfn)) {
		WARN_ON(1);
		return KVM_ERR_PTR_BAD_PAGE;
	}

	return pfn_to_page(pfn);
}

struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
	pfn_t pfn;

	pfn = gfn_to_pfn(kvm, gfn);

	return kvm_pfn_to_page(pfn);
}

EXPORT_SYMBOL_GPL(gfn_to_page);

void kvm_release_page_clean(struct page *page)
{
	WARN_ON(is_error_page(page));

	kvm_release_pfn_clean(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);

void kvm_release_pfn_clean(pfn_t pfn)
{
	if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn))
		put_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);

void kvm_release_page_dirty(struct page *page)
{
	WARN_ON(is_error_page(page));

	kvm_release_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);

void kvm_release_pfn_dirty(pfn_t pfn)
{
	kvm_set_pfn_dirty(pfn);
	kvm_release_pfn_clean(pfn);
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);

void kvm_set_page_dirty(struct page *page)
{
	kvm_set_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_set_page_dirty);

void kvm_set_pfn_dirty(pfn_t pfn)
{
	if (!kvm_is_mmio_pfn(pfn)) {
		struct page *page = pfn_to_page(pfn);
		if (!PageReserved(page))
			SetPageDirty(page);
	}
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);

void kvm_set_pfn_accessed(pfn_t pfn)
{
	if (!kvm_is_mmio_pfn(pfn))
		mark_page_accessed(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);

void kvm_get_pfn(pfn_t pfn)
{
	if (!kvm_is_mmio_pfn(pfn))
		get_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_get_pfn);

static int next_segment(unsigned long len, int offset)
{
	if (len > PAGE_SIZE - offset)
		return PAGE_SIZE - offset;
	else
		return len;
}

int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
			int len)
{
	int r;
	unsigned long addr;

	addr = gfn_to_hva_read(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	r = kvm_read_hva(data, (void __user *)addr + offset, len);
	if (r)
		return -EFAULT;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page);

int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		data += seg;
		++gfn;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest);

int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
			  unsigned long len)
{
	int r;
	unsigned long addr;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int offset = offset_in_page(gpa);

	addr = gfn_to_hva_read(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	pagefault_disable();
	r = kvm_read_hva_atomic(data, (void __user *)addr + offset, len);
	pagefault_enable();
	if (r)
		return -EFAULT;
	return 0;
}
EXPORT_SYMBOL(kvm_read_guest_atomic);

int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
			 int offset, int len)
{
	int r;
	unsigned long addr;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	r = __copy_to_user((void __user *)addr + offset, data, len);
	if (r)
		return -EFAULT;
	mark_page_dirty(kvm, gfn);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_guest_page);

int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
		    unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		data += seg;
		++gfn;
	}
	return 0;
}

int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			      gpa_t gpa)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	int offset = offset_in_page(gpa);
	gfn_t gfn = gpa >> PAGE_SHIFT;

	ghc->gpa = gpa;
	ghc->generation = slots->generation;
	ghc->memslot = gfn_to_memslot(kvm, gfn);
	ghc->hva = gfn_to_hva_many(ghc->memslot, gfn, NULL);
	if (!kvm_is_error_hva(ghc->hva))
		ghc->hva += offset;
	else
		return -EFAULT;

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);

int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			   void *data, unsigned long len)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	int r;

	if (slots->generation != ghc->generation)
		kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);

	if (kvm_is_error_hva(ghc->hva))
		return -EFAULT;

	r = __copy_to_user((void __user *)ghc->hva, data, len);
	if (r)
		return -EFAULT;
	mark_page_dirty_in_slot(kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_guest_cached);

int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			  void *data, unsigned long len)
{
	struct kvm_memslots *slots = kvm_memslots(kvm);
	int r;

	if (slots->generation != ghc->generation)
		kvm_gfn_to_hva_cache_init(kvm, ghc, ghc->gpa);

	if (kvm_is_error_hva(ghc->hva))
		return -EFAULT;

	r = __copy_from_user(data, (void __user *)ghc->hva, len);
	if (r)
		return -EFAULT;

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest_cached);

int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
{
	return kvm_write_guest_page(kvm, gfn, (const void *) empty_zero_page,
				    offset, len);
}
EXPORT_SYMBOL_GPL(kvm_clear_guest_page);

int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		++gfn;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_clear_guest);

void mark_page_dirty_in_slot(struct kvm *kvm, struct kvm_memory_slot *memslot,
			     gfn_t gfn)
{
	if (memslot && memslot->dirty_bitmap) {
		unsigned long rel_gfn = gfn - memslot->base_gfn;

		set_bit_le(rel_gfn, memslot->dirty_bitmap);
	}
}

void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *memslot;

	memslot = gfn_to_memslot(kvm, gfn);
	mark_page_dirty_in_slot(kvm, memslot, gfn);
}

/*
 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
 */
void kvm_vcpu_block(struct kvm_vcpu *vcpu)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);

		if (kvm_arch_vcpu_runnable(vcpu)) {
			kvm_make_request(KVM_REQ_UNHALT, vcpu);
			break;
		}
		if (kvm_cpu_has_pending_timer(vcpu))
			break;
		if (signal_pending(current))
			break;

		schedule();
	}

	finish_wait(&vcpu->wq, &wait);
}

#ifndef CONFIG_S390
/*
 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
 */
void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{
	int me;
	int cpu = vcpu->cpu;
	wait_queue_head_t *wqp;

	wqp = kvm_arch_vcpu_wq(vcpu);
	if (waitqueue_active(wqp)) {
		wake_up_interruptible(wqp);
		++vcpu->stat.halt_wakeup;
	}

	me = get_cpu();
	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
		if (kvm_arch_vcpu_should_kick(vcpu))
			smp_send_reschedule(cpu);
	put_cpu();
}
#endif /* !CONFIG_S390 */

void kvm_resched(struct kvm_vcpu *vcpu)
{
	if (!need_resched())
		return;
	cond_resched();
}
EXPORT_SYMBOL_GPL(kvm_resched);

bool kvm_vcpu_yield_to(struct kvm_vcpu *target)
{
	struct pid *pid;
	struct task_struct *task = NULL;

	rcu_read_lock();
	pid = rcu_dereference(target->pid);
	if (pid)
		task = get_pid_task(target->pid, PIDTYPE_PID);
	rcu_read_unlock();
	if (!task)
		return false;
	if (task->flags & PF_VCPU) {
		put_task_struct(task);
		return false;
	}
	if (yield_to(task, 1)) {
		put_task_struct(task);
		return true;
	}
	put_task_struct(task);
	return false;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);

#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
/*
 * Helper that checks whether a VCPU is eligible for directed yield.
 * The most eligible candidate to yield to is decided by the following
 * heuristics:
 *
 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently
 *  (preempted lock holder), indicated by @in_spin_loop.
 *  Set at the beginning and cleared at the end of interception/PLE handler.
 *
 * (b) VCPU which has done pl-exit/cpu relax intercepted but did not get
 *  a chance last time (it has mostly become eligible now since we have
 *  probably yielded to the lock holder in the last iteration. This is done
 *  by toggling @dy_eligible each time a VCPU is checked for eligibility.)
 *
 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
 * to a preempted lock holder could result in wrong VCPU selection and CPU
 * burning. Giving priority to a potential lock holder increases lock
 * progress.
 *
 * Since the algorithm is based on heuristics, accessing another VCPU's data
 * without locking does not harm. It may result in trying to yield to the
 * same VCPU, failing, and continuing with the next VCPU and so on.
 */
bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
{
	bool eligible;

	eligible = !vcpu->spin_loop.in_spin_loop ||
			(vcpu->spin_loop.in_spin_loop &&
			 vcpu->spin_loop.dy_eligible);

	if (vcpu->spin_loop.in_spin_loop)
		kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible);

	return eligible;
}
#endif
void kvm_vcpu_on_spin(struct kvm_vcpu *me)
{
	struct kvm *kvm = me->kvm;
	struct kvm_vcpu *vcpu;
	int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
	int yielded = 0;
	int pass;
	int i;

	kvm_vcpu_set_in_spin_loop(me, true);
	/*
	 * We boost the priority of a VCPU that is runnable but not
	 * currently running, because it got preempted by something
	 * else and called schedule in __vcpu_run.  Hopefully that
	 * VCPU is holding the lock that we need and will release it.
	 * We approximate round-robin by starting at the last boosted VCPU.
	 */
	for (pass = 0; pass < 2 && !yielded; pass++) {
		kvm_for_each_vcpu(i, vcpu, kvm) {
			if (!pass && i <= last_boosted_vcpu) {
				i = last_boosted_vcpu;
				continue;
			} else if (pass && i > last_boosted_vcpu)
				break;
			if (vcpu == me)
				continue;
			if (waitqueue_active(&vcpu->wq))
				continue;
			if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
				continue;
			if (kvm_vcpu_yield_to(vcpu)) {
				kvm->last_boosted_vcpu = i;
				yielded = 1;
				break;
			}
		}
	}
	kvm_vcpu_set_in_spin_loop(me, false);

	/* Ensure vcpu is not eligible during next spinloop */
	kvm_vcpu_set_dy_eligible(me, false);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);

static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct kvm_vcpu *vcpu = vma->vm_file->private_data;
	struct page *page;

	if (vmf->pgoff == 0)
		page = virt_to_page(vcpu->run);
#ifdef CONFIG_X86
	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
		page = virt_to_page(vcpu->arch.pio_data);
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
		page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
#endif
	else
		return kvm_arch_vcpu_fault(vcpu, vmf);
	get_page(page);
	vmf->page = page;
	return 0;
}

static const struct vm_operations_struct kvm_vcpu_vm_ops = {
	.fault = kvm_vcpu_fault,
};

static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vcpu_vm_ops;
	return 0;
}

static int kvm_vcpu_release(struct inode *inode, struct file *filp)
{
	struct kvm_vcpu *vcpu = filp->private_data;

	kvm_put_kvm(vcpu->kvm);
	return 0;
}

static struct file_operations kvm_vcpu_fops = {
	.release        = kvm_vcpu_release,
	.unlocked_ioctl = kvm_vcpu_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = kvm_vcpu_compat_ioctl,
#endif
	.mmap           = kvm_vcpu_mmap,
	.llseek		= noop_llseek,
};

/*
 * Allocates an inode for the vcpu.
 */
static int create_vcpu_fd(struct kvm_vcpu *vcpu)
{
	return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR);
}

/*
 * Creates some virtual cpus.  Good luck creating more than one.
 */
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
{
	int r;
	struct kvm_vcpu *vcpu, *v;

	vcpu = kvm_arch_vcpu_create(kvm, id);
	if (IS_ERR(vcpu))
		return PTR_ERR(vcpu);

	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);

	r = kvm_arch_vcpu_setup(vcpu);
	if (r)
		goto vcpu_destroy;

	mutex_lock(&kvm->lock);
	if (!kvm_vcpu_compatible(vcpu)) {
		r = -EINVAL;
		goto unlock_vcpu_destroy;
	}
	if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
		r = -EINVAL;
		goto unlock_vcpu_destroy;
	}

	kvm_for_each_vcpu(r, v, kvm)
		if (v->vcpu_id == id) {
			r = -EEXIST;
			goto unlock_vcpu_destroy;
		}

	BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]);

	/* Now it's all set up, let userspace reach it */
	kvm_get_kvm(kvm);
	r = create_vcpu_fd(vcpu);
	if (r < 0) {
		kvm_put_kvm(kvm);
		goto unlock_vcpu_destroy;
	}

	kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu;
	smp_wmb();
	atomic_inc(&kvm->online_vcpus);

	mutex_unlock(&kvm->lock);
	kvm_arch_vcpu_postcreate(vcpu);
	return r;

unlock_vcpu_destroy:
	mutex_unlock(&kvm->lock);
vcpu_destroy:
	kvm_arch_vcpu_destroy(vcpu);
	return r;
}

static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
{
	if (sigset) {
		sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
		vcpu->sigset_active = 1;
		vcpu->sigset = *sigset;
	} else
		vcpu->sigset_active = 0;
	return 0;
}

static long kvm_vcpu_ioctl(struct file *filp,
			   unsigned int ioctl, unsigned long arg)
{
	struct kvm_vcpu *vcpu = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;
	struct kvm_fpu *fpu = NULL;
	struct kvm_sregs *kvm_sregs = NULL;

	if (vcpu->kvm->mm != current->mm)
		return -EIO;

#if defined(CONFIG_S390) || defined(CONFIG_PPC)
	/*
	 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
	 * so vcpu_load() would break it.
	 */
	if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
		return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
#endif


	r = vcpu_load(vcpu);
	if (r)
		return r;
	switch (ioctl) {
	case KVM_RUN:
		r = -EINVAL;
		if (arg)
			goto out;
		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
		trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
		break;
	case KVM_GET_REGS: {
		struct kvm_regs *kvm_regs;

		r = -ENOMEM;
		kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL);
		if (!kvm_regs)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs);
		if (r)
			goto out_free1;
		r = -EFAULT;
		if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs)))
			goto out_free1;
		r = 0;
out_free1:
		kfree(kvm_regs);
		break;
	}
	case KVM_SET_REGS: {
		struct kvm_regs *kvm_regs;

		r = -ENOMEM;
		kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
		if (IS_ERR(kvm_regs)) {
			r = PTR_ERR(kvm_regs);
			goto out;
		}
		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs);
		kfree(kvm_regs);
		break;
	}
	case KVM_GET_SREGS: {
		kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL);
		r = -ENOMEM;
		if (!kvm_sregs)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_SREGS: {
		kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
		if (IS_ERR(kvm_sregs)) {
			r = PTR_ERR(kvm_sregs);
			kvm_sregs = NULL;
			goto out;
		}
		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs);
		break;
	}
	case KVM_GET_MP_STATE: {
		struct kvm_mp_state mp_state;

		r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &mp_state, sizeof mp_state))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_MP_STATE: {
		struct kvm_mp_state mp_state;

		r = -EFAULT;
		if (copy_from_user(&mp_state, argp, sizeof mp_state))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
		break;
	}
	case KVM_TRANSLATE: {
		struct kvm_translation tr;

		r = -EFAULT;
		if (copy_from_user(&tr, argp, sizeof tr))
			goto out;
		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &tr, sizeof tr))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_GUEST_DEBUG: {
		struct kvm_guest_debug dbg;

		r = -EFAULT;
		if (copy_from_user(&dbg, argp, sizeof dbg))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
		break;
	}
	case KVM_SET_SIGNAL_MASK: {
		struct kvm_signal_mask __user *sigmask_arg = argp;
		struct kvm_signal_mask kvm_sigmask;
		sigset_t sigset, *p;

		p = NULL;
		if (argp) {
			r = -EFAULT;
			if (copy_from_user(&kvm_sigmask, argp,
					   sizeof kvm_sigmask))
				goto out;
			r = -EINVAL;
			if (kvm_sigmask.len != sizeof sigset)
				goto out;
			r = -EFAULT;
			if (copy_from_user(&sigset, sigmask_arg->sigset,
					   sizeof sigset))
				goto out;
			p = &sigset;
		}
		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
		break;
	}
	case KVM_GET_FPU: {
		fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
		r = -ENOMEM;
		if (!fpu)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_FPU: {
		fpu = memdup_user(argp, sizeof(*fpu));
		if (IS_ERR(fpu)) {
			r = PTR_ERR(fpu);
			fpu = NULL;
			goto out;
		}
		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
		break;
	}
	default:
		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
	}
out:
	vcpu_put(vcpu);
	kfree(fpu);
	kfree(kvm_sregs);
	return r;
}

#ifdef CONFIG_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *filp,
				  unsigned int ioctl, unsigned long arg)
{
	struct kvm_vcpu *vcpu = filp->private_data;
	void __user *argp = compat_ptr(arg);
	int r;

	if (vcpu->kvm->mm != current->mm)
		return -EIO;

	switch (ioctl) {
	case KVM_SET_SIGNAL_MASK: {
		struct kvm_signal_mask __user *sigmask_arg = argp;
		struct kvm_signal_mask kvm_sigmask;
		compat_sigset_t csigset;
		sigset_t sigset;

		if (argp) {
			r = -EFAULT;
			if (copy_from_user(&kvm_sigmask, argp,
					   sizeof kvm_sigmask))
				goto out;
			r = -EINVAL;
			if (kvm_sigmask.len != sizeof csigset)
				goto out;
			r = -EFAULT;
			if (copy_from_user(&csigset, sigmask_arg->sigset,
					   sizeof csigset))
				goto out;
			sigset_from_compat(&sigset, &csigset);
			r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
		} else
			r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
		break;
	}
	default:
		r = kvm_vcpu_ioctl(filp, ioctl, arg);
	}

out:
	return r;
}
#endif

static long kvm_vm_ioctl(struct file *filp,
			   unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;

	if (kvm->mm != current->mm)
		return -EIO;
	switch (ioctl) {
	case KVM_CREATE_VCPU:
		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
		break;
	case KVM_SET_USER_MEMORY_REGION: {
		struct kvm_userspace_memory_region kvm_userspace_mem;

		r = -EFAULT;
		if (copy_from_user(&kvm_userspace_mem, argp,
						sizeof kvm_userspace_mem))
			goto out;

		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
		break;
	}
	case KVM_GET_DIRTY_LOG: {
		struct kvm_dirty_log log;

		r = -EFAULT;
		if (copy_from_user(&log, argp, sizeof log))
			goto out;
		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
		break;
	}
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	case KVM_REGISTER_COALESCED_MMIO: {
		struct kvm_coalesced_mmio_zone zone;
		r = -EFAULT;
		if (copy_from_user(&zone, argp, sizeof zone))
			goto out;
		r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
		break;
	}
	case KVM_UNREGISTER_COALESCED_MMIO: {
		struct kvm_coalesced_mmio_zone zone;
		r = -EFAULT;
		if (copy_from_user(&zone, argp, sizeof zone))
			goto out;
		r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
		break;
	}
#endif
	case KVM_IRQFD: {
		struct kvm_irqfd data;

		r = -EFAULT;
		if (copy_from_user(&data, argp, sizeof data))
			goto out;
		r = kvm_irqfd(kvm, &data);
		break;
	}
	case KVM_IOEVENTFD: {
		struct kvm_ioeventfd data;

		r = -EFAULT;
		if (copy_from_user(&data, argp, sizeof data))
			goto out;
		r = kvm_ioeventfd(kvm, &data);
		break;
	}
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
	case KVM_SET_BOOT_CPU_ID:
		r = 0;
		mutex_lock(&kvm->lock);
		if (atomic_read(&kvm->online_vcpus) != 0)
			r = -EBUSY;
		else
			kvm->bsp_vcpu_id = arg;
		mutex_unlock(&kvm->lock);
		break;
#endif
#ifdef CONFIG_HAVE_KVM_MSI
	case KVM_SIGNAL_MSI: {
		struct kvm_msi msi;

		r = -EFAULT;
		if (copy_from_user(&msi, argp, sizeof msi))
			goto out;
		r = kvm_send_userspace_msi(kvm, &msi);
		break;
	}
#endif
#ifdef __KVM_HAVE_IRQ_LINE
	case KVM_IRQ_LINE_STATUS:
	case KVM_IRQ_LINE: {
		struct kvm_irq_level irq_event;

		r = -EFAULT;
		if (copy_from_user(&irq_event, argp, sizeof irq_event))
			goto out;

		r = kvm_vm_ioctl_irq_line(kvm, &irq_event);
		if (r)
			goto out;

		r = -EFAULT;
		if (ioctl == KVM_IRQ_LINE_STATUS) {
			if (copy_to_user(argp, &irq_event, sizeof irq_event))
				goto out;
		}

		r = 0;
		break;
	}
#endif
	default:
		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
		if (r == -ENOTTY)
			r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
	}
out:
	return r;
}

#ifdef CONFIG_COMPAT
struct compat_kvm_dirty_log {
	__u32 slot;
	__u32 padding1;
	union {
		compat_uptr_t dirty_bitmap; /* one bit per page */
		__u64 padding2;
	};
};

static long kvm_vm_compat_ioctl(struct file *filp,
				unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	int r;

	if (kvm->mm != current->mm)
		return -EIO;
	switch (ioctl) {
	case KVM_GET_DIRTY_LOG: {
		struct compat_kvm_dirty_log compat_log;
		struct kvm_dirty_log log;

		r = -EFAULT;
		if (copy_from_user(&compat_log, (void __user *)arg,
				   sizeof(compat_log)))
			goto out;
		log.slot	 = compat_log.slot;
		log.padding1	 = compat_log.padding1;
		log.padding2	 = compat_log.padding2;
		log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);

		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
		break;
	}
	default:
		r = kvm_vm_ioctl(filp, ioctl, arg);
	}

out:
	return r;
}
#endif

static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page[1];
	unsigned long addr;
	int npages;
	gfn_t gfn = vmf->pgoff;
	struct kvm *kvm = vma->vm_file->private_data;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return VM_FAULT_SIGBUS;

	npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
				NULL);
	if (unlikely(npages != 1))
		return VM_FAULT_SIGBUS;

	vmf->page = page[0];
	return 0;
}

static const struct vm_operations_struct kvm_vm_vm_ops = {
	.fault = kvm_vm_fault,
};

static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vm_vm_ops;
	return 0;
}

static struct file_operations kvm_vm_fops = {
	.release        = kvm_vm_release,
	.unlocked_ioctl = kvm_vm_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = kvm_vm_compat_ioctl,
#endif
	.mmap           = kvm_vm_mmap,
	.llseek		= noop_llseek,
};
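/*
 * The ioctl handlers above and below hang off a small file descriptor
 * hierarchy: opening /dev/kvm gives the system fd served by kvm_dev_ioctl(),
 * KVM_CREATE_VM returns a VM fd backed by kvm_vm_fops, and KVM_CREATE_VCPU
 * on the VM fd returns the per-vcpu fds.  A rough userspace sketch (error
 * handling omitted, names illustrative):
 *
 *	int sys_fd  = open("/dev/kvm", O_RDWR);
 *	int vm_fd   = ioctl(sys_fd, KVM_CREATE_VM, 0);
 *	int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
 *	size_t sz   = ioctl(sys_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *	struct kvm_run *run = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu_fd, 0);
 *
 *	ioctl(vcpu_fd, KVM_RUN, 0);	// exit reason lands in run->exit_reason
 */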
static int kvm_dev_ioctl_create_vm(unsigned long type)
{
	int r;
	struct kvm *kvm;

	kvm = kvm_create_vm(type);
	if (IS_ERR(kvm))
		return PTR_ERR(kvm);
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	r = kvm_coalesced_mmio_init(kvm);
	if (r < 0) {
		kvm_put_kvm(kvm);
		return r;
	}
#endif
	r = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, O_RDWR);
	if (r < 0)
		kvm_put_kvm(kvm);

	return r;
}

static long kvm_dev_ioctl_check_extension_generic(long arg)
{
	switch (arg) {
	case KVM_CAP_USER_MEMORY:
	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
	case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
	case KVM_CAP_SET_BOOT_CPU_ID:
#endif
	case KVM_CAP_INTERNAL_ERROR_DATA:
#ifdef CONFIG_HAVE_KVM_MSI
	case KVM_CAP_SIGNAL_MSI:
#endif
		return 1;
#ifdef KVM_CAP_IRQ_ROUTING
	case KVM_CAP_IRQ_ROUTING:
		return KVM_MAX_IRQ_ROUTES;
#endif
	default:
		break;
	}
	return kvm_dev_ioctl_check_extension(arg);
}

static long kvm_dev_ioctl(struct file *filp,
			  unsigned int ioctl, unsigned long arg)
{
	long r = -EINVAL;

	switch (ioctl) {
	case KVM_GET_API_VERSION:
		r = -EINVAL;
		if (arg)
			goto out;
		r = KVM_API_VERSION;
		break;
	case KVM_CREATE_VM:
		r = kvm_dev_ioctl_create_vm(arg);
		break;
	case KVM_CHECK_EXTENSION:
		r = kvm_dev_ioctl_check_extension_generic(arg);
		break;
	case KVM_GET_VCPU_MMAP_SIZE:
		r = -EINVAL;
		if (arg)
			goto out;
		r = PAGE_SIZE;     /* struct kvm_run */
#ifdef CONFIG_X86
		r += PAGE_SIZE;    /* pio data page */
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
		r += PAGE_SIZE;    /* coalesced mmio ring page */
#endif
		break;
	case KVM_TRACE_ENABLE:
	case KVM_TRACE_PAUSE:
	case KVM_TRACE_DISABLE:
		r = -EOPNOTSUPP;
		break;
	default:
		return kvm_arch_dev_ioctl(filp, ioctl, arg);
	}
out:
	return r;
}

static struct file_operations kvm_chardev_ops = {
	.unlocked_ioctl = kvm_dev_ioctl,
	.compat_ioctl   = kvm_dev_ioctl,
	.llseek		= noop_llseek,
};

static struct miscdevice kvm_dev = {
	KVM_MINOR,
	"kvm",
	&kvm_chardev_ops,
};

static void hardware_enable_nolock(void *junk)
{
	int cpu = raw_smp_processor_id();
	int r;

	if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
		return;

	cpumask_set_cpu(cpu, cpus_hardware_enabled);

	r = kvm_arch_hardware_enable(NULL);

	if (r) {
		cpumask_clear_cpu(cpu, cpus_hardware_enabled);
		atomic_inc(&hardware_enable_failed);
		printk(KERN_INFO "kvm: enabling virtualization on "
				 "CPU%d failed\n", cpu);
	}
}

static void hardware_enable(void *junk)
{
	raw_spin_lock(&kvm_lock);
	hardware_enable_nolock(junk);
	raw_spin_unlock(&kvm_lock);
}

static void hardware_disable_nolock(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
		return;
	cpumask_clear_cpu(cpu, cpus_hardware_enabled);
	kvm_arch_hardware_disable(NULL);
}

static void hardware_disable(void *junk)
{
	raw_spin_lock(&kvm_lock);
	hardware_disable_nolock(junk);
	raw_spin_unlock(&kvm_lock);
}

static void hardware_disable_all_nolock(void)
{
	BUG_ON(!kvm_usage_count);

	kvm_usage_count--;
	if (!kvm_usage_count)
		on_each_cpu(hardware_disable_nolock, NULL, 1);
}
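/*
 * Hardware virtualization (e.g. VMXON on Intel) is switched on for every
 * online CPU when the first VM is created and switched off again when the
 * last VM goes away: hardware_enable_all()/hardware_disable_all() keep the
 * kvm_usage_count reference count under kvm_lock, while the CPU hotplug,
 * reboot and suspend/resume paths below re-apply the current state to
 * individual CPUs.
 */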
static void hardware_disable_all(void)
{
	raw_spin_lock(&kvm_lock);
	hardware_disable_all_nolock();
	raw_spin_unlock(&kvm_lock);
}

static int hardware_enable_all(void)
{
	int r = 0;

	raw_spin_lock(&kvm_lock);

	kvm_usage_count++;
	if (kvm_usage_count == 1) {
		atomic_set(&hardware_enable_failed, 0);
		on_each_cpu(hardware_enable_nolock, NULL, 1);

		if (atomic_read(&hardware_enable_failed)) {
			hardware_disable_all_nolock();
			r = -EBUSY;
		}
	}

	raw_spin_unlock(&kvm_lock);

	return r;
}

static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
			   void *v)
{
	int cpu = (long)v;

	if (!kvm_usage_count)
		return NOTIFY_OK;

	val &= ~CPU_TASKS_FROZEN;
	switch (val) {
	case CPU_DYING:
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
		hardware_disable(NULL);
		break;
	case CPU_STARTING:
		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
		       cpu);
		hardware_enable(NULL);
		break;
	}
	return NOTIFY_OK;
}

asmlinkage void kvm_spurious_fault(void)
{
	/* Fault while not rebooting. We want the trace. */
	BUG();
}
EXPORT_SYMBOL_GPL(kvm_spurious_fault);

static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
		      void *v)
{
	/*
	 * Some (well, at least mine) BIOSes hang on reboot if the CPU is
	 * still in VMX root mode.
	 *
	 * Intel TXT also requires VMX to be off on all CPUs when the system
	 * shuts down.
	 */
	printk(KERN_INFO "kvm: exiting hardware virtualization\n");
	kvm_rebooting = true;
	on_each_cpu(hardware_disable_nolock, NULL, 1);
	return NOTIFY_OK;
}

static struct notifier_block kvm_reboot_notifier = {
	.notifier_call = kvm_reboot,
	.priority = 0,
};

static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
	int i;

	for (i = 0; i < bus->dev_count; i++) {
		struct kvm_io_device *pos = bus->range[i].dev;

		kvm_iodevice_destructor(pos);
	}
	kfree(bus);
}

int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
{
	const struct kvm_io_range *r1 = p1;
	const struct kvm_io_range *r2 = p2;

	if (r1->addr < r2->addr)
		return -1;
	if (r1->addr + r1->len > r2->addr + r2->len)
		return 1;
	return 0;
}

int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev,
			  gpa_t addr, int len)
{
	bus->range[bus->dev_count++] = (struct kvm_io_range) {
		.addr = addr,
		.len = len,
		.dev = dev,
	};

	sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range),
	     kvm_io_bus_sort_cmp, NULL);

	return 0;
}

int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
			     gpa_t addr, int len)
{
	struct kvm_io_range *range, key;
	int off;

	key = (struct kvm_io_range) {
		.addr = addr,
		.len = len,
	};

	range = bsearch(&key, bus->range, bus->dev_count,
			sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp);
	if (range == NULL)
		return -ENOENT;

	off = range - bus->range;

	while (off > 0 && kvm_io_bus_sort_cmp(&key, &bus->range[off-1]) == 0)
		off--;

	return off;
}
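/*
 * Because kvm_io_bus_insert_dev() keeps bus->range sorted with
 * kvm_io_bus_sort_cmp(), a lookup can bsearch() for the access and then walk
 * backwards to the first entry that still compares equal (i.e. whose range
 * still contains the access); that index is what kvm_io_bus_get_first_dev()
 * returns, and the read/write loops below try each matching device in turn
 * until one of them claims the access.
 */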
/* kvm_io_bus_write - called under kvm->slots_lock */
int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
		     int len, const void *val)
{
	int idx;
	struct kvm_io_bus *bus;
	struct kvm_io_range range;

	range = (struct kvm_io_range) {
		.addr = addr,
		.len = len,
	};

	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
	idx = kvm_io_bus_get_first_dev(bus, addr, len);
	if (idx < 0)
		return -EOPNOTSUPP;

	while (idx < bus->dev_count &&
		kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
		if (!kvm_iodevice_write(bus->range[idx].dev, addr, len, val))
			return 0;
		idx++;
	}

	return -EOPNOTSUPP;
}

/* kvm_io_bus_read - called under kvm->slots_lock */
int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
		    int len, void *val)
{
	int idx;
	struct kvm_io_bus *bus;
	struct kvm_io_range range;

	range = (struct kvm_io_range) {
		.addr = addr,
		.len = len,
	};

	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
	idx = kvm_io_bus_get_first_dev(bus, addr, len);
	if (idx < 0)
		return -EOPNOTSUPP;

	while (idx < bus->dev_count &&
		kvm_io_bus_sort_cmp(&range, &bus->range[idx]) == 0) {
		if (!kvm_iodevice_read(bus->range[idx].dev, addr, len, val))
			return 0;
		idx++;
	}

	return -EOPNOTSUPP;
}

/* Caller must hold slots_lock. */
int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
			    int len, struct kvm_io_device *dev)
{
	struct kvm_io_bus *new_bus, *bus;

	bus = kvm->buses[bus_idx];
	if (bus->dev_count > NR_IOBUS_DEVS - 1)
		return -ENOSPC;

	new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) *
			  sizeof(struct kvm_io_range)), GFP_KERNEL);
	if (!new_bus)
		return -ENOMEM;
	memcpy(new_bus, bus, sizeof(*bus) + (bus->dev_count *
	       sizeof(struct kvm_io_range)));
	kvm_io_bus_insert_dev(new_bus, dev, addr, len);
	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
	synchronize_srcu_expedited(&kvm->srcu);
	kfree(bus);

	return 0;
}
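/*
 * Note the copy-then-publish pattern above: the bus is never resized in
 * place.  A new array with room for the extra device is built, published
 * with rcu_assign_pointer(), and the old one is freed only after
 * synchronize_srcu_expedited() guarantees that no kvm->srcu reader can
 * still see it.  kvm_io_bus_unregister_dev() below removes a device the
 * same way.
 */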
/* Caller must hold slots_lock. */
int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
			      struct kvm_io_device *dev)
{
	int i, r;
	struct kvm_io_bus *new_bus, *bus;

	bus = kvm->buses[bus_idx];
	r = -ENOENT;
	for (i = 0; i < bus->dev_count; i++)
		if (bus->range[i].dev == dev) {
			r = 0;
			break;
		}

	if (r)
		return r;

	new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) *
			  sizeof(struct kvm_io_range)), GFP_KERNEL);
	if (!new_bus)
		return -ENOMEM;

	memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
	new_bus->dev_count--;
	memcpy(new_bus->range + i, bus->range + i + 1,
	       (new_bus->dev_count - i) * sizeof(struct kvm_io_range));

	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
	synchronize_srcu_expedited(&kvm->srcu);
	kfree(bus);
	return r;
}

static struct notifier_block kvm_cpu_notifier = {
	.notifier_call = kvm_cpu_hotplug,
};

static int vm_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;

	*val = 0;
	raw_spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		*val += *(u32 *)((void *)kvm + offset);
	raw_spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");

static int vcpu_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	int i;

	*val = 0;
	raw_spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		kvm_for_each_vcpu(i, vcpu, kvm)
			*val += *(u32 *)((void *)vcpu + offset);

	raw_spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");

static const struct file_operations *stat_fops[] = {
	[KVM_STAT_VCPU] = &vcpu_stat_fops,
	[KVM_STAT_VM]   = &vm_stat_fops,
};

static int kvm_init_debug(void)
{
	int r = -EFAULT;
	struct kvm_stats_debugfs_item *p;

	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
	if (kvm_debugfs_dir == NULL)
		goto out;

	for (p = debugfs_entries; p->name; ++p) {
		p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
						(void *)(long)p->offset,
						stat_fops[p->kind]);
		if (p->dentry == NULL)
			goto out_dir;
	}

	return 0;

out_dir:
	debugfs_remove_recursive(kvm_debugfs_dir);
out:
	return r;
}

static void kvm_exit_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	for (p = debugfs_entries; p->name; ++p)
		debugfs_remove(p->dentry);
	debugfs_remove(kvm_debugfs_dir);
}

static int kvm_suspend(void)
{
	if (kvm_usage_count)
		hardware_disable_nolock(NULL);
	return 0;
}

static void kvm_resume(void)
{
	if (kvm_usage_count) {
		WARN_ON(raw_spin_is_locked(&kvm_lock));
		hardware_enable_nolock(NULL);
	}
}

static struct syscore_ops kvm_syscore_ops = {
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
	return container_of(pn, struct kvm_vcpu, preempt_notifier);
}
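/*
 * The preempt notifier registered while a vcpu is loaded routes scheduler
 * events to the two callbacks below, so a vcpu thread that is preempted has
 * its architecture state saved via kvm_arch_vcpu_put() and restored with
 * kvm_arch_vcpu_load() on whichever CPU it is rescheduled on, without the
 * ioctl paths having to care about context switches.
 */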
static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
			  struct task_struct *next)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_put(vcpu);
}

int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
	     struct module *module)
{
	int r;
	int cpu;

	r = kvm_arch_init(opaque);
	if (r)
		goto out_fail;

	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
		r = -ENOMEM;
		goto out_free_0;
	}

	r = kvm_arch_hardware_setup();
	if (r < 0)
		goto out_free_0a;

	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu,
				kvm_arch_check_processor_compat,
				&r, 1);
		if (r < 0)
			goto out_free_1;
	}

	r = register_cpu_notifier(&kvm_cpu_notifier);
	if (r)
		goto out_free_2;
	register_reboot_notifier(&kvm_reboot_notifier);

	/* A kmem cache lets us meet the alignment requirements of fx_save. */
	if (!vcpu_align)
		vcpu_align = __alignof__(struct kvm_vcpu);
	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align,
					   0, NULL);
	if (!kvm_vcpu_cache) {
		r = -ENOMEM;
		goto out_free_3;
	}

	r = kvm_async_pf_init();
	if (r)
		goto out_free;

	kvm_chardev_ops.owner = module;
	kvm_vm_fops.owner = module;
	kvm_vcpu_fops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
		printk(KERN_ERR "kvm: misc device register failed\n");
		goto out_unreg;
	}

	register_syscore_ops(&kvm_syscore_ops);

	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

	r = kvm_init_debug();
	if (r) {
		printk(KERN_ERR "kvm: create debugfs files failed\n");
		goto out_undebugfs;
	}

	return 0;

out_undebugfs:
	unregister_syscore_ops(&kvm_syscore_ops);
out_unreg:
	kvm_async_pf_deinit();
out_free:
	kmem_cache_destroy(kvm_vcpu_cache);
out_free_3:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_2:
out_free_1:
	kvm_arch_hardware_unsetup();
out_free_0a:
	free_cpumask_var(cpus_hardware_enabled);
out_free_0:
	kvm_arch_exit();
out_fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

void kvm_exit(void)
{
	kvm_exit_debug();
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	kvm_async_pf_deinit();
	unregister_syscore_ops(&kvm_syscore_ops);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
	on_each_cpu(hardware_disable_nolock, NULL, 1);
	kvm_arch_hardware_unsetup();
	kvm_arch_exit();
	free_cpumask_var(cpus_hardware_enabled);
}
EXPORT_SYMBOL_GPL(kvm_exit);
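/*
 * kvm_init()/kvm_exit() are not wired to module_init()/module_exit() here;
 * the architecture modules call them with their own vcpu size and alignment.
 * For example, on Intel the kvm-intel module does roughly (illustrative):
 *
 *	static int __init vmx_init(void)
 *	{
 *		return kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
 *				__alignof__(struct vcpu_vmx), THIS_MODULE);
 *	}
 *	module_init(vmx_init);
 */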