1 /* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * This module enables machines with Intel VT-x extensions to run virtual 5 * machines without emulation or binary translation. 6 * 7 * Copyright (C) 2006 Qumranet, Inc. 8 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 9 * 10 * Authors: 11 * Avi Kivity <avi@qumranet.com> 12 * Yaniv Kamay <yaniv@qumranet.com> 13 * 14 * This work is licensed under the terms of the GNU GPL, version 2. See 15 * the COPYING file in the top-level directory. 16 * 17 */ 18 19 #include <kvm/iodev.h> 20 21 #include <linux/kvm_host.h> 22 #include <linux/kvm.h> 23 #include <linux/module.h> 24 #include <linux/errno.h> 25 #include <linux/percpu.h> 26 #include <linux/mm.h> 27 #include <linux/miscdevice.h> 28 #include <linux/vmalloc.h> 29 #include <linux/reboot.h> 30 #include <linux/debugfs.h> 31 #include <linux/highmem.h> 32 #include <linux/file.h> 33 #include <linux/syscore_ops.h> 34 #include <linux/cpu.h> 35 #include <linux/sched/signal.h> 36 #include <linux/sched/mm.h> 37 #include <linux/sched/stat.h> 38 #include <linux/cpumask.h> 39 #include <linux/smp.h> 40 #include <linux/anon_inodes.h> 41 #include <linux/profile.h> 42 #include <linux/kvm_para.h> 43 #include <linux/pagemap.h> 44 #include <linux/mman.h> 45 #include <linux/swap.h> 46 #include <linux/bitops.h> 47 #include <linux/spinlock.h> 48 #include <linux/compat.h> 49 #include <linux/srcu.h> 50 #include <linux/hugetlb.h> 51 #include <linux/slab.h> 52 #include <linux/sort.h> 53 #include <linux/bsearch.h> 54 55 #include <asm/processor.h> 56 #include <asm/io.h> 57 #include <asm/ioctl.h> 58 #include <linux/uaccess.h> 59 #include <asm/pgtable.h> 60 61 #include "coalesced_mmio.h" 62 #include "async_pf.h" 63 #include "vfio.h" 64 65 #define CREATE_TRACE_POINTS 66 #include <trace/events/kvm.h> 67 68 /* Worst case buffer size needed for holding an integer. */ 69 #define ITOA_MAX_LEN 12 70 71 MODULE_AUTHOR("Qumranet"); 72 MODULE_LICENSE("GPL"); 73 74 /* Architectures should define their poll value according to the halt latency */ 75 unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT; 76 module_param(halt_poll_ns, uint, 0644); 77 EXPORT_SYMBOL_GPL(halt_poll_ns); 78 79 /* Default doubles per-vcpu halt_poll_ns. */ 80 unsigned int halt_poll_ns_grow = 2; 81 module_param(halt_poll_ns_grow, uint, 0644); 82 EXPORT_SYMBOL_GPL(halt_poll_ns_grow); 83 84 /* Default resets per-vcpu halt_poll_ns . 
*/ 85 unsigned int halt_poll_ns_shrink; 86 module_param(halt_poll_ns_shrink, uint, 0644); 87 EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); 88 89 /* 90 * Ordering of locks: 91 * 92 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock 93 */ 94 95 DEFINE_SPINLOCK(kvm_lock); 96 static DEFINE_RAW_SPINLOCK(kvm_count_lock); 97 LIST_HEAD(vm_list); 98 99 static cpumask_var_t cpus_hardware_enabled; 100 static int kvm_usage_count; 101 static atomic_t hardware_enable_failed; 102 103 struct kmem_cache *kvm_vcpu_cache; 104 EXPORT_SYMBOL_GPL(kvm_vcpu_cache); 105 106 static __read_mostly struct preempt_ops kvm_preempt_ops; 107 108 struct dentry *kvm_debugfs_dir; 109 EXPORT_SYMBOL_GPL(kvm_debugfs_dir); 110 111 static int kvm_debugfs_num_entries; 112 static const struct file_operations *stat_fops_per_vm[]; 113 114 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 115 unsigned long arg); 116 #ifdef CONFIG_KVM_COMPAT 117 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, 118 unsigned long arg); 119 #endif 120 static int hardware_enable_all(void); 121 static void hardware_disable_all(void); 122 123 static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 124 125 static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn); 126 127 __visible bool kvm_rebooting; 128 EXPORT_SYMBOL_GPL(kvm_rebooting); 129 130 static bool largepages_enabled = true; 131 132 #define KVM_EVENT_CREATE_VM 0 133 #define KVM_EVENT_DESTROY_VM 1 134 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); 135 static unsigned long long kvm_createvm_count; 136 static unsigned long long kvm_active_vms; 137 138 __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, 139 unsigned long start, unsigned long end) 140 { 141 } 142 143 bool kvm_is_reserved_pfn(kvm_pfn_t pfn) 144 { 145 if (pfn_valid(pfn)) 146 return PageReserved(pfn_to_page(pfn)); 147 148 return true; 149 } 150 151 /* 152 * Switches to specified vcpu, until a matching vcpu_put() 153 */ 154 int vcpu_load(struct kvm_vcpu *vcpu) 155 { 156 int cpu; 157 158 if (mutex_lock_killable(&vcpu->mutex)) 159 return -EINTR; 160 cpu = get_cpu(); 161 preempt_notifier_register(&vcpu->preempt_notifier); 162 kvm_arch_vcpu_load(vcpu, cpu); 163 put_cpu(); 164 return 0; 165 } 166 EXPORT_SYMBOL_GPL(vcpu_load); 167 168 void vcpu_put(struct kvm_vcpu *vcpu) 169 { 170 preempt_disable(); 171 kvm_arch_vcpu_put(vcpu); 172 preempt_notifier_unregister(&vcpu->preempt_notifier); 173 preempt_enable(); 174 mutex_unlock(&vcpu->mutex); 175 } 176 EXPORT_SYMBOL_GPL(vcpu_put); 177 178 /* TODO: merge with kvm_arch_vcpu_should_kick */ 179 static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req) 180 { 181 int mode = kvm_vcpu_exiting_guest_mode(vcpu); 182 183 /* 184 * We need to wait for the VCPU to reenable interrupts and get out of 185 * READING_SHADOW_PAGE_TABLES mode. 186 */ 187 if (req & KVM_REQUEST_WAIT) 188 return mode != OUTSIDE_GUEST_MODE; 189 190 /* 191 * Need to kick a running VCPU, but otherwise there is nothing to do. 
192 */ 193 return mode == IN_GUEST_MODE; 194 } 195 196 static void ack_flush(void *_completed) 197 { 198 } 199 200 static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait) 201 { 202 if (unlikely(!cpus)) 203 cpus = cpu_online_mask; 204 205 if (cpumask_empty(cpus)) 206 return false; 207 208 smp_call_function_many(cpus, ack_flush, NULL, wait); 209 return true; 210 } 211 212 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) 213 { 214 int i, cpu, me; 215 cpumask_var_t cpus; 216 bool called; 217 struct kvm_vcpu *vcpu; 218 219 zalloc_cpumask_var(&cpus, GFP_ATOMIC); 220 221 me = get_cpu(); 222 kvm_for_each_vcpu(i, vcpu, kvm) { 223 kvm_make_request(req, vcpu); 224 cpu = vcpu->cpu; 225 226 if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu)) 227 continue; 228 229 if (cpus != NULL && cpu != -1 && cpu != me && 230 kvm_request_needs_ipi(vcpu, req)) 231 __cpumask_set_cpu(cpu, cpus); 232 } 233 called = kvm_kick_many_cpus(cpus, !!(req & KVM_REQUEST_WAIT)); 234 put_cpu(); 235 free_cpumask_var(cpus); 236 return called; 237 } 238 239 #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL 240 void kvm_flush_remote_tlbs(struct kvm *kvm) 241 { 242 /* 243 * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in 244 * kvm_make_all_cpus_request. 245 */ 246 long dirty_count = smp_load_acquire(&kvm->tlbs_dirty); 247 248 /* 249 * We want to publish modifications to the page tables before reading 250 * mode. Pairs with a memory barrier in arch-specific code. 251 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest 252 * and smp_mb in walk_shadow_page_lockless_begin/end. 253 * - powerpc: smp_mb in kvmppc_prepare_to_enter. 254 * 255 * There is already an smp_mb__after_atomic() before 256 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that 257 * barrier here. 258 */ 259 if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 260 ++kvm->stat.remote_tlb_flush; 261 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); 262 } 263 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); 264 #endif 265 266 void kvm_reload_remote_mmus(struct kvm *kvm) 267 { 268 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 269 } 270 271 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 272 { 273 struct page *page; 274 int r; 275 276 mutex_init(&vcpu->mutex); 277 vcpu->cpu = -1; 278 vcpu->kvm = kvm; 279 vcpu->vcpu_id = id; 280 vcpu->pid = NULL; 281 init_swait_queue_head(&vcpu->wq); 282 kvm_async_pf_vcpu_init(vcpu); 283 284 vcpu->pre_pcpu = -1; 285 INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); 286 287 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 288 if (!page) { 289 r = -ENOMEM; 290 goto fail; 291 } 292 vcpu->run = page_address(page); 293 294 kvm_vcpu_set_in_spin_loop(vcpu, false); 295 kvm_vcpu_set_dy_eligible(vcpu, false); 296 vcpu->preempted = false; 297 298 r = kvm_arch_vcpu_init(vcpu); 299 if (r < 0) 300 goto fail_free_run; 301 return 0; 302 303 fail_free_run: 304 free_page((unsigned long)vcpu->run); 305 fail: 306 return r; 307 } 308 EXPORT_SYMBOL_GPL(kvm_vcpu_init); 309 310 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) 311 { 312 /* 313 * no need for rcu_read_lock as VCPU_RUN is the only place that 314 * will change the vcpu->pid pointer and on uninit all file 315 * descriptors are already gone. 
316 */ 317 put_pid(rcu_dereference_protected(vcpu->pid, 1)); 318 kvm_arch_vcpu_uninit(vcpu); 319 free_page((unsigned long)vcpu->run); 320 } 321 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); 322 323 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 324 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) 325 { 326 return container_of(mn, struct kvm, mmu_notifier); 327 } 328 329 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, 330 struct mm_struct *mm, 331 unsigned long address, 332 pte_t pte) 333 { 334 struct kvm *kvm = mmu_notifier_to_kvm(mn); 335 int idx; 336 337 idx = srcu_read_lock(&kvm->srcu); 338 spin_lock(&kvm->mmu_lock); 339 kvm->mmu_notifier_seq++; 340 kvm_set_spte_hva(kvm, address, pte); 341 spin_unlock(&kvm->mmu_lock); 342 srcu_read_unlock(&kvm->srcu, idx); 343 } 344 345 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, 346 struct mm_struct *mm, 347 unsigned long start, 348 unsigned long end) 349 { 350 struct kvm *kvm = mmu_notifier_to_kvm(mn); 351 int need_tlb_flush = 0, idx; 352 353 idx = srcu_read_lock(&kvm->srcu); 354 spin_lock(&kvm->mmu_lock); 355 /* 356 * The count increase must become visible at unlock time as no 357 * spte can be established without taking the mmu_lock and 358 * count is also read inside the mmu_lock critical section. 359 */ 360 kvm->mmu_notifier_count++; 361 need_tlb_flush = kvm_unmap_hva_range(kvm, start, end); 362 need_tlb_flush |= kvm->tlbs_dirty; 363 /* we've to flush the tlb before the pages can be freed */ 364 if (need_tlb_flush) 365 kvm_flush_remote_tlbs(kvm); 366 367 spin_unlock(&kvm->mmu_lock); 368 369 kvm_arch_mmu_notifier_invalidate_range(kvm, start, end); 370 371 srcu_read_unlock(&kvm->srcu, idx); 372 } 373 374 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 375 struct mm_struct *mm, 376 unsigned long start, 377 unsigned long end) 378 { 379 struct kvm *kvm = mmu_notifier_to_kvm(mn); 380 381 spin_lock(&kvm->mmu_lock); 382 /* 383 * This sequence increase will notify the kvm page fault that 384 * the page that is going to be mapped in the spte could have 385 * been freed. 386 */ 387 kvm->mmu_notifier_seq++; 388 smp_wmb(); 389 /* 390 * The above sequence increase must be visible before the 391 * below count decrease, which is ensured by the smp_wmb above 392 * in conjunction with the smp_rmb in mmu_notifier_retry(). 393 */ 394 kvm->mmu_notifier_count--; 395 spin_unlock(&kvm->mmu_lock); 396 397 BUG_ON(kvm->mmu_notifier_count < 0); 398 } 399 400 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, 401 struct mm_struct *mm, 402 unsigned long start, 403 unsigned long end) 404 { 405 struct kvm *kvm = mmu_notifier_to_kvm(mn); 406 int young, idx; 407 408 idx = srcu_read_lock(&kvm->srcu); 409 spin_lock(&kvm->mmu_lock); 410 411 young = kvm_age_hva(kvm, start, end); 412 if (young) 413 kvm_flush_remote_tlbs(kvm); 414 415 spin_unlock(&kvm->mmu_lock); 416 srcu_read_unlock(&kvm->srcu, idx); 417 418 return young; 419 } 420 421 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, 422 struct mm_struct *mm, 423 unsigned long start, 424 unsigned long end) 425 { 426 struct kvm *kvm = mmu_notifier_to_kvm(mn); 427 int young, idx; 428 429 idx = srcu_read_lock(&kvm->srcu); 430 spin_lock(&kvm->mmu_lock); 431 /* 432 * Even though we do not flush TLB, this will still adversely 433 * affect performance on pre-Haswell Intel EPT, where there is 434 * no EPT Access Bit to clear so that we have to tear down EPT 435 * tables instead. 
If we find this unacceptable, we can always 436 * add a parameter to kvm_age_hva so that it effectively doesn't 437 * do anything on clear_young. 438 * 439 * Also note that currently we never issue secondary TLB flushes 440 * from clear_young, leaving this job up to the regular system 441 * cadence. If we find this inaccurate, we might come up with a 442 * more sophisticated heuristic later. 443 */ 444 young = kvm_age_hva(kvm, start, end); 445 spin_unlock(&kvm->mmu_lock); 446 srcu_read_unlock(&kvm->srcu, idx); 447 448 return young; 449 } 450 451 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, 452 struct mm_struct *mm, 453 unsigned long address) 454 { 455 struct kvm *kvm = mmu_notifier_to_kvm(mn); 456 int young, idx; 457 458 idx = srcu_read_lock(&kvm->srcu); 459 spin_lock(&kvm->mmu_lock); 460 young = kvm_test_age_hva(kvm, address); 461 spin_unlock(&kvm->mmu_lock); 462 srcu_read_unlock(&kvm->srcu, idx); 463 464 return young; 465 } 466 467 static void kvm_mmu_notifier_release(struct mmu_notifier *mn, 468 struct mm_struct *mm) 469 { 470 struct kvm *kvm = mmu_notifier_to_kvm(mn); 471 int idx; 472 473 idx = srcu_read_lock(&kvm->srcu); 474 kvm_arch_flush_shadow_all(kvm); 475 srcu_read_unlock(&kvm->srcu, idx); 476 } 477 478 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { 479 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 480 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 481 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 482 .clear_young = kvm_mmu_notifier_clear_young, 483 .test_young = kvm_mmu_notifier_test_young, 484 .change_pte = kvm_mmu_notifier_change_pte, 485 .release = kvm_mmu_notifier_release, 486 }; 487 488 static int kvm_init_mmu_notifier(struct kvm *kvm) 489 { 490 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; 491 return mmu_notifier_register(&kvm->mmu_notifier, current->mm); 492 } 493 494 #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ 495 496 static int kvm_init_mmu_notifier(struct kvm *kvm) 497 { 498 return 0; 499 } 500 501 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ 502 503 static struct kvm_memslots *kvm_alloc_memslots(void) 504 { 505 int i; 506 struct kvm_memslots *slots; 507 508 slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 509 if (!slots) 510 return NULL; 511 512 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) 513 slots->id_to_index[i] = slots->memslots[i].id = i; 514 515 return slots; 516 } 517 518 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) 519 { 520 if (!memslot->dirty_bitmap) 521 return; 522 523 kvfree(memslot->dirty_bitmap); 524 memslot->dirty_bitmap = NULL; 525 } 526 527 /* 528 * Free any memory in @free but not in @dont. 
529 */ 530 static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, 531 struct kvm_memory_slot *dont) 532 { 533 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 534 kvm_destroy_dirty_bitmap(free); 535 536 kvm_arch_free_memslot(kvm, free, dont); 537 538 free->npages = 0; 539 } 540 541 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots) 542 { 543 struct kvm_memory_slot *memslot; 544 545 if (!slots) 546 return; 547 548 kvm_for_each_memslot(memslot, slots) 549 kvm_free_memslot(kvm, memslot, NULL); 550 551 kvfree(slots); 552 } 553 554 static void kvm_destroy_vm_debugfs(struct kvm *kvm) 555 { 556 int i; 557 558 if (!kvm->debugfs_dentry) 559 return; 560 561 debugfs_remove_recursive(kvm->debugfs_dentry); 562 563 if (kvm->debugfs_stat_data) { 564 for (i = 0; i < kvm_debugfs_num_entries; i++) 565 kfree(kvm->debugfs_stat_data[i]); 566 kfree(kvm->debugfs_stat_data); 567 } 568 } 569 570 static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) 571 { 572 char dir_name[ITOA_MAX_LEN * 2]; 573 struct kvm_stat_data *stat_data; 574 struct kvm_stats_debugfs_item *p; 575 576 if (!debugfs_initialized()) 577 return 0; 578 579 snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd); 580 kvm->debugfs_dentry = debugfs_create_dir(dir_name, 581 kvm_debugfs_dir); 582 if (!kvm->debugfs_dentry) 583 return -ENOMEM; 584 585 kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries, 586 sizeof(*kvm->debugfs_stat_data), 587 GFP_KERNEL); 588 if (!kvm->debugfs_stat_data) 589 return -ENOMEM; 590 591 for (p = debugfs_entries; p->name; p++) { 592 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL); 593 if (!stat_data) 594 return -ENOMEM; 595 596 stat_data->kvm = kvm; 597 stat_data->offset = p->offset; 598 kvm->debugfs_stat_data[p - debugfs_entries] = stat_data; 599 if (!debugfs_create_file(p->name, 0644, 600 kvm->debugfs_dentry, 601 stat_data, 602 stat_fops_per_vm[p->kind])) 603 return -ENOMEM; 604 } 605 return 0; 606 } 607 608 static struct kvm *kvm_create_vm(unsigned long type) 609 { 610 int r, i; 611 struct kvm *kvm = kvm_arch_alloc_vm(); 612 613 if (!kvm) 614 return ERR_PTR(-ENOMEM); 615 616 spin_lock_init(&kvm->mmu_lock); 617 mmgrab(current->mm); 618 kvm->mm = current->mm; 619 kvm_eventfd_init(kvm); 620 mutex_init(&kvm->lock); 621 mutex_init(&kvm->irq_lock); 622 mutex_init(&kvm->slots_lock); 623 refcount_set(&kvm->users_count, 1); 624 INIT_LIST_HEAD(&kvm->devices); 625 626 r = kvm_arch_init_vm(kvm, type); 627 if (r) 628 goto out_err_no_disable; 629 630 r = hardware_enable_all(); 631 if (r) 632 goto out_err_no_disable; 633 634 #ifdef CONFIG_HAVE_KVM_IRQFD 635 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); 636 #endif 637 638 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); 639 640 r = -ENOMEM; 641 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 642 struct kvm_memslots *slots = kvm_alloc_memslots(); 643 if (!slots) 644 goto out_err_no_srcu; 645 /* 646 * Generations must be different for each address space. 647 * Init kvm generation close to the maximum to easily test the 648 * code of handling generation number wrap-around. 
649 */ 650 slots->generation = i * 2 - 150; 651 rcu_assign_pointer(kvm->memslots[i], slots); 652 } 653 654 if (init_srcu_struct(&kvm->srcu)) 655 goto out_err_no_srcu; 656 if (init_srcu_struct(&kvm->irq_srcu)) 657 goto out_err_no_irq_srcu; 658 for (i = 0; i < KVM_NR_BUSES; i++) { 659 rcu_assign_pointer(kvm->buses[i], 660 kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL)); 661 if (!kvm->buses[i]) 662 goto out_err; 663 } 664 665 r = kvm_init_mmu_notifier(kvm); 666 if (r) 667 goto out_err; 668 669 spin_lock(&kvm_lock); 670 list_add(&kvm->vm_list, &vm_list); 671 spin_unlock(&kvm_lock); 672 673 preempt_notifier_inc(); 674 675 return kvm; 676 677 out_err: 678 cleanup_srcu_struct(&kvm->irq_srcu); 679 out_err_no_irq_srcu: 680 cleanup_srcu_struct(&kvm->srcu); 681 out_err_no_srcu: 682 hardware_disable_all(); 683 out_err_no_disable: 684 refcount_set(&kvm->users_count, 0); 685 for (i = 0; i < KVM_NR_BUSES; i++) 686 kfree(kvm_get_bus(kvm, i)); 687 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 688 kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); 689 kvm_arch_free_vm(kvm); 690 mmdrop(current->mm); 691 return ERR_PTR(r); 692 } 693 694 static void kvm_destroy_devices(struct kvm *kvm) 695 { 696 struct kvm_device *dev, *tmp; 697 698 /* 699 * We do not need to take the kvm->lock here, because nobody else 700 * has a reference to the struct kvm at this point and therefore 701 * cannot access the devices list anyhow. 702 */ 703 list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) { 704 list_del(&dev->vm_node); 705 dev->ops->destroy(dev); 706 } 707 } 708 709 static void kvm_destroy_vm(struct kvm *kvm) 710 { 711 int i; 712 struct mm_struct *mm = kvm->mm; 713 714 kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); 715 kvm_destroy_vm_debugfs(kvm); 716 kvm_arch_sync_events(kvm); 717 spin_lock(&kvm_lock); 718 list_del(&kvm->vm_list); 719 spin_unlock(&kvm_lock); 720 kvm_free_irq_routing(kvm); 721 for (i = 0; i < KVM_NR_BUSES; i++) { 722 struct kvm_io_bus *bus = kvm_get_bus(kvm, i); 723 724 if (bus) 725 kvm_io_bus_destroy(bus); 726 kvm->buses[i] = NULL; 727 } 728 kvm_coalesced_mmio_free(kvm); 729 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 730 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 731 #else 732 kvm_arch_flush_shadow_all(kvm); 733 #endif 734 kvm_arch_destroy_vm(kvm); 735 kvm_destroy_devices(kvm); 736 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 737 kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); 738 cleanup_srcu_struct(&kvm->irq_srcu); 739 cleanup_srcu_struct(&kvm->srcu); 740 kvm_arch_free_vm(kvm); 741 preempt_notifier_dec(); 742 hardware_disable_all(); 743 mmdrop(mm); 744 } 745 746 void kvm_get_kvm(struct kvm *kvm) 747 { 748 refcount_inc(&kvm->users_count); 749 } 750 EXPORT_SYMBOL_GPL(kvm_get_kvm); 751 752 void kvm_put_kvm(struct kvm *kvm) 753 { 754 if (refcount_dec_and_test(&kvm->users_count)) 755 kvm_destroy_vm(kvm); 756 } 757 EXPORT_SYMBOL_GPL(kvm_put_kvm); 758 759 760 static int kvm_vm_release(struct inode *inode, struct file *filp) 761 { 762 struct kvm *kvm = filp->private_data; 763 764 kvm_irqfd_release(kvm); 765 766 kvm_put_kvm(kvm); 767 return 0; 768 } 769 770 /* 771 * Allocation size is twice as large as the actual dirty bitmap size. 772 * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed. 
773 */ 774 static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) 775 { 776 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); 777 778 memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL); 779 if (!memslot->dirty_bitmap) 780 return -ENOMEM; 781 782 return 0; 783 } 784 785 /* 786 * Insert memslot and re-sort memslots based on their GFN, 787 * so binary search could be used to lookup GFN. 788 * Sorting algorithm takes advantage of having initially 789 * sorted array and known changed memslot position. 790 */ 791 static void update_memslots(struct kvm_memslots *slots, 792 struct kvm_memory_slot *new) 793 { 794 int id = new->id; 795 int i = slots->id_to_index[id]; 796 struct kvm_memory_slot *mslots = slots->memslots; 797 798 WARN_ON(mslots[i].id != id); 799 if (!new->npages) { 800 WARN_ON(!mslots[i].npages); 801 if (mslots[i].npages) 802 slots->used_slots--; 803 } else { 804 if (!mslots[i].npages) 805 slots->used_slots++; 806 } 807 808 while (i < KVM_MEM_SLOTS_NUM - 1 && 809 new->base_gfn <= mslots[i + 1].base_gfn) { 810 if (!mslots[i + 1].npages) 811 break; 812 mslots[i] = mslots[i + 1]; 813 slots->id_to_index[mslots[i].id] = i; 814 i++; 815 } 816 817 /* 818 * The ">=" is needed when creating a slot with base_gfn == 0, 819 * so that it moves before all those with base_gfn == npages == 0. 820 * 821 * On the other hand, if new->npages is zero, the above loop has 822 * already left i pointing to the beginning of the empty part of 823 * mslots, and the ">=" would move the hole backwards in this 824 * case---which is wrong. So skip the loop when deleting a slot. 825 */ 826 if (new->npages) { 827 while (i > 0 && 828 new->base_gfn >= mslots[i - 1].base_gfn) { 829 mslots[i] = mslots[i - 1]; 830 slots->id_to_index[mslots[i].id] = i; 831 i--; 832 } 833 } else 834 WARN_ON_ONCE(i != slots->used_slots); 835 836 mslots[i] = *new; 837 slots->id_to_index[mslots[i].id] = i; 838 } 839 840 static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem) 841 { 842 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; 843 844 #ifdef __KVM_HAVE_READONLY_MEM 845 valid_flags |= KVM_MEM_READONLY; 846 #endif 847 848 if (mem->flags & ~valid_flags) 849 return -EINVAL; 850 851 return 0; 852 } 853 854 static struct kvm_memslots *install_new_memslots(struct kvm *kvm, 855 int as_id, struct kvm_memslots *slots) 856 { 857 struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id); 858 859 /* 860 * Set the low bit in the generation, which disables SPTE caching 861 * until the end of synchronize_srcu_expedited. 862 */ 863 WARN_ON(old_memslots->generation & 1); 864 slots->generation = old_memslots->generation + 1; 865 866 rcu_assign_pointer(kvm->memslots[as_id], slots); 867 synchronize_srcu_expedited(&kvm->srcu); 868 869 /* 870 * Increment the new memslot generation a second time. This prevents 871 * vm exits that race with memslot updates from caching a memslot 872 * generation that will (potentially) be valid forever. 873 * 874 * Generations must be unique even across address spaces. We do not need 875 * a global counter for that, instead the generation space is evenly split 876 * across address spaces. For example, with two address spaces, address 877 * space 0 will use generations 0, 4, 8, ... while * address space 1 will 878 * use generations 2, 6, 10, 14, ... 
879 */ 880 slots->generation += KVM_ADDRESS_SPACE_NUM * 2 - 1; 881 882 kvm_arch_memslots_updated(kvm, slots); 883 884 return old_memslots; 885 } 886 887 /* 888 * Allocate some memory and give it an address in the guest physical address 889 * space. 890 * 891 * Discontiguous memory is allowed, mostly for framebuffers. 892 * 893 * Must be called holding kvm->slots_lock for write. 894 */ 895 int __kvm_set_memory_region(struct kvm *kvm, 896 const struct kvm_userspace_memory_region *mem) 897 { 898 int r; 899 gfn_t base_gfn; 900 unsigned long npages; 901 struct kvm_memory_slot *slot; 902 struct kvm_memory_slot old, new; 903 struct kvm_memslots *slots = NULL, *old_memslots; 904 int as_id, id; 905 enum kvm_mr_change change; 906 907 r = check_memory_region_flags(mem); 908 if (r) 909 goto out; 910 911 r = -EINVAL; 912 as_id = mem->slot >> 16; 913 id = (u16)mem->slot; 914 915 /* General sanity checks */ 916 if (mem->memory_size & (PAGE_SIZE - 1)) 917 goto out; 918 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 919 goto out; 920 /* We can read the guest memory with __xxx_user() later on. */ 921 if ((id < KVM_USER_MEM_SLOTS) && 922 ((mem->userspace_addr & (PAGE_SIZE - 1)) || 923 !access_ok(VERIFY_WRITE, 924 (void __user *)(unsigned long)mem->userspace_addr, 925 mem->memory_size))) 926 goto out; 927 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM) 928 goto out; 929 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 930 goto out; 931 932 slot = id_to_memslot(__kvm_memslots(kvm, as_id), id); 933 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 934 npages = mem->memory_size >> PAGE_SHIFT; 935 936 if (npages > KVM_MEM_MAX_NR_PAGES) 937 goto out; 938 939 new = old = *slot; 940 941 new.id = id; 942 new.base_gfn = base_gfn; 943 new.npages = npages; 944 new.flags = mem->flags; 945 946 if (npages) { 947 if (!old.npages) 948 change = KVM_MR_CREATE; 949 else { /* Modify an existing slot. */ 950 if ((mem->userspace_addr != old.userspace_addr) || 951 (npages != old.npages) || 952 ((new.flags ^ old.flags) & KVM_MEM_READONLY)) 953 goto out; 954 955 if (base_gfn != old.base_gfn) 956 change = KVM_MR_MOVE; 957 else if (new.flags != old.flags) 958 change = KVM_MR_FLAGS_ONLY; 959 else { /* Nothing to change. 
*/ 960 r = 0; 961 goto out; 962 } 963 } 964 } else { 965 if (!old.npages) 966 goto out; 967 968 change = KVM_MR_DELETE; 969 new.base_gfn = 0; 970 new.flags = 0; 971 } 972 973 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { 974 /* Check for overlaps */ 975 r = -EEXIST; 976 kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) { 977 if ((slot->id >= KVM_USER_MEM_SLOTS) || 978 (slot->id == id)) 979 continue; 980 if (!((base_gfn + npages <= slot->base_gfn) || 981 (base_gfn >= slot->base_gfn + slot->npages))) 982 goto out; 983 } 984 } 985 986 /* Free page dirty bitmap if unneeded */ 987 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 988 new.dirty_bitmap = NULL; 989 990 r = -ENOMEM; 991 if (change == KVM_MR_CREATE) { 992 new.userspace_addr = mem->userspace_addr; 993 994 if (kvm_arch_create_memslot(kvm, &new, npages)) 995 goto out_free; 996 } 997 998 /* Allocate page dirty bitmap if needed */ 999 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 1000 if (kvm_create_dirty_bitmap(&new) < 0) 1001 goto out_free; 1002 } 1003 1004 slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 1005 if (!slots) 1006 goto out_free; 1007 memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots)); 1008 1009 if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) { 1010 slot = id_to_memslot(slots, id); 1011 slot->flags |= KVM_MEMSLOT_INVALID; 1012 1013 old_memslots = install_new_memslots(kvm, as_id, slots); 1014 1015 /* From this point no new shadow pages pointing to a deleted, 1016 * or moved, memslot will be created. 1017 * 1018 * validation of sp->gfn happens in: 1019 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) 1020 * - kvm_is_visible_gfn (mmu_check_roots) 1021 */ 1022 kvm_arch_flush_shadow_memslot(kvm, slot); 1023 1024 /* 1025 * We can re-use the old_memslots from above, the only difference 1026 * from the currently installed memslots is the invalid flag. This 1027 * will get overwritten by update_memslots anyway. 
1028 */ 1029 slots = old_memslots; 1030 } 1031 1032 r = kvm_arch_prepare_memory_region(kvm, &new, mem, change); 1033 if (r) 1034 goto out_slots; 1035 1036 /* actual memory is freed via old in kvm_free_memslot below */ 1037 if (change == KVM_MR_DELETE) { 1038 new.dirty_bitmap = NULL; 1039 memset(&new.arch, 0, sizeof(new.arch)); 1040 } 1041 1042 update_memslots(slots, &new); 1043 old_memslots = install_new_memslots(kvm, as_id, slots); 1044 1045 kvm_arch_commit_memory_region(kvm, mem, &old, &new, change); 1046 1047 kvm_free_memslot(kvm, &old, &new); 1048 kvfree(old_memslots); 1049 return 0; 1050 1051 out_slots: 1052 kvfree(slots); 1053 out_free: 1054 kvm_free_memslot(kvm, &new, &old); 1055 out: 1056 return r; 1057 } 1058 EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 1059 1060 int kvm_set_memory_region(struct kvm *kvm, 1061 const struct kvm_userspace_memory_region *mem) 1062 { 1063 int r; 1064 1065 mutex_lock(&kvm->slots_lock); 1066 r = __kvm_set_memory_region(kvm, mem); 1067 mutex_unlock(&kvm->slots_lock); 1068 return r; 1069 } 1070 EXPORT_SYMBOL_GPL(kvm_set_memory_region); 1071 1072 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 1073 struct kvm_userspace_memory_region *mem) 1074 { 1075 if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) 1076 return -EINVAL; 1077 1078 return kvm_set_memory_region(kvm, mem); 1079 } 1080 1081 int kvm_get_dirty_log(struct kvm *kvm, 1082 struct kvm_dirty_log *log, int *is_dirty) 1083 { 1084 struct kvm_memslots *slots; 1085 struct kvm_memory_slot *memslot; 1086 int i, as_id, id; 1087 unsigned long n; 1088 unsigned long any = 0; 1089 1090 as_id = log->slot >> 16; 1091 id = (u16)log->slot; 1092 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1093 return -EINVAL; 1094 1095 slots = __kvm_memslots(kvm, as_id); 1096 memslot = id_to_memslot(slots, id); 1097 if (!memslot->dirty_bitmap) 1098 return -ENOENT; 1099 1100 n = kvm_dirty_bitmap_bytes(memslot); 1101 1102 for (i = 0; !any && i < n/sizeof(long); ++i) 1103 any = memslot->dirty_bitmap[i]; 1104 1105 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 1106 return -EFAULT; 1107 1108 if (any) 1109 *is_dirty = 1; 1110 return 0; 1111 } 1112 EXPORT_SYMBOL_GPL(kvm_get_dirty_log); 1113 1114 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 1115 /** 1116 * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages 1117 * are dirty write protect them for next write. 1118 * @kvm: pointer to kvm instance 1119 * @log: slot id and address to which we copy the log 1120 * @is_dirty: flag set if any page is dirty 1121 * 1122 * We need to keep it in mind that VCPU threads can write to the bitmap 1123 * concurrently. So, to avoid losing track of dirty pages we keep the 1124 * following order: 1125 * 1126 * 1. Take a snapshot of the bit and clear it if needed. 1127 * 2. Write protect the corresponding page. 1128 * 3. Copy the snapshot to the userspace. 1129 * 4. Upon return caller flushes TLB's if needed. 1130 * 1131 * Between 2 and 4, the guest may write to the page using the remaining TLB 1132 * entry. This is not a problem because the page is reported dirty using 1133 * the snapshot taken before and step 4 ensures that writes done after 1134 * exiting to userspace will be logged for the next call. 
1135 * 1136 */ 1137 int kvm_get_dirty_log_protect(struct kvm *kvm, 1138 struct kvm_dirty_log *log, bool *is_dirty) 1139 { 1140 struct kvm_memslots *slots; 1141 struct kvm_memory_slot *memslot; 1142 int i, as_id, id; 1143 unsigned long n; 1144 unsigned long *dirty_bitmap; 1145 unsigned long *dirty_bitmap_buffer; 1146 1147 as_id = log->slot >> 16; 1148 id = (u16)log->slot; 1149 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1150 return -EINVAL; 1151 1152 slots = __kvm_memslots(kvm, as_id); 1153 memslot = id_to_memslot(slots, id); 1154 1155 dirty_bitmap = memslot->dirty_bitmap; 1156 if (!dirty_bitmap) 1157 return -ENOENT; 1158 1159 n = kvm_dirty_bitmap_bytes(memslot); 1160 1161 dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long); 1162 memset(dirty_bitmap_buffer, 0, n); 1163 1164 spin_lock(&kvm->mmu_lock); 1165 *is_dirty = false; 1166 for (i = 0; i < n / sizeof(long); i++) { 1167 unsigned long mask; 1168 gfn_t offset; 1169 1170 if (!dirty_bitmap[i]) 1171 continue; 1172 1173 *is_dirty = true; 1174 1175 mask = xchg(&dirty_bitmap[i], 0); 1176 dirty_bitmap_buffer[i] = mask; 1177 1178 if (mask) { 1179 offset = i * BITS_PER_LONG; 1180 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, 1181 offset, mask); 1182 } 1183 } 1184 1185 spin_unlock(&kvm->mmu_lock); 1186 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) 1187 return -EFAULT; 1188 return 0; 1189 } 1190 EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); 1191 #endif 1192 1193 bool kvm_largepages_enabled(void) 1194 { 1195 return largepages_enabled; 1196 } 1197 1198 void kvm_disable_largepages(void) 1199 { 1200 largepages_enabled = false; 1201 } 1202 EXPORT_SYMBOL_GPL(kvm_disable_largepages); 1203 1204 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 1205 { 1206 return __gfn_to_memslot(kvm_memslots(kvm), gfn); 1207 } 1208 EXPORT_SYMBOL_GPL(gfn_to_memslot); 1209 1210 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn) 1211 { 1212 return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn); 1213 } 1214 1215 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) 1216 { 1217 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn); 1218 1219 if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS || 1220 memslot->flags & KVM_MEMSLOT_INVALID) 1221 return false; 1222 1223 return true; 1224 } 1225 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); 1226 1227 unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn) 1228 { 1229 struct vm_area_struct *vma; 1230 unsigned long addr, size; 1231 1232 size = PAGE_SIZE; 1233 1234 addr = gfn_to_hva(kvm, gfn); 1235 if (kvm_is_error_hva(addr)) 1236 return PAGE_SIZE; 1237 1238 down_read(¤t->mm->mmap_sem); 1239 vma = find_vma(current->mm, addr); 1240 if (!vma) 1241 goto out; 1242 1243 size = vma_kernel_pagesize(vma); 1244 1245 out: 1246 up_read(¤t->mm->mmap_sem); 1247 1248 return size; 1249 } 1250 1251 static bool memslot_is_readonly(struct kvm_memory_slot *slot) 1252 { 1253 return slot->flags & KVM_MEM_READONLY; 1254 } 1255 1256 static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, 1257 gfn_t *nr_pages, bool write) 1258 { 1259 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 1260 return KVM_HVA_ERR_BAD; 1261 1262 if (memslot_is_readonly(slot) && write) 1263 return KVM_HVA_ERR_RO_BAD; 1264 1265 if (nr_pages) 1266 *nr_pages = slot->npages - (gfn - slot->base_gfn); 1267 1268 return __gfn_to_hva_memslot(slot, gfn); 1269 } 1270 1271 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, 1272 gfn_t *nr_pages) 1273 { 1274 
return __gfn_to_hva_many(slot, gfn, nr_pages, true); 1275 } 1276 1277 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, 1278 gfn_t gfn) 1279 { 1280 return gfn_to_hva_many(slot, gfn, NULL); 1281 } 1282 EXPORT_SYMBOL_GPL(gfn_to_hva_memslot); 1283 1284 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 1285 { 1286 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); 1287 } 1288 EXPORT_SYMBOL_GPL(gfn_to_hva); 1289 1290 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn) 1291 { 1292 return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL); 1293 } 1294 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva); 1295 1296 /* 1297 * If writable is set to false, the hva returned by this function is only 1298 * allowed to be read. 1299 */ 1300 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, 1301 gfn_t gfn, bool *writable) 1302 { 1303 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false); 1304 1305 if (!kvm_is_error_hva(hva) && writable) 1306 *writable = !memslot_is_readonly(slot); 1307 1308 return hva; 1309 } 1310 1311 unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) 1312 { 1313 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 1314 1315 return gfn_to_hva_memslot_prot(slot, gfn, writable); 1316 } 1317 1318 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable) 1319 { 1320 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1321 1322 return gfn_to_hva_memslot_prot(slot, gfn, writable); 1323 } 1324 1325 static int get_user_page_nowait(unsigned long start, int write, 1326 struct page **page) 1327 { 1328 int flags = FOLL_NOWAIT | FOLL_HWPOISON; 1329 1330 if (write) 1331 flags |= FOLL_WRITE; 1332 1333 return get_user_pages(start, 1, flags, page, NULL); 1334 } 1335 1336 static inline int check_user_page_hwpoison(unsigned long addr) 1337 { 1338 int rc, flags = FOLL_HWPOISON | FOLL_WRITE; 1339 1340 rc = get_user_pages(addr, 1, flags, NULL, NULL); 1341 return rc == -EHWPOISON; 1342 } 1343 1344 /* 1345 * The atomic path to get the writable pfn which will be stored in @pfn, 1346 * true indicates success, otherwise false is returned. 1347 */ 1348 static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async, 1349 bool write_fault, bool *writable, kvm_pfn_t *pfn) 1350 { 1351 struct page *page[1]; 1352 int npages; 1353 1354 if (!(async || atomic)) 1355 return false; 1356 1357 /* 1358 * Fast pin a writable pfn only if it is a write fault request 1359 * or the caller allows to map a writable pfn for a read fault 1360 * request. 1361 */ 1362 if (!(write_fault || writable)) 1363 return false; 1364 1365 npages = __get_user_pages_fast(addr, 1, 1, page); 1366 if (npages == 1) { 1367 *pfn = page_to_pfn(page[0]); 1368 1369 if (writable) 1370 *writable = true; 1371 return true; 1372 } 1373 1374 return false; 1375 } 1376 1377 /* 1378 * The slow path to get the pfn of the specified host virtual address, 1379 * 1 indicates success, -errno is returned if error is detected. 
1380 */ 1381 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, 1382 bool *writable, kvm_pfn_t *pfn) 1383 { 1384 struct page *page[1]; 1385 int npages = 0; 1386 1387 might_sleep(); 1388 1389 if (writable) 1390 *writable = write_fault; 1391 1392 if (async) { 1393 down_read(¤t->mm->mmap_sem); 1394 npages = get_user_page_nowait(addr, write_fault, page); 1395 up_read(¤t->mm->mmap_sem); 1396 } else { 1397 unsigned int flags = FOLL_HWPOISON; 1398 1399 if (write_fault) 1400 flags |= FOLL_WRITE; 1401 1402 npages = get_user_pages_unlocked(addr, 1, page, flags); 1403 } 1404 if (npages != 1) 1405 return npages; 1406 1407 /* map read fault as writable if possible */ 1408 if (unlikely(!write_fault) && writable) { 1409 struct page *wpage[1]; 1410 1411 npages = __get_user_pages_fast(addr, 1, 1, wpage); 1412 if (npages == 1) { 1413 *writable = true; 1414 put_page(page[0]); 1415 page[0] = wpage[0]; 1416 } 1417 1418 npages = 1; 1419 } 1420 *pfn = page_to_pfn(page[0]); 1421 return npages; 1422 } 1423 1424 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) 1425 { 1426 if (unlikely(!(vma->vm_flags & VM_READ))) 1427 return false; 1428 1429 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE)))) 1430 return false; 1431 1432 return true; 1433 } 1434 1435 static int hva_to_pfn_remapped(struct vm_area_struct *vma, 1436 unsigned long addr, bool *async, 1437 bool write_fault, kvm_pfn_t *p_pfn) 1438 { 1439 unsigned long pfn; 1440 int r; 1441 1442 r = follow_pfn(vma, addr, &pfn); 1443 if (r) { 1444 /* 1445 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does 1446 * not call the fault handler, so do it here. 1447 */ 1448 bool unlocked = false; 1449 r = fixup_user_fault(current, current->mm, addr, 1450 (write_fault ? FAULT_FLAG_WRITE : 0), 1451 &unlocked); 1452 if (unlocked) 1453 return -EAGAIN; 1454 if (r) 1455 return r; 1456 1457 r = follow_pfn(vma, addr, &pfn); 1458 if (r) 1459 return r; 1460 1461 } 1462 1463 1464 /* 1465 * Get a reference here because callers of *hva_to_pfn* and 1466 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the 1467 * returned pfn. This is only needed if the VMA has VM_MIXEDMAP 1468 * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will 1469 * simply do nothing for reserved pfns. 1470 * 1471 * Whoever called remap_pfn_range is also going to call e.g. 1472 * unmap_mapping_range before the underlying pages are freed, 1473 * causing a call to our MMU notifier. 1474 */ 1475 kvm_get_pfn(pfn); 1476 1477 *p_pfn = pfn; 1478 return 0; 1479 } 1480 1481 /* 1482 * Pin guest page in memory and return its pfn. 1483 * @addr: host virtual address which maps memory to the guest 1484 * @atomic: whether this function can sleep 1485 * @async: whether this function need to wait IO complete if the 1486 * host page is not in the memory 1487 * @write_fault: whether we should get a writable host page 1488 * @writable: whether it allows to map a writable host page for !@write_fault 1489 * 1490 * The function will map a writable host page for these two cases: 1491 * 1): @write_fault = true 1492 * 2): @write_fault = false && @writable, @writable will tell the caller 1493 * whether the mapping is writable. 
1494 */ 1495 static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, 1496 bool write_fault, bool *writable) 1497 { 1498 struct vm_area_struct *vma; 1499 kvm_pfn_t pfn = 0; 1500 int npages, r; 1501 1502 /* we can do it either atomically or asynchronously, not both */ 1503 BUG_ON(atomic && async); 1504 1505 if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn)) 1506 return pfn; 1507 1508 if (atomic) 1509 return KVM_PFN_ERR_FAULT; 1510 1511 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn); 1512 if (npages == 1) 1513 return pfn; 1514 1515 down_read(¤t->mm->mmap_sem); 1516 if (npages == -EHWPOISON || 1517 (!async && check_user_page_hwpoison(addr))) { 1518 pfn = KVM_PFN_ERR_HWPOISON; 1519 goto exit; 1520 } 1521 1522 retry: 1523 vma = find_vma_intersection(current->mm, addr, addr + 1); 1524 1525 if (vma == NULL) 1526 pfn = KVM_PFN_ERR_FAULT; 1527 else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { 1528 r = hva_to_pfn_remapped(vma, addr, async, write_fault, &pfn); 1529 if (r == -EAGAIN) 1530 goto retry; 1531 if (r < 0) 1532 pfn = KVM_PFN_ERR_FAULT; 1533 } else { 1534 if (async && vma_is_valid(vma, write_fault)) 1535 *async = true; 1536 pfn = KVM_PFN_ERR_FAULT; 1537 } 1538 exit: 1539 up_read(¤t->mm->mmap_sem); 1540 return pfn; 1541 } 1542 1543 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, 1544 bool atomic, bool *async, bool write_fault, 1545 bool *writable) 1546 { 1547 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); 1548 1549 if (addr == KVM_HVA_ERR_RO_BAD) { 1550 if (writable) 1551 *writable = false; 1552 return KVM_PFN_ERR_RO_FAULT; 1553 } 1554 1555 if (kvm_is_error_hva(addr)) { 1556 if (writable) 1557 *writable = false; 1558 return KVM_PFN_NOSLOT; 1559 } 1560 1561 /* Do not map writable pfn in the readonly memslot. 
*/ 1562 if (writable && memslot_is_readonly(slot)) { 1563 *writable = false; 1564 writable = NULL; 1565 } 1566 1567 return hva_to_pfn(addr, atomic, async, write_fault, 1568 writable); 1569 } 1570 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); 1571 1572 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, 1573 bool *writable) 1574 { 1575 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, 1576 write_fault, writable); 1577 } 1578 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 1579 1580 kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) 1581 { 1582 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); 1583 } 1584 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); 1585 1586 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) 1587 { 1588 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); 1589 } 1590 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); 1591 1592 kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) 1593 { 1594 return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn); 1595 } 1596 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); 1597 1598 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn) 1599 { 1600 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 1601 } 1602 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic); 1603 1604 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 1605 { 1606 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); 1607 } 1608 EXPORT_SYMBOL_GPL(gfn_to_pfn); 1609 1610 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) 1611 { 1612 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 1613 } 1614 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn); 1615 1616 int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 1617 struct page **pages, int nr_pages) 1618 { 1619 unsigned long addr; 1620 gfn_t entry = 0; 1621 1622 addr = gfn_to_hva_many(slot, gfn, &entry); 1623 if (kvm_is_error_hva(addr)) 1624 return -1; 1625 1626 if (entry < nr_pages) 1627 return 0; 1628 1629 return __get_user_pages_fast(addr, nr_pages, 1, pages); 1630 } 1631 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 1632 1633 static struct page *kvm_pfn_to_page(kvm_pfn_t pfn) 1634 { 1635 if (is_error_noslot_pfn(pfn)) 1636 return KVM_ERR_PTR_BAD_PAGE; 1637 1638 if (kvm_is_reserved_pfn(pfn)) { 1639 WARN_ON(1); 1640 return KVM_ERR_PTR_BAD_PAGE; 1641 } 1642 1643 return pfn_to_page(pfn); 1644 } 1645 1646 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1647 { 1648 kvm_pfn_t pfn; 1649 1650 pfn = gfn_to_pfn(kvm, gfn); 1651 1652 return kvm_pfn_to_page(pfn); 1653 } 1654 EXPORT_SYMBOL_GPL(gfn_to_page); 1655 1656 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn) 1657 { 1658 kvm_pfn_t pfn; 1659 1660 pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn); 1661 1662 return kvm_pfn_to_page(pfn); 1663 } 1664 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page); 1665 1666 void kvm_release_page_clean(struct page *page) 1667 { 1668 WARN_ON(is_error_page(page)); 1669 1670 kvm_release_pfn_clean(page_to_pfn(page)); 1671 } 1672 EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1673 1674 void kvm_release_pfn_clean(kvm_pfn_t pfn) 1675 { 1676 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) 1677 put_page(pfn_to_page(pfn)); 1678 } 1679 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 1680 1681 void kvm_release_page_dirty(struct page *page) 1682 { 1683 WARN_ON(is_error_page(page)); 1684 1685 kvm_release_pfn_dirty(page_to_pfn(page)); 1686 } 1687 EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1688 1689 void 
kvm_release_pfn_dirty(kvm_pfn_t pfn) 1690 { 1691 kvm_set_pfn_dirty(pfn); 1692 kvm_release_pfn_clean(pfn); 1693 } 1694 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); 1695 1696 void kvm_set_pfn_dirty(kvm_pfn_t pfn) 1697 { 1698 if (!kvm_is_reserved_pfn(pfn)) { 1699 struct page *page = pfn_to_page(pfn); 1700 1701 if (!PageReserved(page)) 1702 SetPageDirty(page); 1703 } 1704 } 1705 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 1706 1707 void kvm_set_pfn_accessed(kvm_pfn_t pfn) 1708 { 1709 if (!kvm_is_reserved_pfn(pfn)) 1710 mark_page_accessed(pfn_to_page(pfn)); 1711 } 1712 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 1713 1714 void kvm_get_pfn(kvm_pfn_t pfn) 1715 { 1716 if (!kvm_is_reserved_pfn(pfn)) 1717 get_page(pfn_to_page(pfn)); 1718 } 1719 EXPORT_SYMBOL_GPL(kvm_get_pfn); 1720 1721 static int next_segment(unsigned long len, int offset) 1722 { 1723 if (len > PAGE_SIZE - offset) 1724 return PAGE_SIZE - offset; 1725 else 1726 return len; 1727 } 1728 1729 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, 1730 void *data, int offset, int len) 1731 { 1732 int r; 1733 unsigned long addr; 1734 1735 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 1736 if (kvm_is_error_hva(addr)) 1737 return -EFAULT; 1738 r = __copy_from_user(data, (void __user *)addr + offset, len); 1739 if (r) 1740 return -EFAULT; 1741 return 0; 1742 } 1743 1744 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 1745 int len) 1746 { 1747 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 1748 1749 return __kvm_read_guest_page(slot, gfn, data, offset, len); 1750 } 1751 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 1752 1753 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, 1754 int offset, int len) 1755 { 1756 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1757 1758 return __kvm_read_guest_page(slot, gfn, data, offset, len); 1759 } 1760 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page); 1761 1762 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) 1763 { 1764 gfn_t gfn = gpa >> PAGE_SHIFT; 1765 int seg; 1766 int offset = offset_in_page(gpa); 1767 int ret; 1768 1769 while ((seg = next_segment(len, offset)) != 0) { 1770 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 1771 if (ret < 0) 1772 return ret; 1773 offset = 0; 1774 len -= seg; 1775 data += seg; 1776 ++gfn; 1777 } 1778 return 0; 1779 } 1780 EXPORT_SYMBOL_GPL(kvm_read_guest); 1781 1782 int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) 1783 { 1784 gfn_t gfn = gpa >> PAGE_SHIFT; 1785 int seg; 1786 int offset = offset_in_page(gpa); 1787 int ret; 1788 1789 while ((seg = next_segment(len, offset)) != 0) { 1790 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg); 1791 if (ret < 0) 1792 return ret; 1793 offset = 0; 1794 len -= seg; 1795 data += seg; 1796 ++gfn; 1797 } 1798 return 0; 1799 } 1800 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest); 1801 1802 static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 1803 void *data, int offset, unsigned long len) 1804 { 1805 int r; 1806 unsigned long addr; 1807 1808 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 1809 if (kvm_is_error_hva(addr)) 1810 return -EFAULT; 1811 pagefault_disable(); 1812 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 1813 pagefault_enable(); 1814 if (r) 1815 return -EFAULT; 1816 return 0; 1817 } 1818 1819 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 1820 unsigned long len) 1821 { 1822 gfn_t gfn = gpa >> 
PAGE_SHIFT; 1823 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 1824 int offset = offset_in_page(gpa); 1825 1826 return __kvm_read_guest_atomic(slot, gfn, data, offset, len); 1827 } 1828 EXPORT_SYMBOL_GPL(kvm_read_guest_atomic); 1829 1830 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, 1831 void *data, unsigned long len) 1832 { 1833 gfn_t gfn = gpa >> PAGE_SHIFT; 1834 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1835 int offset = offset_in_page(gpa); 1836 1837 return __kvm_read_guest_atomic(slot, gfn, data, offset, len); 1838 } 1839 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); 1840 1841 static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn, 1842 const void *data, int offset, int len) 1843 { 1844 int r; 1845 unsigned long addr; 1846 1847 addr = gfn_to_hva_memslot(memslot, gfn); 1848 if (kvm_is_error_hva(addr)) 1849 return -EFAULT; 1850 r = __copy_to_user((void __user *)addr + offset, data, len); 1851 if (r) 1852 return -EFAULT; 1853 mark_page_dirty_in_slot(memslot, gfn); 1854 return 0; 1855 } 1856 1857 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, 1858 const void *data, int offset, int len) 1859 { 1860 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 1861 1862 return __kvm_write_guest_page(slot, gfn, data, offset, len); 1863 } 1864 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 1865 1866 int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, 1867 const void *data, int offset, int len) 1868 { 1869 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1870 1871 return __kvm_write_guest_page(slot, gfn, data, offset, len); 1872 } 1873 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page); 1874 1875 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 1876 unsigned long len) 1877 { 1878 gfn_t gfn = gpa >> PAGE_SHIFT; 1879 int seg; 1880 int offset = offset_in_page(gpa); 1881 int ret; 1882 1883 while ((seg = next_segment(len, offset)) != 0) { 1884 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 1885 if (ret < 0) 1886 return ret; 1887 offset = 0; 1888 len -= seg; 1889 data += seg; 1890 ++gfn; 1891 } 1892 return 0; 1893 } 1894 EXPORT_SYMBOL_GPL(kvm_write_guest); 1895 1896 int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, 1897 unsigned long len) 1898 { 1899 gfn_t gfn = gpa >> PAGE_SHIFT; 1900 int seg; 1901 int offset = offset_in_page(gpa); 1902 int ret; 1903 1904 while ((seg = next_segment(len, offset)) != 0) { 1905 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg); 1906 if (ret < 0) 1907 return ret; 1908 offset = 0; 1909 len -= seg; 1910 data += seg; 1911 ++gfn; 1912 } 1913 return 0; 1914 } 1915 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); 1916 1917 static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, 1918 struct gfn_to_hva_cache *ghc, 1919 gpa_t gpa, unsigned long len) 1920 { 1921 int offset = offset_in_page(gpa); 1922 gfn_t start_gfn = gpa >> PAGE_SHIFT; 1923 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; 1924 gfn_t nr_pages_needed = end_gfn - start_gfn + 1; 1925 gfn_t nr_pages_avail; 1926 1927 ghc->gpa = gpa; 1928 ghc->generation = slots->generation; 1929 ghc->len = len; 1930 ghc->memslot = __gfn_to_memslot(slots, start_gfn); 1931 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL); 1932 if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) { 1933 ghc->hva += offset; 1934 } else { 1935 /* 1936 * If the requested region crosses two memslots, we still 1937 * verify that the entire region is valid here. 
1938 */ 1939 while (start_gfn <= end_gfn) { 1940 nr_pages_avail = 0; 1941 ghc->memslot = __gfn_to_memslot(slots, start_gfn); 1942 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, 1943 &nr_pages_avail); 1944 if (kvm_is_error_hva(ghc->hva)) 1945 return -EFAULT; 1946 start_gfn += nr_pages_avail; 1947 } 1948 /* Use the slow path for cross page reads and writes. */ 1949 ghc->memslot = NULL; 1950 } 1951 return 0; 1952 } 1953 1954 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1955 gpa_t gpa, unsigned long len) 1956 { 1957 struct kvm_memslots *slots = kvm_memslots(kvm); 1958 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); 1959 } 1960 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); 1961 1962 int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1963 void *data, int offset, unsigned long len) 1964 { 1965 struct kvm_memslots *slots = kvm_memslots(kvm); 1966 int r; 1967 gpa_t gpa = ghc->gpa + offset; 1968 1969 BUG_ON(len + offset > ghc->len); 1970 1971 if (slots->generation != ghc->generation) 1972 __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); 1973 1974 if (unlikely(!ghc->memslot)) 1975 return kvm_write_guest(kvm, gpa, data, len); 1976 1977 if (kvm_is_error_hva(ghc->hva)) 1978 return -EFAULT; 1979 1980 r = __copy_to_user((void __user *)ghc->hva + offset, data, len); 1981 if (r) 1982 return -EFAULT; 1983 mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT); 1984 1985 return 0; 1986 } 1987 EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); 1988 1989 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1990 void *data, unsigned long len) 1991 { 1992 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); 1993 } 1994 EXPORT_SYMBOL_GPL(kvm_write_guest_cached); 1995 1996 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1997 void *data, unsigned long len) 1998 { 1999 struct kvm_memslots *slots = kvm_memslots(kvm); 2000 int r; 2001 2002 BUG_ON(len > ghc->len); 2003 2004 if (slots->generation != ghc->generation) 2005 __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); 2006 2007 if (unlikely(!ghc->memslot)) 2008 return kvm_read_guest(kvm, ghc->gpa, data, len); 2009 2010 if (kvm_is_error_hva(ghc->hva)) 2011 return -EFAULT; 2012 2013 r = __copy_from_user(data, (void __user *)ghc->hva, len); 2014 if (r) 2015 return -EFAULT; 2016 2017 return 0; 2018 } 2019 EXPORT_SYMBOL_GPL(kvm_read_guest_cached); 2020 2021 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 2022 { 2023 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 2024 2025 return kvm_write_guest_page(kvm, gfn, zero_page, offset, len); 2026 } 2027 EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 2028 2029 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 2030 { 2031 gfn_t gfn = gpa >> PAGE_SHIFT; 2032 int seg; 2033 int offset = offset_in_page(gpa); 2034 int ret; 2035 2036 while ((seg = next_segment(len, offset)) != 0) { 2037 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 2038 if (ret < 0) 2039 return ret; 2040 offset = 0; 2041 len -= seg; 2042 ++gfn; 2043 } 2044 return 0; 2045 } 2046 EXPORT_SYMBOL_GPL(kvm_clear_guest); 2047 2048 static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, 2049 gfn_t gfn) 2050 { 2051 if (memslot && memslot->dirty_bitmap) { 2052 unsigned long rel_gfn = gfn - memslot->base_gfn; 2053 2054 set_bit_le(rel_gfn, memslot->dirty_bitmap); 2055 } 2056 } 2057 2058 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 2059 { 2060 struct 
kvm_memory_slot *memslot; 2061 2062 memslot = gfn_to_memslot(kvm, gfn); 2063 mark_page_dirty_in_slot(memslot, gfn); 2064 } 2065 EXPORT_SYMBOL_GPL(mark_page_dirty); 2066 2067 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) 2068 { 2069 struct kvm_memory_slot *memslot; 2070 2071 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2072 mark_page_dirty_in_slot(memslot, gfn); 2073 } 2074 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); 2075 2076 void kvm_sigset_activate(struct kvm_vcpu *vcpu) 2077 { 2078 if (!vcpu->sigset_active) 2079 return; 2080 2081 /* 2082 * This does a lockless modification of ->real_blocked, which is fine 2083 * because only current can change ->real_blocked, and all readers of 2084 * ->real_blocked don't care as long as ->real_blocked is always a subset 2085 * of ->blocked. 2086 */ 2087 sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked); 2088 } 2089 2090 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu) 2091 { 2092 if (!vcpu->sigset_active) 2093 return; 2094 2095 sigprocmask(SIG_SETMASK, &current->real_blocked, NULL); 2096 sigemptyset(&current->real_blocked); 2097 } 2098 2099 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) 2100 { 2101 unsigned int old, val, grow; 2102 2103 old = val = vcpu->halt_poll_ns; 2104 grow = READ_ONCE(halt_poll_ns_grow); 2105 /* 10us base */ 2106 if (val == 0 && grow) 2107 val = 10000; 2108 else 2109 val *= grow; 2110 2111 if (val > halt_poll_ns) 2112 val = halt_poll_ns; 2113 2114 vcpu->halt_poll_ns = val; 2115 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old); 2116 } 2117 2118 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu) 2119 { 2120 unsigned int old, val, shrink; 2121 2122 old = val = vcpu->halt_poll_ns; 2123 shrink = READ_ONCE(halt_poll_ns_shrink); 2124 if (shrink == 0) 2125 val = 0; 2126 else 2127 val /= shrink; 2128 2129 vcpu->halt_poll_ns = val; 2130 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old); 2131 } 2132 2133 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) 2134 { 2135 if (kvm_arch_vcpu_runnable(vcpu)) { 2136 kvm_make_request(KVM_REQ_UNHALT, vcpu); 2137 return -EINTR; 2138 } 2139 if (kvm_cpu_has_pending_timer(vcpu)) 2140 return -EINTR; 2141 if (signal_pending(current)) 2142 return -EINTR; 2143 2144 return 0; 2145 } 2146 2147 /* 2148 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 2149 */ 2150 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 2151 { 2152 ktime_t start, cur; 2153 DECLARE_SWAITQUEUE(wait); 2154 bool waited = false; 2155 u64 block_ns; 2156 2157 start = cur = ktime_get(); 2158 if (vcpu->halt_poll_ns) { 2159 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns); 2160 2161 ++vcpu->stat.halt_attempted_poll; 2162 do { 2163 /* 2164 * This sets KVM_REQ_UNHALT if an interrupt 2165 * arrives.
2166 */ 2167 if (kvm_vcpu_check_block(vcpu) < 0) { 2168 ++vcpu->stat.halt_successful_poll; 2169 if (!vcpu_valid_wakeup(vcpu)) 2170 ++vcpu->stat.halt_poll_invalid; 2171 goto out; 2172 } 2173 cur = ktime_get(); 2174 } while (single_task_running() && ktime_before(cur, stop)); 2175 } 2176 2177 kvm_arch_vcpu_blocking(vcpu); 2178 2179 for (;;) { 2180 prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 2181 2182 if (kvm_vcpu_check_block(vcpu) < 0) 2183 break; 2184 2185 waited = true; 2186 schedule(); 2187 } 2188 2189 finish_swait(&vcpu->wq, &wait); 2190 cur = ktime_get(); 2191 2192 kvm_arch_vcpu_unblocking(vcpu); 2193 out: 2194 block_ns = ktime_to_ns(cur) - ktime_to_ns(start); 2195 2196 if (!vcpu_valid_wakeup(vcpu)) 2197 shrink_halt_poll_ns(vcpu); 2198 else if (halt_poll_ns) { 2199 if (block_ns <= vcpu->halt_poll_ns) 2200 ; 2201 /* we had a long block, shrink polling */ 2202 else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns) 2203 shrink_halt_poll_ns(vcpu); 2204 /* we had a short halt and our poll time is too small */ 2205 else if (vcpu->halt_poll_ns < halt_poll_ns && 2206 block_ns < halt_poll_ns) 2207 grow_halt_poll_ns(vcpu); 2208 } else 2209 vcpu->halt_poll_ns = 0; 2210 2211 trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu)); 2212 kvm_arch_vcpu_block_finish(vcpu); 2213 } 2214 EXPORT_SYMBOL_GPL(kvm_vcpu_block); 2215 2216 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) 2217 { 2218 struct swait_queue_head *wqp; 2219 2220 wqp = kvm_arch_vcpu_wq(vcpu); 2221 if (swq_has_sleeper(wqp)) { 2222 swake_up(wqp); 2223 ++vcpu->stat.halt_wakeup; 2224 return true; 2225 } 2226 2227 return false; 2228 } 2229 EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); 2230 2231 #ifndef CONFIG_S390 2232 /* 2233 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. 2234 */ 2235 void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 2236 { 2237 int me; 2238 int cpu = vcpu->cpu; 2239 2240 if (kvm_vcpu_wake_up(vcpu)) 2241 return; 2242 2243 me = get_cpu(); 2244 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 2245 if (kvm_arch_vcpu_should_kick(vcpu)) 2246 smp_send_reschedule(cpu); 2247 put_cpu(); 2248 } 2249 EXPORT_SYMBOL_GPL(kvm_vcpu_kick); 2250 #endif /* !CONFIG_S390 */ 2251 2252 int kvm_vcpu_yield_to(struct kvm_vcpu *target) 2253 { 2254 struct pid *pid; 2255 struct task_struct *task = NULL; 2256 int ret = 0; 2257 2258 rcu_read_lock(); 2259 pid = rcu_dereference(target->pid); 2260 if (pid) 2261 task = get_pid_task(pid, PIDTYPE_PID); 2262 rcu_read_unlock(); 2263 if (!task) 2264 return ret; 2265 ret = yield_to(task, 1); 2266 put_task_struct(task); 2267 2268 return ret; 2269 } 2270 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); 2271 2272 /* 2273 * Helper that checks whether a VCPU is eligible for directed yield. 2274 * The most eligible candidate to yield to is decided by the following heuristics: 2275 * 2276 * (a) A VCPU which has not done a pl-exit or had cpu relax intercepted recently 2277 * (a preempted lock holder), indicated by @in_spin_loop. 2278 * Set at the beginning and cleared at the end of the interception/PLE handler. 2279 * 2280 * (b) A VCPU which has done a pl-exit/had cpu relax intercepted but did not get a 2281 * chance last time (it has most likely become eligible now, since we probably 2282 * yielded to the lock holder in the last iteration. This is done by toggling 2283 * @dy_eligible each time a VCPU is checked for eligibility.) 2284 * 2285 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding 2286 * to a preempted lock-holder could result in wrong VCPU selection and CPU 2287 * burning.
Giving priority to a potential lock-holder increases lock 2288 * progress. 2289 * 2290 * Since the algorithm is based on heuristics, accessing another VCPU's data without 2291 * locking does no harm. It may result in trying to yield to the same VCPU, failing, 2292 * and continuing with the next VCPU, and so on. 2293 */ 2294 static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) 2295 { 2296 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT 2297 bool eligible; 2298 2299 eligible = !vcpu->spin_loop.in_spin_loop || 2300 vcpu->spin_loop.dy_eligible; 2301 2302 if (vcpu->spin_loop.in_spin_loop) 2303 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); 2304 2305 return eligible; 2306 #else 2307 return true; 2308 #endif 2309 } 2310 2311 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) 2312 { 2313 struct kvm *kvm = me->kvm; 2314 struct kvm_vcpu *vcpu; 2315 int last_boosted_vcpu = me->kvm->last_boosted_vcpu; 2316 int yielded = 0; 2317 int try = 3; 2318 int pass; 2319 int i; 2320 2321 kvm_vcpu_set_in_spin_loop(me, true); 2322 /* 2323 * We boost the priority of a VCPU that is runnable but not 2324 * currently running, because it got preempted by something 2325 * else and called schedule in __vcpu_run. Hopefully that 2326 * VCPU is holding the lock that we need and will release it. 2327 * We approximate round-robin by starting at the last boosted VCPU. 2328 */ 2329 for (pass = 0; pass < 2 && !yielded && try; pass++) { 2330 kvm_for_each_vcpu(i, vcpu, kvm) { 2331 if (!pass && i <= last_boosted_vcpu) { 2332 i = last_boosted_vcpu; 2333 continue; 2334 } else if (pass && i > last_boosted_vcpu) 2335 break; 2336 if (!READ_ONCE(vcpu->preempted)) 2337 continue; 2338 if (vcpu == me) 2339 continue; 2340 if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) 2341 continue; 2342 if (yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(vcpu)) 2343 continue; 2344 if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) 2345 continue; 2346 2347 yielded = kvm_vcpu_yield_to(vcpu); 2348 if (yielded > 0) { 2349 kvm->last_boosted_vcpu = i; 2350 break; 2351 } else if (yielded < 0) { 2352 try--; 2353 if (!try) 2354 break; 2355 } 2356 } 2357 } 2358 kvm_vcpu_set_in_spin_loop(me, false); 2359 2360 /* Ensure vcpu is not eligible during next spinloop */ 2361 kvm_vcpu_set_dy_eligible(me, false); 2362 } 2363 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); 2364 2365 static int kvm_vcpu_fault(struct vm_fault *vmf) 2366 { 2367 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data; 2368 struct page *page; 2369 2370 if (vmf->pgoff == 0) 2371 page = virt_to_page(vcpu->run); 2372 #ifdef CONFIG_X86 2373 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 2374 page = virt_to_page(vcpu->arch.pio_data); 2375 #endif 2376 #ifdef CONFIG_KVM_MMIO 2377 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 2378 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 2379 #endif 2380 else 2381 return kvm_arch_vcpu_fault(vcpu, vmf); 2382 get_page(page); 2383 vmf->page = page; 2384 return 0; 2385 } 2386 2387 static const struct vm_operations_struct kvm_vcpu_vm_ops = { 2388 .fault = kvm_vcpu_fault, 2389 }; 2390 2391 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 2392 { 2393 vma->vm_ops = &kvm_vcpu_vm_ops; 2394 return 0; 2395 } 2396 2397 static int kvm_vcpu_release(struct inode *inode, struct file *filp) 2398 { 2399 struct kvm_vcpu *vcpu = filp->private_data; 2400 2401 debugfs_remove_recursive(vcpu->debugfs_dentry); 2402 kvm_put_kvm(vcpu->kvm); 2403 return 0; 2404 } 2405 2406 static struct file_operations kvm_vcpu_fops = { 2407
.release = kvm_vcpu_release, 2408 .unlocked_ioctl = kvm_vcpu_ioctl, 2409 #ifdef CONFIG_KVM_COMPAT 2410 .compat_ioctl = kvm_vcpu_compat_ioctl, 2411 #endif 2412 .mmap = kvm_vcpu_mmap, 2413 .llseek = noop_llseek, 2414 }; 2415 2416 /* 2417 * Allocates an inode for the vcpu. 2418 */ 2419 static int create_vcpu_fd(struct kvm_vcpu *vcpu) 2420 { 2421 return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); 2422 } 2423 2424 static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) 2425 { 2426 char dir_name[ITOA_MAX_LEN * 2]; 2427 int ret; 2428 2429 if (!kvm_arch_has_vcpu_debugfs()) 2430 return 0; 2431 2432 if (!debugfs_initialized()) 2433 return 0; 2434 2435 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id); 2436 vcpu->debugfs_dentry = debugfs_create_dir(dir_name, 2437 vcpu->kvm->debugfs_dentry); 2438 if (!vcpu->debugfs_dentry) 2439 return -ENOMEM; 2440 2441 ret = kvm_arch_create_vcpu_debugfs(vcpu); 2442 if (ret < 0) { 2443 debugfs_remove_recursive(vcpu->debugfs_dentry); 2444 return ret; 2445 } 2446 2447 return 0; 2448 } 2449 2450 /* 2451 * Creates some virtual cpus. Good luck creating more than one. 2452 */ 2453 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) 2454 { 2455 int r; 2456 struct kvm_vcpu *vcpu; 2457 2458 if (id >= KVM_MAX_VCPU_ID) 2459 return -EINVAL; 2460 2461 mutex_lock(&kvm->lock); 2462 if (kvm->created_vcpus == KVM_MAX_VCPUS) { 2463 mutex_unlock(&kvm->lock); 2464 return -EINVAL; 2465 } 2466 2467 kvm->created_vcpus++; 2468 mutex_unlock(&kvm->lock); 2469 2470 vcpu = kvm_arch_vcpu_create(kvm, id); 2471 if (IS_ERR(vcpu)) { 2472 r = PTR_ERR(vcpu); 2473 goto vcpu_decrement; 2474 } 2475 2476 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 2477 2478 r = kvm_arch_vcpu_setup(vcpu); 2479 if (r) 2480 goto vcpu_destroy; 2481 2482 r = kvm_create_vcpu_debugfs(vcpu); 2483 if (r) 2484 goto vcpu_destroy; 2485 2486 mutex_lock(&kvm->lock); 2487 if (kvm_get_vcpu_by_id(kvm, id)) { 2488 r = -EEXIST; 2489 goto unlock_vcpu_destroy; 2490 } 2491 2492 BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); 2493 2494 /* Now it's all set up, let userspace reach it */ 2495 kvm_get_kvm(kvm); 2496 r = create_vcpu_fd(vcpu); 2497 if (r < 0) { 2498 kvm_put_kvm(kvm); 2499 goto unlock_vcpu_destroy; 2500 } 2501 2502 kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; 2503 2504 /* 2505 * Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus 2506 * before kvm->online_vcpu's incremented value. 
2507 */ 2508 smp_wmb(); 2509 atomic_inc(&kvm->online_vcpus); 2510 2511 mutex_unlock(&kvm->lock); 2512 kvm_arch_vcpu_postcreate(vcpu); 2513 return r; 2514 2515 unlock_vcpu_destroy: 2516 mutex_unlock(&kvm->lock); 2517 debugfs_remove_recursive(vcpu->debugfs_dentry); 2518 vcpu_destroy: 2519 kvm_arch_vcpu_destroy(vcpu); 2520 vcpu_decrement: 2521 mutex_lock(&kvm->lock); 2522 kvm->created_vcpus--; 2523 mutex_unlock(&kvm->lock); 2524 return r; 2525 } 2526 2527 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 2528 { 2529 if (sigset) { 2530 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 2531 vcpu->sigset_active = 1; 2532 vcpu->sigset = *sigset; 2533 } else 2534 vcpu->sigset_active = 0; 2535 return 0; 2536 } 2537 2538 static long kvm_vcpu_ioctl(struct file *filp, 2539 unsigned int ioctl, unsigned long arg) 2540 { 2541 struct kvm_vcpu *vcpu = filp->private_data; 2542 void __user *argp = (void __user *)arg; 2543 int r; 2544 struct kvm_fpu *fpu = NULL; 2545 struct kvm_sregs *kvm_sregs = NULL; 2546 2547 if (vcpu->kvm->mm != current->mm) 2548 return -EIO; 2549 2550 if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) 2551 return -EINVAL; 2552 2553 #if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS) 2554 /* 2555 * Special cases: vcpu ioctls that are asynchronous to vcpu execution, 2556 * so vcpu_load() would break it. 2557 */ 2558 if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_S390_IRQ || ioctl == KVM_INTERRUPT) 2559 return kvm_arch_vcpu_ioctl(filp, ioctl, arg); 2560 #endif 2561 2562 2563 r = vcpu_load(vcpu); 2564 if (r) 2565 return r; 2566 switch (ioctl) { 2567 case KVM_RUN: { 2568 struct pid *oldpid; 2569 r = -EINVAL; 2570 if (arg) 2571 goto out; 2572 oldpid = rcu_access_pointer(vcpu->pid); 2573 if (unlikely(oldpid != current->pids[PIDTYPE_PID].pid)) { 2574 /* The thread running this VCPU changed. 
*/ 2575 struct pid *newpid = get_task_pid(current, PIDTYPE_PID); 2576 2577 rcu_assign_pointer(vcpu->pid, newpid); 2578 if (oldpid) 2579 synchronize_rcu(); 2580 put_pid(oldpid); 2581 } 2582 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 2583 trace_kvm_userspace_exit(vcpu->run->exit_reason, r); 2584 break; 2585 } 2586 case KVM_GET_REGS: { 2587 struct kvm_regs *kvm_regs; 2588 2589 r = -ENOMEM; 2590 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 2591 if (!kvm_regs) 2592 goto out; 2593 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 2594 if (r) 2595 goto out_free1; 2596 r = -EFAULT; 2597 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 2598 goto out_free1; 2599 r = 0; 2600 out_free1: 2601 kfree(kvm_regs); 2602 break; 2603 } 2604 case KVM_SET_REGS: { 2605 struct kvm_regs *kvm_regs; 2606 2607 r = -ENOMEM; 2608 kvm_regs = memdup_user(argp, sizeof(*kvm_regs)); 2609 if (IS_ERR(kvm_regs)) { 2610 r = PTR_ERR(kvm_regs); 2611 goto out; 2612 } 2613 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 2614 kfree(kvm_regs); 2615 break; 2616 } 2617 case KVM_GET_SREGS: { 2618 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 2619 r = -ENOMEM; 2620 if (!kvm_sregs) 2621 goto out; 2622 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 2623 if (r) 2624 goto out; 2625 r = -EFAULT; 2626 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 2627 goto out; 2628 r = 0; 2629 break; 2630 } 2631 case KVM_SET_SREGS: { 2632 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); 2633 if (IS_ERR(kvm_sregs)) { 2634 r = PTR_ERR(kvm_sregs); 2635 kvm_sregs = NULL; 2636 goto out; 2637 } 2638 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 2639 break; 2640 } 2641 case KVM_GET_MP_STATE: { 2642 struct kvm_mp_state mp_state; 2643 2644 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 2645 if (r) 2646 goto out; 2647 r = -EFAULT; 2648 if (copy_to_user(argp, &mp_state, sizeof(mp_state))) 2649 goto out; 2650 r = 0; 2651 break; 2652 } 2653 case KVM_SET_MP_STATE: { 2654 struct kvm_mp_state mp_state; 2655 2656 r = -EFAULT; 2657 if (copy_from_user(&mp_state, argp, sizeof(mp_state))) 2658 goto out; 2659 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 2660 break; 2661 } 2662 case KVM_TRANSLATE: { 2663 struct kvm_translation tr; 2664 2665 r = -EFAULT; 2666 if (copy_from_user(&tr, argp, sizeof(tr))) 2667 goto out; 2668 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 2669 if (r) 2670 goto out; 2671 r = -EFAULT; 2672 if (copy_to_user(argp, &tr, sizeof(tr))) 2673 goto out; 2674 r = 0; 2675 break; 2676 } 2677 case KVM_SET_GUEST_DEBUG: { 2678 struct kvm_guest_debug dbg; 2679 2680 r = -EFAULT; 2681 if (copy_from_user(&dbg, argp, sizeof(dbg))) 2682 goto out; 2683 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 2684 break; 2685 } 2686 case KVM_SET_SIGNAL_MASK: { 2687 struct kvm_signal_mask __user *sigmask_arg = argp; 2688 struct kvm_signal_mask kvm_sigmask; 2689 sigset_t sigset, *p; 2690 2691 p = NULL; 2692 if (argp) { 2693 r = -EFAULT; 2694 if (copy_from_user(&kvm_sigmask, argp, 2695 sizeof(kvm_sigmask))) 2696 goto out; 2697 r = -EINVAL; 2698 if (kvm_sigmask.len != sizeof(sigset)) 2699 goto out; 2700 r = -EFAULT; 2701 if (copy_from_user(&sigset, sigmask_arg->sigset, 2702 sizeof(sigset))) 2703 goto out; 2704 p = &sigset; 2705 } 2706 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); 2707 break; 2708 } 2709 case KVM_GET_FPU: { 2710 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 2711 r = -ENOMEM; 2712 if (!fpu) 2713 goto out; 2714 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); 2715 if (r) 2716 goto out; 2717 r = 
-EFAULT; 2718 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) 2719 goto out; 2720 r = 0; 2721 break; 2722 } 2723 case KVM_SET_FPU: { 2724 fpu = memdup_user(argp, sizeof(*fpu)); 2725 if (IS_ERR(fpu)) { 2726 r = PTR_ERR(fpu); 2727 fpu = NULL; 2728 goto out; 2729 } 2730 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 2731 break; 2732 } 2733 default: 2734 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 2735 } 2736 out: 2737 vcpu_put(vcpu); 2738 kfree(fpu); 2739 kfree(kvm_sregs); 2740 return r; 2741 } 2742 2743 #ifdef CONFIG_KVM_COMPAT 2744 static long kvm_vcpu_compat_ioctl(struct file *filp, 2745 unsigned int ioctl, unsigned long arg) 2746 { 2747 struct kvm_vcpu *vcpu = filp->private_data; 2748 void __user *argp = compat_ptr(arg); 2749 int r; 2750 2751 if (vcpu->kvm->mm != current->mm) 2752 return -EIO; 2753 2754 switch (ioctl) { 2755 case KVM_SET_SIGNAL_MASK: { 2756 struct kvm_signal_mask __user *sigmask_arg = argp; 2757 struct kvm_signal_mask kvm_sigmask; 2758 sigset_t sigset; 2759 2760 if (argp) { 2761 r = -EFAULT; 2762 if (copy_from_user(&kvm_sigmask, argp, 2763 sizeof(kvm_sigmask))) 2764 goto out; 2765 r = -EINVAL; 2766 if (kvm_sigmask.len != sizeof(compat_sigset_t)) 2767 goto out; 2768 r = -EFAULT; 2769 if (get_compat_sigset(&sigset, (void *)sigmask_arg->sigset)) 2770 goto out; 2771 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 2772 } else 2773 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL); 2774 break; 2775 } 2776 default: 2777 r = kvm_vcpu_ioctl(filp, ioctl, arg); 2778 } 2779 2780 out: 2781 return r; 2782 } 2783 #endif 2784 2785 static int kvm_device_ioctl_attr(struct kvm_device *dev, 2786 int (*accessor)(struct kvm_device *dev, 2787 struct kvm_device_attr *attr), 2788 unsigned long arg) 2789 { 2790 struct kvm_device_attr attr; 2791 2792 if (!accessor) 2793 return -EPERM; 2794 2795 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) 2796 return -EFAULT; 2797 2798 return accessor(dev, &attr); 2799 } 2800 2801 static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, 2802 unsigned long arg) 2803 { 2804 struct kvm_device *dev = filp->private_data; 2805 2806 switch (ioctl) { 2807 case KVM_SET_DEVICE_ATTR: 2808 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg); 2809 case KVM_GET_DEVICE_ATTR: 2810 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg); 2811 case KVM_HAS_DEVICE_ATTR: 2812 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg); 2813 default: 2814 if (dev->ops->ioctl) 2815 return dev->ops->ioctl(dev, ioctl, arg); 2816 2817 return -ENOTTY; 2818 } 2819 } 2820 2821 static int kvm_device_release(struct inode *inode, struct file *filp) 2822 { 2823 struct kvm_device *dev = filp->private_data; 2824 struct kvm *kvm = dev->kvm; 2825 2826 kvm_put_kvm(kvm); 2827 return 0; 2828 } 2829 2830 static const struct file_operations kvm_device_fops = { 2831 .unlocked_ioctl = kvm_device_ioctl, 2832 #ifdef CONFIG_KVM_COMPAT 2833 .compat_ioctl = kvm_device_ioctl, 2834 #endif 2835 .release = kvm_device_release, 2836 }; 2837 2838 struct kvm_device *kvm_device_from_filp(struct file *filp) 2839 { 2840 if (filp->f_op != &kvm_device_fops) 2841 return NULL; 2842 2843 return filp->private_data; 2844 } 2845 2846 static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { 2847 #ifdef CONFIG_KVM_MPIC 2848 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, 2849 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, 2850 #endif 2851 }; 2852 2853 int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) 2854 { 2855 if (type >= ARRAY_SIZE(kvm_device_ops_table)) 2856 return -ENOSPC; 
2857 2858 if (kvm_device_ops_table[type] != NULL) 2859 return -EEXIST; 2860 2861 kvm_device_ops_table[type] = ops; 2862 return 0; 2863 } 2864 2865 void kvm_unregister_device_ops(u32 type) 2866 { 2867 if (kvm_device_ops_table[type] != NULL) 2868 kvm_device_ops_table[type] = NULL; 2869 } 2870 2871 static int kvm_ioctl_create_device(struct kvm *kvm, 2872 struct kvm_create_device *cd) 2873 { 2874 struct kvm_device_ops *ops = NULL; 2875 struct kvm_device *dev; 2876 bool test = cd->flags & KVM_CREATE_DEVICE_TEST; 2877 int ret; 2878 2879 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) 2880 return -ENODEV; 2881 2882 ops = kvm_device_ops_table[cd->type]; 2883 if (ops == NULL) 2884 return -ENODEV; 2885 2886 if (test) 2887 return 0; 2888 2889 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 2890 if (!dev) 2891 return -ENOMEM; 2892 2893 dev->ops = ops; 2894 dev->kvm = kvm; 2895 2896 mutex_lock(&kvm->lock); 2897 ret = ops->create(dev, cd->type); 2898 if (ret < 0) { 2899 mutex_unlock(&kvm->lock); 2900 kfree(dev); 2901 return ret; 2902 } 2903 list_add(&dev->vm_node, &kvm->devices); 2904 mutex_unlock(&kvm->lock); 2905 2906 if (ops->init) 2907 ops->init(dev); 2908 2909 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); 2910 if (ret < 0) { 2911 mutex_lock(&kvm->lock); 2912 list_del(&dev->vm_node); 2913 mutex_unlock(&kvm->lock); 2914 ops->destroy(dev); 2915 return ret; 2916 } 2917 2918 kvm_get_kvm(kvm); 2919 cd->fd = ret; 2920 return 0; 2921 } 2922 2923 static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) 2924 { 2925 switch (arg) { 2926 case KVM_CAP_USER_MEMORY: 2927 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 2928 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 2929 case KVM_CAP_INTERNAL_ERROR_DATA: 2930 #ifdef CONFIG_HAVE_KVM_MSI 2931 case KVM_CAP_SIGNAL_MSI: 2932 #endif 2933 #ifdef CONFIG_HAVE_KVM_IRQFD 2934 case KVM_CAP_IRQFD: 2935 case KVM_CAP_IRQFD_RESAMPLE: 2936 #endif 2937 case KVM_CAP_IOEVENTFD_ANY_LENGTH: 2938 case KVM_CAP_CHECK_EXTENSION_VM: 2939 return 1; 2940 #ifdef CONFIG_KVM_MMIO 2941 case KVM_CAP_COALESCED_MMIO: 2942 return KVM_COALESCED_MMIO_PAGE_OFFSET; 2943 #endif 2944 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 2945 case KVM_CAP_IRQ_ROUTING: 2946 return KVM_MAX_IRQ_ROUTES; 2947 #endif 2948 #if KVM_ADDRESS_SPACE_NUM > 1 2949 case KVM_CAP_MULTI_ADDRESS_SPACE: 2950 return KVM_ADDRESS_SPACE_NUM; 2951 #endif 2952 case KVM_CAP_MAX_VCPU_ID: 2953 return KVM_MAX_VCPU_ID; 2954 default: 2955 break; 2956 } 2957 return kvm_vm_ioctl_check_extension(kvm, arg); 2958 } 2959 2960 static long kvm_vm_ioctl(struct file *filp, 2961 unsigned int ioctl, unsigned long arg) 2962 { 2963 struct kvm *kvm = filp->private_data; 2964 void __user *argp = (void __user *)arg; 2965 int r; 2966 2967 if (kvm->mm != current->mm) 2968 return -EIO; 2969 switch (ioctl) { 2970 case KVM_CREATE_VCPU: 2971 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 2972 break; 2973 case KVM_SET_USER_MEMORY_REGION: { 2974 struct kvm_userspace_memory_region kvm_userspace_mem; 2975 2976 r = -EFAULT; 2977 if (copy_from_user(&kvm_userspace_mem, argp, 2978 sizeof(kvm_userspace_mem))) 2979 goto out; 2980 2981 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); 2982 break; 2983 } 2984 case KVM_GET_DIRTY_LOG: { 2985 struct kvm_dirty_log log; 2986 2987 r = -EFAULT; 2988 if (copy_from_user(&log, argp, sizeof(log))) 2989 goto out; 2990 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2991 break; 2992 } 2993 #ifdef CONFIG_KVM_MMIO 2994 case KVM_REGISTER_COALESCED_MMIO: { 2995 struct kvm_coalesced_mmio_zone zone; 2996 2997 r = -EFAULT; 
2998 if (copy_from_user(&zone, argp, sizeof(zone))) 2999 goto out; 3000 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 3001 break; 3002 } 3003 case KVM_UNREGISTER_COALESCED_MMIO: { 3004 struct kvm_coalesced_mmio_zone zone; 3005 3006 r = -EFAULT; 3007 if (copy_from_user(&zone, argp, sizeof(zone))) 3008 goto out; 3009 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 3010 break; 3011 } 3012 #endif 3013 case KVM_IRQFD: { 3014 struct kvm_irqfd data; 3015 3016 r = -EFAULT; 3017 if (copy_from_user(&data, argp, sizeof(data))) 3018 goto out; 3019 r = kvm_irqfd(kvm, &data); 3020 break; 3021 } 3022 case KVM_IOEVENTFD: { 3023 struct kvm_ioeventfd data; 3024 3025 r = -EFAULT; 3026 if (copy_from_user(&data, argp, sizeof(data))) 3027 goto out; 3028 r = kvm_ioeventfd(kvm, &data); 3029 break; 3030 } 3031 #ifdef CONFIG_HAVE_KVM_MSI 3032 case KVM_SIGNAL_MSI: { 3033 struct kvm_msi msi; 3034 3035 r = -EFAULT; 3036 if (copy_from_user(&msi, argp, sizeof(msi))) 3037 goto out; 3038 r = kvm_send_userspace_msi(kvm, &msi); 3039 break; 3040 } 3041 #endif 3042 #ifdef __KVM_HAVE_IRQ_LINE 3043 case KVM_IRQ_LINE_STATUS: 3044 case KVM_IRQ_LINE: { 3045 struct kvm_irq_level irq_event; 3046 3047 r = -EFAULT; 3048 if (copy_from_user(&irq_event, argp, sizeof(irq_event))) 3049 goto out; 3050 3051 r = kvm_vm_ioctl_irq_line(kvm, &irq_event, 3052 ioctl == KVM_IRQ_LINE_STATUS); 3053 if (r) 3054 goto out; 3055 3056 r = -EFAULT; 3057 if (ioctl == KVM_IRQ_LINE_STATUS) { 3058 if (copy_to_user(argp, &irq_event, sizeof(irq_event))) 3059 goto out; 3060 } 3061 3062 r = 0; 3063 break; 3064 } 3065 #endif 3066 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 3067 case KVM_SET_GSI_ROUTING: { 3068 struct kvm_irq_routing routing; 3069 struct kvm_irq_routing __user *urouting; 3070 struct kvm_irq_routing_entry *entries = NULL; 3071 3072 r = -EFAULT; 3073 if (copy_from_user(&routing, argp, sizeof(routing))) 3074 goto out; 3075 r = -EINVAL; 3076 if (!kvm_arch_can_set_irq_routing(kvm)) 3077 goto out; 3078 if (routing.nr > KVM_MAX_IRQ_ROUTES) 3079 goto out; 3080 if (routing.flags) 3081 goto out; 3082 if (routing.nr) { 3083 r = -ENOMEM; 3084 entries = vmalloc(routing.nr * sizeof(*entries)); 3085 if (!entries) 3086 goto out; 3087 r = -EFAULT; 3088 urouting = argp; 3089 if (copy_from_user(entries, urouting->entries, 3090 routing.nr * sizeof(*entries))) 3091 goto out_free_irq_routing; 3092 } 3093 r = kvm_set_irq_routing(kvm, entries, routing.nr, 3094 routing.flags); 3095 out_free_irq_routing: 3096 vfree(entries); 3097 break; 3098 } 3099 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ 3100 case KVM_CREATE_DEVICE: { 3101 struct kvm_create_device cd; 3102 3103 r = -EFAULT; 3104 if (copy_from_user(&cd, argp, sizeof(cd))) 3105 goto out; 3106 3107 r = kvm_ioctl_create_device(kvm, &cd); 3108 if (r) 3109 goto out; 3110 3111 r = -EFAULT; 3112 if (copy_to_user(argp, &cd, sizeof(cd))) 3113 goto out; 3114 3115 r = 0; 3116 break; 3117 } 3118 case KVM_CHECK_EXTENSION: 3119 r = kvm_vm_ioctl_check_extension_generic(kvm, arg); 3120 break; 3121 default: 3122 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 3123 } 3124 out: 3125 return r; 3126 } 3127 3128 #ifdef CONFIG_KVM_COMPAT 3129 struct compat_kvm_dirty_log { 3130 __u32 slot; 3131 __u32 padding1; 3132 union { 3133 compat_uptr_t dirty_bitmap; /* one bit per page */ 3134 __u64 padding2; 3135 }; 3136 }; 3137 3138 static long kvm_vm_compat_ioctl(struct file *filp, 3139 unsigned int ioctl, unsigned long arg) 3140 { 3141 struct kvm *kvm = filp->private_data; 3142 int r; 3143 3144 if (kvm->mm != current->mm) 3145 return -EIO; 3146 switch 
(ioctl) { 3147 case KVM_GET_DIRTY_LOG: { 3148 struct compat_kvm_dirty_log compat_log; 3149 struct kvm_dirty_log log; 3150 3151 if (copy_from_user(&compat_log, (void __user *)arg, 3152 sizeof(compat_log))) 3153 return -EFAULT; 3154 log.slot = compat_log.slot; 3155 log.padding1 = compat_log.padding1; 3156 log.padding2 = compat_log.padding2; 3157 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); 3158 3159 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 3160 break; 3161 } 3162 default: 3163 r = kvm_vm_ioctl(filp, ioctl, arg); 3164 } 3165 return r; 3166 } 3167 #endif 3168 3169 static struct file_operations kvm_vm_fops = { 3170 .release = kvm_vm_release, 3171 .unlocked_ioctl = kvm_vm_ioctl, 3172 #ifdef CONFIG_KVM_COMPAT 3173 .compat_ioctl = kvm_vm_compat_ioctl, 3174 #endif 3175 .llseek = noop_llseek, 3176 }; 3177 3178 static int kvm_dev_ioctl_create_vm(unsigned long type) 3179 { 3180 int r; 3181 struct kvm *kvm; 3182 struct file *file; 3183 3184 kvm = kvm_create_vm(type); 3185 if (IS_ERR(kvm)) 3186 return PTR_ERR(kvm); 3187 #ifdef CONFIG_KVM_MMIO 3188 r = kvm_coalesced_mmio_init(kvm); 3189 if (r < 0) { 3190 kvm_put_kvm(kvm); 3191 return r; 3192 } 3193 #endif 3194 r = get_unused_fd_flags(O_CLOEXEC); 3195 if (r < 0) { 3196 kvm_put_kvm(kvm); 3197 return r; 3198 } 3199 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); 3200 if (IS_ERR(file)) { 3201 put_unused_fd(r); 3202 kvm_put_kvm(kvm); 3203 return PTR_ERR(file); 3204 } 3205 3206 /* 3207 * Don't call kvm_put_kvm anymore at this point; file->f_op is 3208 * already set, with ->release() being kvm_vm_release(). In error 3209 * cases it will be called by the final fput(file) and will take 3210 * care of doing kvm_put_kvm(kvm). 3211 */ 3212 if (kvm_create_vm_debugfs(kvm, r) < 0) { 3213 put_unused_fd(r); 3214 fput(file); 3215 return -ENOMEM; 3216 } 3217 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm); 3218 3219 fd_install(r, file); 3220 return r; 3221 } 3222 3223 static long kvm_dev_ioctl(struct file *filp, 3224 unsigned int ioctl, unsigned long arg) 3225 { 3226 long r = -EINVAL; 3227 3228 switch (ioctl) { 3229 case KVM_GET_API_VERSION: 3230 if (arg) 3231 goto out; 3232 r = KVM_API_VERSION; 3233 break; 3234 case KVM_CREATE_VM: 3235 r = kvm_dev_ioctl_create_vm(arg); 3236 break; 3237 case KVM_CHECK_EXTENSION: 3238 r = kvm_vm_ioctl_check_extension_generic(NULL, arg); 3239 break; 3240 case KVM_GET_VCPU_MMAP_SIZE: 3241 if (arg) 3242 goto out; 3243 r = PAGE_SIZE; /* struct kvm_run */ 3244 #ifdef CONFIG_X86 3245 r += PAGE_SIZE; /* pio data page */ 3246 #endif 3247 #ifdef CONFIG_KVM_MMIO 3248 r += PAGE_SIZE; /* coalesced mmio ring page */ 3249 #endif 3250 break; 3251 case KVM_TRACE_ENABLE: 3252 case KVM_TRACE_PAUSE: 3253 case KVM_TRACE_DISABLE: 3254 r = -EOPNOTSUPP; 3255 break; 3256 default: 3257 return kvm_arch_dev_ioctl(filp, ioctl, arg); 3258 } 3259 out: 3260 return r; 3261 } 3262 3263 static struct file_operations kvm_chardev_ops = { 3264 .unlocked_ioctl = kvm_dev_ioctl, 3265 .compat_ioctl = kvm_dev_ioctl, 3266 .llseek = noop_llseek, 3267 }; 3268 3269 static struct miscdevice kvm_dev = { 3270 KVM_MINOR, 3271 "kvm", 3272 &kvm_chardev_ops, 3273 }; 3274 3275 static void hardware_enable_nolock(void *junk) 3276 { 3277 int cpu = raw_smp_processor_id(); 3278 int r; 3279 3280 if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) 3281 return; 3282 3283 cpumask_set_cpu(cpu, cpus_hardware_enabled); 3284 3285 r = kvm_arch_hardware_enable(); 3286 3287 if (r) { 3288 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 3289 atomic_inc(&hardware_enable_failed); 3290 
pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu); 3291 } 3292 } 3293 3294 static int kvm_starting_cpu(unsigned int cpu) 3295 { 3296 raw_spin_lock(&kvm_count_lock); 3297 if (kvm_usage_count) 3298 hardware_enable_nolock(NULL); 3299 raw_spin_unlock(&kvm_count_lock); 3300 return 0; 3301 } 3302 3303 static void hardware_disable_nolock(void *junk) 3304 { 3305 int cpu = raw_smp_processor_id(); 3306 3307 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) 3308 return; 3309 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 3310 kvm_arch_hardware_disable(); 3311 } 3312 3313 static int kvm_dying_cpu(unsigned int cpu) 3314 { 3315 raw_spin_lock(&kvm_count_lock); 3316 if (kvm_usage_count) 3317 hardware_disable_nolock(NULL); 3318 raw_spin_unlock(&kvm_count_lock); 3319 return 0; 3320 } 3321 3322 static void hardware_disable_all_nolock(void) 3323 { 3324 BUG_ON(!kvm_usage_count); 3325 3326 kvm_usage_count--; 3327 if (!kvm_usage_count) 3328 on_each_cpu(hardware_disable_nolock, NULL, 1); 3329 } 3330 3331 static void hardware_disable_all(void) 3332 { 3333 raw_spin_lock(&kvm_count_lock); 3334 hardware_disable_all_nolock(); 3335 raw_spin_unlock(&kvm_count_lock); 3336 } 3337 3338 static int hardware_enable_all(void) 3339 { 3340 int r = 0; 3341 3342 raw_spin_lock(&kvm_count_lock); 3343 3344 kvm_usage_count++; 3345 if (kvm_usage_count == 1) { 3346 atomic_set(&hardware_enable_failed, 0); 3347 on_each_cpu(hardware_enable_nolock, NULL, 1); 3348 3349 if (atomic_read(&hardware_enable_failed)) { 3350 hardware_disable_all_nolock(); 3351 r = -EBUSY; 3352 } 3353 } 3354 3355 raw_spin_unlock(&kvm_count_lock); 3356 3357 return r; 3358 } 3359 3360 static int kvm_reboot(struct notifier_block *notifier, unsigned long val, 3361 void *v) 3362 { 3363 /* 3364 * Some (well, at least mine) BIOSes hang on reboot if 3365 * in vmx root mode. 3366 * 3367 * And Intel TXT required VMX off for all cpu when system shutdown. 3368 */ 3369 pr_info("kvm: exiting hardware virtualization\n"); 3370 kvm_rebooting = true; 3371 on_each_cpu(hardware_disable_nolock, NULL, 1); 3372 return NOTIFY_OK; 3373 } 3374 3375 static struct notifier_block kvm_reboot_notifier = { 3376 .notifier_call = kvm_reboot, 3377 .priority = 0, 3378 }; 3379 3380 static void kvm_io_bus_destroy(struct kvm_io_bus *bus) 3381 { 3382 int i; 3383 3384 for (i = 0; i < bus->dev_count; i++) { 3385 struct kvm_io_device *pos = bus->range[i].dev; 3386 3387 kvm_iodevice_destructor(pos); 3388 } 3389 kfree(bus); 3390 } 3391 3392 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1, 3393 const struct kvm_io_range *r2) 3394 { 3395 gpa_t addr1 = r1->addr; 3396 gpa_t addr2 = r2->addr; 3397 3398 if (addr1 < addr2) 3399 return -1; 3400 3401 /* If r2->len == 0, match the exact address. If r2->len != 0, 3402 * accept any overlapping write. Any order is acceptable for 3403 * overlapping ranges, because kvm_io_bus_get_first_dev ensures 3404 * we process all of them. 
3405 */ 3406 if (r2->len) { 3407 addr1 += r1->len; 3408 addr2 += r2->len; 3409 } 3410 3411 if (addr1 > addr2) 3412 return 1; 3413 3414 return 0; 3415 } 3416 3417 static int kvm_io_bus_sort_cmp(const void *p1, const void *p2) 3418 { 3419 return kvm_io_bus_cmp(p1, p2); 3420 } 3421 3422 static int kvm_io_bus_insert_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev, 3423 gpa_t addr, int len) 3424 { 3425 bus->range[bus->dev_count++] = (struct kvm_io_range) { 3426 .addr = addr, 3427 .len = len, 3428 .dev = dev, 3429 }; 3430 3431 sort(bus->range, bus->dev_count, sizeof(struct kvm_io_range), 3432 kvm_io_bus_sort_cmp, NULL); 3433 3434 return 0; 3435 } 3436 3437 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, 3438 gpa_t addr, int len) 3439 { 3440 struct kvm_io_range *range, key; 3441 int off; 3442 3443 key = (struct kvm_io_range) { 3444 .addr = addr, 3445 .len = len, 3446 }; 3447 3448 range = bsearch(&key, bus->range, bus->dev_count, 3449 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp); 3450 if (range == NULL) 3451 return -ENOENT; 3452 3453 off = range - bus->range; 3454 3455 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0) 3456 off--; 3457 3458 return off; 3459 } 3460 3461 static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 3462 struct kvm_io_range *range, const void *val) 3463 { 3464 int idx; 3465 3466 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 3467 if (idx < 0) 3468 return -EOPNOTSUPP; 3469 3470 while (idx < bus->dev_count && 3471 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 3472 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr, 3473 range->len, val)) 3474 return idx; 3475 idx++; 3476 } 3477 3478 return -EOPNOTSUPP; 3479 } 3480 3481 /* kvm_io_bus_write - called under kvm->slots_lock */ 3482 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 3483 int len, const void *val) 3484 { 3485 struct kvm_io_bus *bus; 3486 struct kvm_io_range range; 3487 int r; 3488 3489 range = (struct kvm_io_range) { 3490 .addr = addr, 3491 .len = len, 3492 }; 3493 3494 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 3495 if (!bus) 3496 return -ENOMEM; 3497 r = __kvm_io_bus_write(vcpu, bus, &range, val); 3498 return r < 0 ? r : 0; 3499 } 3500 3501 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */ 3502 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, 3503 gpa_t addr, int len, const void *val, long cookie) 3504 { 3505 struct kvm_io_bus *bus; 3506 struct kvm_io_range range; 3507 3508 range = (struct kvm_io_range) { 3509 .addr = addr, 3510 .len = len, 3511 }; 3512 3513 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 3514 if (!bus) 3515 return -ENOMEM; 3516 3517 /* First try the device referenced by cookie. */ 3518 if ((cookie >= 0) && (cookie < bus->dev_count) && 3519 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0)) 3520 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len, 3521 val)) 3522 return cookie; 3523 3524 /* 3525 * cookie contained garbage; fall back to search and return the 3526 * correct cookie value. 
3527 */ 3528 return __kvm_io_bus_write(vcpu, bus, &range, val); 3529 } 3530 3531 static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 3532 struct kvm_io_range *range, void *val) 3533 { 3534 int idx; 3535 3536 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 3537 if (idx < 0) 3538 return -EOPNOTSUPP; 3539 3540 while (idx < bus->dev_count && 3541 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 3542 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr, 3543 range->len, val)) 3544 return idx; 3545 idx++; 3546 } 3547 3548 return -EOPNOTSUPP; 3549 } 3550 EXPORT_SYMBOL_GPL(kvm_io_bus_write); 3551 3552 /* kvm_io_bus_read - called under kvm->slots_lock */ 3553 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 3554 int len, void *val) 3555 { 3556 struct kvm_io_bus *bus; 3557 struct kvm_io_range range; 3558 int r; 3559 3560 range = (struct kvm_io_range) { 3561 .addr = addr, 3562 .len = len, 3563 }; 3564 3565 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 3566 if (!bus) 3567 return -ENOMEM; 3568 r = __kvm_io_bus_read(vcpu, bus, &range, val); 3569 return r < 0 ? r : 0; 3570 } 3571 3572 3573 /* Caller must hold slots_lock. */ 3574 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 3575 int len, struct kvm_io_device *dev) 3576 { 3577 struct kvm_io_bus *new_bus, *bus; 3578 3579 bus = kvm_get_bus(kvm, bus_idx); 3580 if (!bus) 3581 return -ENOMEM; 3582 3583 /* exclude ioeventfd which is limited by maximum fd */ 3584 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) 3585 return -ENOSPC; 3586 3587 new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) * 3588 sizeof(struct kvm_io_range)), GFP_KERNEL); 3589 if (!new_bus) 3590 return -ENOMEM; 3591 memcpy(new_bus, bus, sizeof(*bus) + (bus->dev_count * 3592 sizeof(struct kvm_io_range))); 3593 kvm_io_bus_insert_dev(new_bus, dev, addr, len); 3594 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 3595 synchronize_srcu_expedited(&kvm->srcu); 3596 kfree(bus); 3597 3598 return 0; 3599 } 3600 3601 /* Caller must hold slots_lock. 
*/ 3602 void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, 3603 struct kvm_io_device *dev) 3604 { 3605 int i; 3606 struct kvm_io_bus *new_bus, *bus; 3607 3608 bus = kvm_get_bus(kvm, bus_idx); 3609 if (!bus) 3610 return; 3611 3612 for (i = 0; i < bus->dev_count; i++) 3613 if (bus->range[i].dev == dev) { 3614 break; 3615 } 3616 3617 if (i == bus->dev_count) 3618 return; 3619 3620 new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) * 3621 sizeof(struct kvm_io_range)), GFP_KERNEL); 3622 if (!new_bus) { 3623 pr_err("kvm: failed to shrink bus, removing it completely\n"); 3624 goto broken; 3625 } 3626 3627 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); 3628 new_bus->dev_count--; 3629 memcpy(new_bus->range + i, bus->range + i + 1, 3630 (new_bus->dev_count - i) * sizeof(struct kvm_io_range)); 3631 3632 broken: 3633 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 3634 synchronize_srcu_expedited(&kvm->srcu); 3635 kfree(bus); 3636 return; 3637 } 3638 3639 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, 3640 gpa_t addr) 3641 { 3642 struct kvm_io_bus *bus; 3643 int dev_idx, srcu_idx; 3644 struct kvm_io_device *iodev = NULL; 3645 3646 srcu_idx = srcu_read_lock(&kvm->srcu); 3647 3648 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); 3649 if (!bus) 3650 goto out_unlock; 3651 3652 dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1); 3653 if (dev_idx < 0) 3654 goto out_unlock; 3655 3656 iodev = bus->range[dev_idx].dev; 3657 3658 out_unlock: 3659 srcu_read_unlock(&kvm->srcu, srcu_idx); 3660 3661 return iodev; 3662 } 3663 EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev); 3664 3665 static int kvm_debugfs_open(struct inode *inode, struct file *file, 3666 int (*get)(void *, u64 *), int (*set)(void *, u64), 3667 const char *fmt) 3668 { 3669 struct kvm_stat_data *stat_data = (struct kvm_stat_data *) 3670 inode->i_private; 3671 3672 /* The debugfs files are a reference to the kvm struct which 3673 * is still valid when kvm_destroy_vm is called. 3674 * To avoid the race between open and the removal of the debugfs 3675 * directory we test against the users count. 
3676 */ 3677 if (!refcount_inc_not_zero(&stat_data->kvm->users_count)) 3678 return -ENOENT; 3679 3680 if (simple_attr_open(inode, file, get, set, fmt)) { 3681 kvm_put_kvm(stat_data->kvm); 3682 return -ENOMEM; 3683 } 3684 3685 return 0; 3686 } 3687 3688 static int kvm_debugfs_release(struct inode *inode, struct file *file) 3689 { 3690 struct kvm_stat_data *stat_data = (struct kvm_stat_data *) 3691 inode->i_private; 3692 3693 simple_attr_release(inode, file); 3694 kvm_put_kvm(stat_data->kvm); 3695 3696 return 0; 3697 } 3698 3699 static int vm_stat_get_per_vm(void *data, u64 *val) 3700 { 3701 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 3702 3703 *val = *(ulong *)((void *)stat_data->kvm + stat_data->offset); 3704 3705 return 0; 3706 } 3707 3708 static int vm_stat_clear_per_vm(void *data, u64 val) 3709 { 3710 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 3711 3712 if (val) 3713 return -EINVAL; 3714 3715 *(ulong *)((void *)stat_data->kvm + stat_data->offset) = 0; 3716 3717 return 0; 3718 } 3719 3720 static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file) 3721 { 3722 __simple_attr_check_format("%llu\n", 0ull); 3723 return kvm_debugfs_open(inode, file, vm_stat_get_per_vm, 3724 vm_stat_clear_per_vm, "%llu\n"); 3725 } 3726 3727 static const struct file_operations vm_stat_get_per_vm_fops = { 3728 .owner = THIS_MODULE, 3729 .open = vm_stat_get_per_vm_open, 3730 .release = kvm_debugfs_release, 3731 .read = simple_attr_read, 3732 .write = simple_attr_write, 3733 .llseek = no_llseek, 3734 }; 3735 3736 static int vcpu_stat_get_per_vm(void *data, u64 *val) 3737 { 3738 int i; 3739 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 3740 struct kvm_vcpu *vcpu; 3741 3742 *val = 0; 3743 3744 kvm_for_each_vcpu(i, vcpu, stat_data->kvm) 3745 *val += *(u64 *)((void *)vcpu + stat_data->offset); 3746 3747 return 0; 3748 } 3749 3750 static int vcpu_stat_clear_per_vm(void *data, u64 val) 3751 { 3752 int i; 3753 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 3754 struct kvm_vcpu *vcpu; 3755 3756 if (val) 3757 return -EINVAL; 3758 3759 kvm_for_each_vcpu(i, vcpu, stat_data->kvm) 3760 *(u64 *)((void *)vcpu + stat_data->offset) = 0; 3761 3762 return 0; 3763 } 3764 3765 static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file) 3766 { 3767 __simple_attr_check_format("%llu\n", 0ull); 3768 return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm, 3769 vcpu_stat_clear_per_vm, "%llu\n"); 3770 } 3771 3772 static const struct file_operations vcpu_stat_get_per_vm_fops = { 3773 .owner = THIS_MODULE, 3774 .open = vcpu_stat_get_per_vm_open, 3775 .release = kvm_debugfs_release, 3776 .read = simple_attr_read, 3777 .write = simple_attr_write, 3778 .llseek = no_llseek, 3779 }; 3780 3781 static const struct file_operations *stat_fops_per_vm[] = { 3782 [KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops, 3783 [KVM_STAT_VM] = &vm_stat_get_per_vm_fops, 3784 }; 3785 3786 static int vm_stat_get(void *_offset, u64 *val) 3787 { 3788 unsigned offset = (long)_offset; 3789 struct kvm *kvm; 3790 struct kvm_stat_data stat_tmp = {.offset = offset}; 3791 u64 tmp_val; 3792 3793 *val = 0; 3794 spin_lock(&kvm_lock); 3795 list_for_each_entry(kvm, &vm_list, vm_list) { 3796 stat_tmp.kvm = kvm; 3797 vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val); 3798 *val += tmp_val; 3799 } 3800 spin_unlock(&kvm_lock); 3801 return 0; 3802 } 3803 3804 static int vm_stat_clear(void *_offset, u64 val) 3805 { 3806 unsigned offset = (long)_offset; 3807 struct kvm *kvm; 3808 struct 
kvm_stat_data stat_tmp = {.offset = offset}; 3809 3810 if (val) 3811 return -EINVAL; 3812 3813 spin_lock(&kvm_lock); 3814 list_for_each_entry(kvm, &vm_list, vm_list) { 3815 stat_tmp.kvm = kvm; 3816 vm_stat_clear_per_vm((void *)&stat_tmp, 0); 3817 } 3818 spin_unlock(&kvm_lock); 3819 3820 return 0; 3821 } 3822 3823 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n"); 3824 3825 static int vcpu_stat_get(void *_offset, u64 *val) 3826 { 3827 unsigned offset = (long)_offset; 3828 struct kvm *kvm; 3829 struct kvm_stat_data stat_tmp = {.offset = offset}; 3830 u64 tmp_val; 3831 3832 *val = 0; 3833 spin_lock(&kvm_lock); 3834 list_for_each_entry(kvm, &vm_list, vm_list) { 3835 stat_tmp.kvm = kvm; 3836 vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val); 3837 *val += tmp_val; 3838 } 3839 spin_unlock(&kvm_lock); 3840 return 0; 3841 } 3842 3843 static int vcpu_stat_clear(void *_offset, u64 val) 3844 { 3845 unsigned offset = (long)_offset; 3846 struct kvm *kvm; 3847 struct kvm_stat_data stat_tmp = {.offset = offset}; 3848 3849 if (val) 3850 return -EINVAL; 3851 3852 spin_lock(&kvm_lock); 3853 list_for_each_entry(kvm, &vm_list, vm_list) { 3854 stat_tmp.kvm = kvm; 3855 vcpu_stat_clear_per_vm((void *)&stat_tmp, 0); 3856 } 3857 spin_unlock(&kvm_lock); 3858 3859 return 0; 3860 } 3861 3862 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear, 3863 "%llu\n"); 3864 3865 static const struct file_operations *stat_fops[] = { 3866 [KVM_STAT_VCPU] = &vcpu_stat_fops, 3867 [KVM_STAT_VM] = &vm_stat_fops, 3868 }; 3869 3870 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) 3871 { 3872 struct kobj_uevent_env *env; 3873 unsigned long long created, active; 3874 3875 if (!kvm_dev.this_device || !kvm) 3876 return; 3877 3878 spin_lock(&kvm_lock); 3879 if (type == KVM_EVENT_CREATE_VM) { 3880 kvm_createvm_count++; 3881 kvm_active_vms++; 3882 } else if (type == KVM_EVENT_DESTROY_VM) { 3883 kvm_active_vms--; 3884 } 3885 created = kvm_createvm_count; 3886 active = kvm_active_vms; 3887 spin_unlock(&kvm_lock); 3888 3889 env = kzalloc(sizeof(*env), GFP_KERNEL); 3890 if (!env) 3891 return; 3892 3893 add_uevent_var(env, "CREATED=%llu", created); 3894 add_uevent_var(env, "COUNT=%llu", active); 3895 3896 if (type == KVM_EVENT_CREATE_VM) { 3897 add_uevent_var(env, "EVENT=create"); 3898 kvm->userspace_pid = task_pid_nr(current); 3899 } else if (type == KVM_EVENT_DESTROY_VM) { 3900 add_uevent_var(env, "EVENT=destroy"); 3901 } 3902 add_uevent_var(env, "PID=%d", kvm->userspace_pid); 3903 3904 if (kvm->debugfs_dentry) { 3905 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL); 3906 3907 if (p) { 3908 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX); 3909 if (!IS_ERR(tmp)) 3910 add_uevent_var(env, "STATS_PATH=%s", tmp); 3911 kfree(p); 3912 } 3913 } 3914 /* no need for checks, since we are adding at most only 5 keys */ 3915 env->envp[env->envp_idx++] = NULL; 3916 kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp); 3917 kfree(env); 3918 } 3919 3920 static int kvm_init_debug(void) 3921 { 3922 int r = -EEXIST; 3923 struct kvm_stats_debugfs_item *p; 3924 3925 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); 3926 if (kvm_debugfs_dir == NULL) 3927 goto out; 3928 3929 kvm_debugfs_num_entries = 0; 3930 for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) { 3931 if (!debugfs_create_file(p->name, 0644, kvm_debugfs_dir, 3932 (void *)(long)p->offset, 3933 stat_fops[p->kind])) 3934 goto out_dir; 3935 } 3936 3937 return 0; 3938 3939 out_dir: 3940 
debugfs_remove_recursive(kvm_debugfs_dir); 3941 out: 3942 return r; 3943 } 3944 3945 static int kvm_suspend(void) 3946 { 3947 if (kvm_usage_count) 3948 hardware_disable_nolock(NULL); 3949 return 0; 3950 } 3951 3952 static void kvm_resume(void) 3953 { 3954 if (kvm_usage_count) { 3955 WARN_ON(raw_spin_is_locked(&kvm_count_lock)); 3956 hardware_enable_nolock(NULL); 3957 } 3958 } 3959 3960 static struct syscore_ops kvm_syscore_ops = { 3961 .suspend = kvm_suspend, 3962 .resume = kvm_resume, 3963 }; 3964 3965 static inline 3966 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) 3967 { 3968 return container_of(pn, struct kvm_vcpu, preempt_notifier); 3969 } 3970 3971 static void kvm_sched_in(struct preempt_notifier *pn, int cpu) 3972 { 3973 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 3974 3975 if (vcpu->preempted) 3976 vcpu->preempted = false; 3977 3978 kvm_arch_sched_in(vcpu, cpu); 3979 3980 kvm_arch_vcpu_load(vcpu, cpu); 3981 } 3982 3983 static void kvm_sched_out(struct preempt_notifier *pn, 3984 struct task_struct *next) 3985 { 3986 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 3987 3988 if (current->state == TASK_RUNNING) 3989 vcpu->preempted = true; 3990 kvm_arch_vcpu_put(vcpu); 3991 } 3992 3993 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, 3994 struct module *module) 3995 { 3996 int r; 3997 int cpu; 3998 3999 r = kvm_arch_init(opaque); 4000 if (r) 4001 goto out_fail; 4002 4003 /* 4004 * kvm_arch_init makes sure there's at most one caller 4005 * for architectures that support multiple implementations, 4006 * like intel and amd on x86. 4007 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating 4008 * conflicts in case kvm is already setup for another implementation. 4009 */ 4010 r = kvm_irqfd_init(); 4011 if (r) 4012 goto out_irqfd; 4013 4014 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { 4015 r = -ENOMEM; 4016 goto out_free_0; 4017 } 4018 4019 r = kvm_arch_hardware_setup(); 4020 if (r < 0) 4021 goto out_free_0a; 4022 4023 for_each_online_cpu(cpu) { 4024 smp_call_function_single(cpu, 4025 kvm_arch_check_processor_compat, 4026 &r, 1); 4027 if (r < 0) 4028 goto out_free_1; 4029 } 4030 4031 r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting", 4032 kvm_starting_cpu, kvm_dying_cpu); 4033 if (r) 4034 goto out_free_2; 4035 register_reboot_notifier(&kvm_reboot_notifier); 4036 4037 /* A kmem cache lets us meet the alignment requirements of fx_save. 
*/ 4038 if (!vcpu_align) 4039 vcpu_align = __alignof__(struct kvm_vcpu); 4040 kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, vcpu_align, 4041 SLAB_ACCOUNT, NULL); 4042 if (!kvm_vcpu_cache) { 4043 r = -ENOMEM; 4044 goto out_free_3; 4045 } 4046 4047 r = kvm_async_pf_init(); 4048 if (r) 4049 goto out_free; 4050 4051 kvm_chardev_ops.owner = module; 4052 kvm_vm_fops.owner = module; 4053 kvm_vcpu_fops.owner = module; 4054 4055 r = misc_register(&kvm_dev); 4056 if (r) { 4057 pr_err("kvm: misc device register failed\n"); 4058 goto out_unreg; 4059 } 4060 4061 register_syscore_ops(&kvm_syscore_ops); 4062 4063 kvm_preempt_ops.sched_in = kvm_sched_in; 4064 kvm_preempt_ops.sched_out = kvm_sched_out; 4065 4066 r = kvm_init_debug(); 4067 if (r) { 4068 pr_err("kvm: create debugfs files failed\n"); 4069 goto out_undebugfs; 4070 } 4071 4072 r = kvm_vfio_ops_init(); 4073 WARN_ON(r); 4074 4075 return 0; 4076 4077 out_undebugfs: 4078 unregister_syscore_ops(&kvm_syscore_ops); 4079 misc_deregister(&kvm_dev); 4080 out_unreg: 4081 kvm_async_pf_deinit(); 4082 out_free: 4083 kmem_cache_destroy(kvm_vcpu_cache); 4084 out_free_3: 4085 unregister_reboot_notifier(&kvm_reboot_notifier); 4086 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); 4087 out_free_2: 4088 out_free_1: 4089 kvm_arch_hardware_unsetup(); 4090 out_free_0a: 4091 free_cpumask_var(cpus_hardware_enabled); 4092 out_free_0: 4093 kvm_irqfd_exit(); 4094 out_irqfd: 4095 kvm_arch_exit(); 4096 out_fail: 4097 return r; 4098 } 4099 EXPORT_SYMBOL_GPL(kvm_init); 4100 4101 void kvm_exit(void) 4102 { 4103 debugfs_remove_recursive(kvm_debugfs_dir); 4104 misc_deregister(&kvm_dev); 4105 kmem_cache_destroy(kvm_vcpu_cache); 4106 kvm_async_pf_deinit(); 4107 unregister_syscore_ops(&kvm_syscore_ops); 4108 unregister_reboot_notifier(&kvm_reboot_notifier); 4109 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); 4110 on_each_cpu(hardware_disable_nolock, NULL, 1); 4111 kvm_arch_hardware_unsetup(); 4112 kvm_arch_exit(); 4113 kvm_irqfd_exit(); 4114 free_cpumask_var(cpus_hardware_enabled); 4115 kvm_vfio_ops_exit(); 4116 } 4117 EXPORT_SYMBOL_GPL(kvm_exit); 4118
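/*
 * Usage sketch (illustrative only, not part of the original file): kvm_init()
 * and kvm_exit() above are intended to be called from an architecture
 * module's init and exit paths, passing an arch-specific opaque pointer
 * (handed on to kvm_arch_init()), the vcpu structure size and alignment used
 * for the kvm_vcpu kmem cache, and THIS_MODULE so the chardev/VM/vcpu
 * file_operations pin the arch module. The names my_arch_ops and
 * struct my_arch_vcpu below are hypothetical placeholders, not real KVM
 * symbols:
 *
 *	static int __init my_arch_kvm_init(void)
 *	{
 *		return kvm_init(&my_arch_ops, sizeof(struct my_arch_vcpu),
 *				__alignof__(struct my_arch_vcpu), THIS_MODULE);
 *	}
 *	module_init(my_arch_kvm_init);
 *
 *	static void __exit my_arch_kvm_exit(void)
 *	{
 *		kvm_exit();
 *	}
 *	module_exit(my_arch_kvm_exit);
 */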