1 /* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * This module enables machines with Intel VT-x extensions to run virtual 5 * machines without emulation or binary translation. 6 * 7 * Copyright (C) 2006 Qumranet, Inc. 8 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 9 * 10 * Authors: 11 * Avi Kivity <avi@qumranet.com> 12 * Yaniv Kamay <yaniv@qumranet.com> 13 * 14 * This work is licensed under the terms of the GNU GPL, version 2. See 15 * the COPYING file in the top-level directory. 16 * 17 */ 18 19 #include <kvm/iodev.h> 20 21 #include <linux/kvm_host.h> 22 #include <linux/kvm.h> 23 #include <linux/module.h> 24 #include <linux/errno.h> 25 #include <linux/percpu.h> 26 #include <linux/mm.h> 27 #include <linux/miscdevice.h> 28 #include <linux/vmalloc.h> 29 #include <linux/reboot.h> 30 #include <linux/debugfs.h> 31 #include <linux/highmem.h> 32 #include <linux/file.h> 33 #include <linux/syscore_ops.h> 34 #include <linux/cpu.h> 35 #include <linux/sched/signal.h> 36 #include <linux/sched/mm.h> 37 #include <linux/sched/stat.h> 38 #include <linux/cpumask.h> 39 #include <linux/smp.h> 40 #include <linux/anon_inodes.h> 41 #include <linux/profile.h> 42 #include <linux/kvm_para.h> 43 #include <linux/pagemap.h> 44 #include <linux/mman.h> 45 #include <linux/swap.h> 46 #include <linux/bitops.h> 47 #include <linux/spinlock.h> 48 #include <linux/compat.h> 49 #include <linux/srcu.h> 50 #include <linux/hugetlb.h> 51 #include <linux/slab.h> 52 #include <linux/sort.h> 53 #include <linux/bsearch.h> 54 55 #include <asm/processor.h> 56 #include <asm/io.h> 57 #include <asm/ioctl.h> 58 #include <linux/uaccess.h> 59 #include <asm/pgtable.h> 60 61 #include "coalesced_mmio.h" 62 #include "async_pf.h" 63 #include "vfio.h" 64 65 #define CREATE_TRACE_POINTS 66 #include <trace/events/kvm.h> 67 68 /* Worst case buffer size needed for holding an integer. */ 69 #define ITOA_MAX_LEN 12 70 71 MODULE_AUTHOR("Qumranet"); 72 MODULE_LICENSE("GPL"); 73 74 /* Architectures should define their poll value according to the halt latency */ 75 unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT; 76 module_param(halt_poll_ns, uint, 0644); 77 EXPORT_SYMBOL_GPL(halt_poll_ns); 78 79 /* Default doubles per-vcpu halt_poll_ns. */ 80 unsigned int halt_poll_ns_grow = 2; 81 module_param(halt_poll_ns_grow, uint, 0644); 82 EXPORT_SYMBOL_GPL(halt_poll_ns_grow); 83 84 /* Default resets per-vcpu halt_poll_ns . 
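 * As an illustration of how these knobs interact (see grow_halt_poll_ns()
 * and shrink_halt_poll_ns() further down; numbers below are only a sketch
 * using the defaults grow = 2, shrink = 0): a vcpu's poll window moves
 * roughly as 0 -> 10000 -> 20000 -> 40000 ns on successive short halts,
 * is capped at halt_poll_ns, and drops straight back to 0 after a long
 * block. KVM_HALT_POLL_NS_DEFAULT itself is chosen per architecture.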
*/ 85 unsigned int halt_poll_ns_shrink; 86 module_param(halt_poll_ns_shrink, uint, 0644); 87 EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); 88 89 /* 90 * Ordering of locks: 91 * 92 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock 93 */ 94 95 DEFINE_SPINLOCK(kvm_lock); 96 static DEFINE_RAW_SPINLOCK(kvm_count_lock); 97 LIST_HEAD(vm_list); 98 99 static cpumask_var_t cpus_hardware_enabled; 100 static int kvm_usage_count; 101 static atomic_t hardware_enable_failed; 102 103 struct kmem_cache *kvm_vcpu_cache; 104 EXPORT_SYMBOL_GPL(kvm_vcpu_cache); 105 106 static __read_mostly struct preempt_ops kvm_preempt_ops; 107 108 struct dentry *kvm_debugfs_dir; 109 EXPORT_SYMBOL_GPL(kvm_debugfs_dir); 110 111 static int kvm_debugfs_num_entries; 112 static const struct file_operations *stat_fops_per_vm[]; 113 114 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 115 unsigned long arg); 116 #ifdef CONFIG_KVM_COMPAT 117 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, 118 unsigned long arg); 119 #define KVM_COMPAT(c) .compat_ioctl = (c) 120 #else 121 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl, 122 unsigned long arg) { return -EINVAL; } 123 #define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl 124 #endif 125 static int hardware_enable_all(void); 126 static void hardware_disable_all(void); 127 128 static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 129 130 static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn); 131 132 __visible bool kvm_rebooting; 133 EXPORT_SYMBOL_GPL(kvm_rebooting); 134 135 static bool largepages_enabled = true; 136 137 #define KVM_EVENT_CREATE_VM 0 138 #define KVM_EVENT_DESTROY_VM 1 139 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); 140 static unsigned long long kvm_createvm_count; 141 static unsigned long long kvm_active_vms; 142 143 __weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, 144 unsigned long start, unsigned long end, bool blockable) 145 { 146 return 0; 147 } 148 149 bool kvm_is_reserved_pfn(kvm_pfn_t pfn) 150 { 151 if (pfn_valid(pfn)) 152 return PageReserved(pfn_to_page(pfn)); 153 154 return true; 155 } 156 157 /* 158 * Switches to specified vcpu, until a matching vcpu_put() 159 */ 160 void vcpu_load(struct kvm_vcpu *vcpu) 161 { 162 int cpu = get_cpu(); 163 preempt_notifier_register(&vcpu->preempt_notifier); 164 kvm_arch_vcpu_load(vcpu, cpu); 165 put_cpu(); 166 } 167 EXPORT_SYMBOL_GPL(vcpu_load); 168 169 void vcpu_put(struct kvm_vcpu *vcpu) 170 { 171 preempt_disable(); 172 kvm_arch_vcpu_put(vcpu); 173 preempt_notifier_unregister(&vcpu->preempt_notifier); 174 preempt_enable(); 175 } 176 EXPORT_SYMBOL_GPL(vcpu_put); 177 178 /* TODO: merge with kvm_arch_vcpu_should_kick */ 179 static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req) 180 { 181 int mode = kvm_vcpu_exiting_guest_mode(vcpu); 182 183 /* 184 * We need to wait for the VCPU to reenable interrupts and get out of 185 * READING_SHADOW_PAGE_TABLES mode. 186 */ 187 if (req & KVM_REQUEST_WAIT) 188 return mode != OUTSIDE_GUEST_MODE; 189 190 /* 191 * Need to kick a running VCPU, but otherwise there is nothing to do. 
192 */ 193 return mode == IN_GUEST_MODE; 194 } 195 196 static void ack_flush(void *_completed) 197 { 198 } 199 200 static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait) 201 { 202 if (unlikely(!cpus)) 203 cpus = cpu_online_mask; 204 205 if (cpumask_empty(cpus)) 206 return false; 207 208 smp_call_function_many(cpus, ack_flush, NULL, wait); 209 return true; 210 } 211 212 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, 213 unsigned long *vcpu_bitmap, cpumask_var_t tmp) 214 { 215 int i, cpu, me; 216 struct kvm_vcpu *vcpu; 217 bool called; 218 219 me = get_cpu(); 220 221 kvm_for_each_vcpu(i, vcpu, kvm) { 222 if (!test_bit(i, vcpu_bitmap)) 223 continue; 224 225 kvm_make_request(req, vcpu); 226 cpu = vcpu->cpu; 227 228 if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu)) 229 continue; 230 231 if (tmp != NULL && cpu != -1 && cpu != me && 232 kvm_request_needs_ipi(vcpu, req)) 233 __cpumask_set_cpu(cpu, tmp); 234 } 235 236 called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT)); 237 put_cpu(); 238 239 return called; 240 } 241 242 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) 243 { 244 cpumask_var_t cpus; 245 bool called; 246 static unsigned long vcpu_bitmap[BITS_TO_LONGS(KVM_MAX_VCPUS)] 247 = {[0 ... BITS_TO_LONGS(KVM_MAX_VCPUS)-1] = ULONG_MAX}; 248 249 zalloc_cpumask_var(&cpus, GFP_ATOMIC); 250 251 called = kvm_make_vcpus_request_mask(kvm, req, vcpu_bitmap, cpus); 252 253 free_cpumask_var(cpus); 254 return called; 255 } 256 257 #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL 258 void kvm_flush_remote_tlbs(struct kvm *kvm) 259 { 260 /* 261 * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in 262 * kvm_make_all_cpus_request. 263 */ 264 long dirty_count = smp_load_acquire(&kvm->tlbs_dirty); 265 266 /* 267 * We want to publish modifications to the page tables before reading 268 * mode. Pairs with a memory barrier in arch-specific code. 269 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest 270 * and smp_mb in walk_shadow_page_lockless_begin/end. 271 * - powerpc: smp_mb in kvmppc_prepare_to_enter. 272 * 273 * There is already an smp_mb__after_atomic() before 274 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that 275 * barrier here. 
276 */ 277 if (!kvm_arch_flush_remote_tlb(kvm) 278 || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 279 ++kvm->stat.remote_tlb_flush; 280 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); 281 } 282 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); 283 #endif 284 285 void kvm_reload_remote_mmus(struct kvm *kvm) 286 { 287 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 288 } 289 290 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 291 { 292 struct page *page; 293 int r; 294 295 mutex_init(&vcpu->mutex); 296 vcpu->cpu = -1; 297 vcpu->kvm = kvm; 298 vcpu->vcpu_id = id; 299 vcpu->pid = NULL; 300 init_swait_queue_head(&vcpu->wq); 301 kvm_async_pf_vcpu_init(vcpu); 302 303 vcpu->pre_pcpu = -1; 304 INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); 305 306 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 307 if (!page) { 308 r = -ENOMEM; 309 goto fail; 310 } 311 vcpu->run = page_address(page); 312 313 kvm_vcpu_set_in_spin_loop(vcpu, false); 314 kvm_vcpu_set_dy_eligible(vcpu, false); 315 vcpu->preempted = false; 316 317 r = kvm_arch_vcpu_init(vcpu); 318 if (r < 0) 319 goto fail_free_run; 320 return 0; 321 322 fail_free_run: 323 free_page((unsigned long)vcpu->run); 324 fail: 325 return r; 326 } 327 EXPORT_SYMBOL_GPL(kvm_vcpu_init); 328 329 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) 330 { 331 /* 332 * no need for rcu_read_lock as VCPU_RUN is the only place that 333 * will change the vcpu->pid pointer and on uninit all file 334 * descriptors are already gone. 335 */ 336 put_pid(rcu_dereference_protected(vcpu->pid, 1)); 337 kvm_arch_vcpu_uninit(vcpu); 338 free_page((unsigned long)vcpu->run); 339 } 340 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); 341 342 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 343 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) 344 { 345 return container_of(mn, struct kvm, mmu_notifier); 346 } 347 348 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, 349 struct mm_struct *mm, 350 unsigned long address, 351 pte_t pte) 352 { 353 struct kvm *kvm = mmu_notifier_to_kvm(mn); 354 int idx; 355 356 idx = srcu_read_lock(&kvm->srcu); 357 spin_lock(&kvm->mmu_lock); 358 kvm->mmu_notifier_seq++; 359 kvm_set_spte_hva(kvm, address, pte); 360 spin_unlock(&kvm->mmu_lock); 361 srcu_read_unlock(&kvm->srcu, idx); 362 } 363 364 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, 365 struct mm_struct *mm, 366 unsigned long start, 367 unsigned long end, 368 bool blockable) 369 { 370 struct kvm *kvm = mmu_notifier_to_kvm(mn); 371 int need_tlb_flush = 0, idx; 372 int ret; 373 374 idx = srcu_read_lock(&kvm->srcu); 375 spin_lock(&kvm->mmu_lock); 376 /* 377 * The count increase must become visible at unlock time as no 378 * spte can be established without taking the mmu_lock and 379 * count is also read inside the mmu_lock critical section. 
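 *
 * Arch page-fault code typically pairs with this count (and with
 * mmu_notifier_seq below) roughly as follows; this is an illustrative
 * sketch of the caller-side protocol, not code from this file:
 *
 *	mmu_seq = kvm->mmu_notifier_seq;
 *	smp_rmb();
 *	pfn = gfn_to_pfn(...);			// may sleep, mmu_lock not held
 *	spin_lock(&kvm->mmu_lock);
 *	if (mmu_notifier_retry(kvm, mmu_seq))
 *		goto retry;			// an invalidation ran concurrently
 *	... install the spte ...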
380 */ 381 kvm->mmu_notifier_count++; 382 need_tlb_flush = kvm_unmap_hva_range(kvm, start, end); 383 need_tlb_flush |= kvm->tlbs_dirty; 384 /* we've to flush the tlb before the pages can be freed */ 385 if (need_tlb_flush) 386 kvm_flush_remote_tlbs(kvm); 387 388 spin_unlock(&kvm->mmu_lock); 389 390 ret = kvm_arch_mmu_notifier_invalidate_range(kvm, start, end, blockable); 391 392 srcu_read_unlock(&kvm->srcu, idx); 393 394 return ret; 395 } 396 397 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 398 struct mm_struct *mm, 399 unsigned long start, 400 unsigned long end) 401 { 402 struct kvm *kvm = mmu_notifier_to_kvm(mn); 403 404 spin_lock(&kvm->mmu_lock); 405 /* 406 * This sequence increase will notify the kvm page fault that 407 * the page that is going to be mapped in the spte could have 408 * been freed. 409 */ 410 kvm->mmu_notifier_seq++; 411 smp_wmb(); 412 /* 413 * The above sequence increase must be visible before the 414 * below count decrease, which is ensured by the smp_wmb above 415 * in conjunction with the smp_rmb in mmu_notifier_retry(). 416 */ 417 kvm->mmu_notifier_count--; 418 spin_unlock(&kvm->mmu_lock); 419 420 BUG_ON(kvm->mmu_notifier_count < 0); 421 } 422 423 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, 424 struct mm_struct *mm, 425 unsigned long start, 426 unsigned long end) 427 { 428 struct kvm *kvm = mmu_notifier_to_kvm(mn); 429 int young, idx; 430 431 idx = srcu_read_lock(&kvm->srcu); 432 spin_lock(&kvm->mmu_lock); 433 434 young = kvm_age_hva(kvm, start, end); 435 if (young) 436 kvm_flush_remote_tlbs(kvm); 437 438 spin_unlock(&kvm->mmu_lock); 439 srcu_read_unlock(&kvm->srcu, idx); 440 441 return young; 442 } 443 444 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, 445 struct mm_struct *mm, 446 unsigned long start, 447 unsigned long end) 448 { 449 struct kvm *kvm = mmu_notifier_to_kvm(mn); 450 int young, idx; 451 452 idx = srcu_read_lock(&kvm->srcu); 453 spin_lock(&kvm->mmu_lock); 454 /* 455 * Even though we do not flush TLB, this will still adversely 456 * affect performance on pre-Haswell Intel EPT, where there is 457 * no EPT Access Bit to clear so that we have to tear down EPT 458 * tables instead. If we find this unacceptable, we can always 459 * add a parameter to kvm_age_hva so that it effectively doesn't 460 * do anything on clear_young. 461 * 462 * Also note that currently we never issue secondary TLB flushes 463 * from clear_young, leaving this job up to the regular system 464 * cadence. If we find this inaccurate, we might come up with a 465 * more sophisticated heuristic later. 
466 */ 467 young = kvm_age_hva(kvm, start, end); 468 spin_unlock(&kvm->mmu_lock); 469 srcu_read_unlock(&kvm->srcu, idx); 470 471 return young; 472 } 473 474 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, 475 struct mm_struct *mm, 476 unsigned long address) 477 { 478 struct kvm *kvm = mmu_notifier_to_kvm(mn); 479 int young, idx; 480 481 idx = srcu_read_lock(&kvm->srcu); 482 spin_lock(&kvm->mmu_lock); 483 young = kvm_test_age_hva(kvm, address); 484 spin_unlock(&kvm->mmu_lock); 485 srcu_read_unlock(&kvm->srcu, idx); 486 487 return young; 488 } 489 490 static void kvm_mmu_notifier_release(struct mmu_notifier *mn, 491 struct mm_struct *mm) 492 { 493 struct kvm *kvm = mmu_notifier_to_kvm(mn); 494 int idx; 495 496 idx = srcu_read_lock(&kvm->srcu); 497 kvm_arch_flush_shadow_all(kvm); 498 srcu_read_unlock(&kvm->srcu, idx); 499 } 500 501 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { 502 .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, 503 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 504 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 505 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 506 .clear_young = kvm_mmu_notifier_clear_young, 507 .test_young = kvm_mmu_notifier_test_young, 508 .change_pte = kvm_mmu_notifier_change_pte, 509 .release = kvm_mmu_notifier_release, 510 }; 511 512 static int kvm_init_mmu_notifier(struct kvm *kvm) 513 { 514 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; 515 return mmu_notifier_register(&kvm->mmu_notifier, current->mm); 516 } 517 518 #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ 519 520 static int kvm_init_mmu_notifier(struct kvm *kvm) 521 { 522 return 0; 523 } 524 525 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ 526 527 static struct kvm_memslots *kvm_alloc_memslots(void) 528 { 529 int i; 530 struct kvm_memslots *slots; 531 532 slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 533 if (!slots) 534 return NULL; 535 536 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) 537 slots->id_to_index[i] = slots->memslots[i].id = i; 538 539 return slots; 540 } 541 542 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) 543 { 544 if (!memslot->dirty_bitmap) 545 return; 546 547 kvfree(memslot->dirty_bitmap); 548 memslot->dirty_bitmap = NULL; 549 } 550 551 /* 552 * Free any memory in @free but not in @dont. 
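 *
 * For example (as used later in this file), kvm_free_memslots() passes
 * dont == NULL to release everything, while __kvm_set_memory_region()
 * calls kvm_free_memslot(kvm, &old, &new) so that anything the new slot
 * still shares with the old one (e.g. the same dirty_bitmap pointer) is
 * left untouched.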
553 */ 554 static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, 555 struct kvm_memory_slot *dont) 556 { 557 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 558 kvm_destroy_dirty_bitmap(free); 559 560 kvm_arch_free_memslot(kvm, free, dont); 561 562 free->npages = 0; 563 } 564 565 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots) 566 { 567 struct kvm_memory_slot *memslot; 568 569 if (!slots) 570 return; 571 572 kvm_for_each_memslot(memslot, slots) 573 kvm_free_memslot(kvm, memslot, NULL); 574 575 kvfree(slots); 576 } 577 578 static void kvm_destroy_vm_debugfs(struct kvm *kvm) 579 { 580 int i; 581 582 if (!kvm->debugfs_dentry) 583 return; 584 585 debugfs_remove_recursive(kvm->debugfs_dentry); 586 587 if (kvm->debugfs_stat_data) { 588 for (i = 0; i < kvm_debugfs_num_entries; i++) 589 kfree(kvm->debugfs_stat_data[i]); 590 kfree(kvm->debugfs_stat_data); 591 } 592 } 593 594 static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) 595 { 596 char dir_name[ITOA_MAX_LEN * 2]; 597 struct kvm_stat_data *stat_data; 598 struct kvm_stats_debugfs_item *p; 599 600 if (!debugfs_initialized()) 601 return 0; 602 603 snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd); 604 kvm->debugfs_dentry = debugfs_create_dir(dir_name, kvm_debugfs_dir); 605 606 kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries, 607 sizeof(*kvm->debugfs_stat_data), 608 GFP_KERNEL); 609 if (!kvm->debugfs_stat_data) 610 return -ENOMEM; 611 612 for (p = debugfs_entries; p->name; p++) { 613 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL); 614 if (!stat_data) 615 return -ENOMEM; 616 617 stat_data->kvm = kvm; 618 stat_data->offset = p->offset; 619 kvm->debugfs_stat_data[p - debugfs_entries] = stat_data; 620 debugfs_create_file(p->name, 0644, kvm->debugfs_dentry, 621 stat_data, stat_fops_per_vm[p->kind]); 622 } 623 return 0; 624 } 625 626 static struct kvm *kvm_create_vm(unsigned long type) 627 { 628 int r, i; 629 struct kvm *kvm = kvm_arch_alloc_vm(); 630 631 if (!kvm) 632 return ERR_PTR(-ENOMEM); 633 634 spin_lock_init(&kvm->mmu_lock); 635 mmgrab(current->mm); 636 kvm->mm = current->mm; 637 kvm_eventfd_init(kvm); 638 mutex_init(&kvm->lock); 639 mutex_init(&kvm->irq_lock); 640 mutex_init(&kvm->slots_lock); 641 refcount_set(&kvm->users_count, 1); 642 INIT_LIST_HEAD(&kvm->devices); 643 644 r = kvm_arch_init_vm(kvm, type); 645 if (r) 646 goto out_err_no_disable; 647 648 r = hardware_enable_all(); 649 if (r) 650 goto out_err_no_disable; 651 652 #ifdef CONFIG_HAVE_KVM_IRQFD 653 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); 654 #endif 655 656 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); 657 658 r = -ENOMEM; 659 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 660 struct kvm_memslots *slots = kvm_alloc_memslots(); 661 if (!slots) 662 goto out_err_no_srcu; 663 /* 664 * Generations must be different for each address space. 665 * Init kvm generation close to the maximum to easily test the 666 * code of handling generation number wrap-around. 
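 *
 * For example, with a single address space the generation starts 150
 * below the point where the counter wraps to zero, and each memslot
 * update advances it by KVM_ADDRESS_SPACE_NUM * 2 (see
 * install_new_memslots()), so the wrap-around path is exercised after a
 * few dozen updates instead of after billions of them.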
667 */ 668 slots->generation = i * 2 - 150; 669 rcu_assign_pointer(kvm->memslots[i], slots); 670 } 671 672 if (init_srcu_struct(&kvm->srcu)) 673 goto out_err_no_srcu; 674 if (init_srcu_struct(&kvm->irq_srcu)) 675 goto out_err_no_irq_srcu; 676 for (i = 0; i < KVM_NR_BUSES; i++) { 677 rcu_assign_pointer(kvm->buses[i], 678 kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL)); 679 if (!kvm->buses[i]) 680 goto out_err; 681 } 682 683 r = kvm_init_mmu_notifier(kvm); 684 if (r) 685 goto out_err; 686 687 spin_lock(&kvm_lock); 688 list_add(&kvm->vm_list, &vm_list); 689 spin_unlock(&kvm_lock); 690 691 preempt_notifier_inc(); 692 693 return kvm; 694 695 out_err: 696 cleanup_srcu_struct(&kvm->irq_srcu); 697 out_err_no_irq_srcu: 698 cleanup_srcu_struct(&kvm->srcu); 699 out_err_no_srcu: 700 hardware_disable_all(); 701 out_err_no_disable: 702 refcount_set(&kvm->users_count, 0); 703 for (i = 0; i < KVM_NR_BUSES; i++) 704 kfree(kvm_get_bus(kvm, i)); 705 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 706 kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); 707 kvm_arch_free_vm(kvm); 708 mmdrop(current->mm); 709 return ERR_PTR(r); 710 } 711 712 static void kvm_destroy_devices(struct kvm *kvm) 713 { 714 struct kvm_device *dev, *tmp; 715 716 /* 717 * We do not need to take the kvm->lock here, because nobody else 718 * has a reference to the struct kvm at this point and therefore 719 * cannot access the devices list anyhow. 720 */ 721 list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) { 722 list_del(&dev->vm_node); 723 dev->ops->destroy(dev); 724 } 725 } 726 727 static void kvm_destroy_vm(struct kvm *kvm) 728 { 729 int i; 730 struct mm_struct *mm = kvm->mm; 731 732 kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); 733 kvm_destroy_vm_debugfs(kvm); 734 kvm_arch_sync_events(kvm); 735 spin_lock(&kvm_lock); 736 list_del(&kvm->vm_list); 737 spin_unlock(&kvm_lock); 738 kvm_free_irq_routing(kvm); 739 for (i = 0; i < KVM_NR_BUSES; i++) { 740 struct kvm_io_bus *bus = kvm_get_bus(kvm, i); 741 742 if (bus) 743 kvm_io_bus_destroy(bus); 744 kvm->buses[i] = NULL; 745 } 746 kvm_coalesced_mmio_free(kvm); 747 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 748 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 749 #else 750 kvm_arch_flush_shadow_all(kvm); 751 #endif 752 kvm_arch_destroy_vm(kvm); 753 kvm_destroy_devices(kvm); 754 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 755 kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); 756 cleanup_srcu_struct(&kvm->irq_srcu); 757 cleanup_srcu_struct(&kvm->srcu); 758 kvm_arch_free_vm(kvm); 759 preempt_notifier_dec(); 760 hardware_disable_all(); 761 mmdrop(mm); 762 } 763 764 void kvm_get_kvm(struct kvm *kvm) 765 { 766 refcount_inc(&kvm->users_count); 767 } 768 EXPORT_SYMBOL_GPL(kvm_get_kvm); 769 770 void kvm_put_kvm(struct kvm *kvm) 771 { 772 if (refcount_dec_and_test(&kvm->users_count)) 773 kvm_destroy_vm(kvm); 774 } 775 EXPORT_SYMBOL_GPL(kvm_put_kvm); 776 777 778 static int kvm_vm_release(struct inode *inode, struct file *filp) 779 { 780 struct kvm *kvm = filp->private_data; 781 782 kvm_irqfd_release(kvm); 783 784 kvm_put_kvm(kvm); 785 return 0; 786 } 787 788 /* 789 * Allocation size is twice as large as the actual dirty bitmap size. 790 * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed. 
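 *
 * The first half is the live bitmap that vCPUs update through
 * mark_page_dirty_in_slot(); the second half (kvm_second_dirty_bitmap())
 * is used by kvm_get_dirty_log_protect() below as a scratch buffer for
 * the xchg()'d snapshot that is copied out to userspace, so no extra
 * allocation is needed on every KVM_GET_DIRTY_LOG call.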
791 */ 792 static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) 793 { 794 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); 795 796 memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL); 797 if (!memslot->dirty_bitmap) 798 return -ENOMEM; 799 800 return 0; 801 } 802 803 /* 804 * Insert memslot and re-sort memslots based on their GFN, 805 * so binary search could be used to lookup GFN. 806 * Sorting algorithm takes advantage of having initially 807 * sorted array and known changed memslot position. 808 */ 809 static void update_memslots(struct kvm_memslots *slots, 810 struct kvm_memory_slot *new) 811 { 812 int id = new->id; 813 int i = slots->id_to_index[id]; 814 struct kvm_memory_slot *mslots = slots->memslots; 815 816 WARN_ON(mslots[i].id != id); 817 if (!new->npages) { 818 WARN_ON(!mslots[i].npages); 819 if (mslots[i].npages) 820 slots->used_slots--; 821 } else { 822 if (!mslots[i].npages) 823 slots->used_slots++; 824 } 825 826 while (i < KVM_MEM_SLOTS_NUM - 1 && 827 new->base_gfn <= mslots[i + 1].base_gfn) { 828 if (!mslots[i + 1].npages) 829 break; 830 mslots[i] = mslots[i + 1]; 831 slots->id_to_index[mslots[i].id] = i; 832 i++; 833 } 834 835 /* 836 * The ">=" is needed when creating a slot with base_gfn == 0, 837 * so that it moves before all those with base_gfn == npages == 0. 838 * 839 * On the other hand, if new->npages is zero, the above loop has 840 * already left i pointing to the beginning of the empty part of 841 * mslots, and the ">=" would move the hole backwards in this 842 * case---which is wrong. So skip the loop when deleting a slot. 843 */ 844 if (new->npages) { 845 while (i > 0 && 846 new->base_gfn >= mslots[i - 1].base_gfn) { 847 mslots[i] = mslots[i - 1]; 848 slots->id_to_index[mslots[i].id] = i; 849 i--; 850 } 851 } else 852 WARN_ON_ONCE(i != slots->used_slots); 853 854 mslots[i] = *new; 855 slots->id_to_index[mslots[i].id] = i; 856 } 857 858 static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem) 859 { 860 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; 861 862 #ifdef __KVM_HAVE_READONLY_MEM 863 valid_flags |= KVM_MEM_READONLY; 864 #endif 865 866 if (mem->flags & ~valid_flags) 867 return -EINVAL; 868 869 return 0; 870 } 871 872 static struct kvm_memslots *install_new_memslots(struct kvm *kvm, 873 int as_id, struct kvm_memslots *slots) 874 { 875 struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id); 876 877 /* 878 * Set the low bit in the generation, which disables SPTE caching 879 * until the end of synchronize_srcu_expedited. 880 */ 881 WARN_ON(old_memslots->generation & 1); 882 slots->generation = old_memslots->generation + 1; 883 884 rcu_assign_pointer(kvm->memslots[as_id], slots); 885 synchronize_srcu_expedited(&kvm->srcu); 886 887 /* 888 * Increment the new memslot generation a second time. This prevents 889 * vm exits that race with memslot updates from caching a memslot 890 * generation that will (potentially) be valid forever. 891 * 892 * Generations must be unique even across address spaces. We do not need 893 * a global counter for that, instead the generation space is evenly split 894 * across address spaces. For example, with two address spaces, address 895 * space 0 will use generations 0, 4, 8, ... while * address space 1 will 896 * use generations 2, 6, 10, 14, ... 
897 */ 898 slots->generation += KVM_ADDRESS_SPACE_NUM * 2 - 1; 899 900 kvm_arch_memslots_updated(kvm, slots); 901 902 return old_memslots; 903 } 904 905 /* 906 * Allocate some memory and give it an address in the guest physical address 907 * space. 908 * 909 * Discontiguous memory is allowed, mostly for framebuffers. 910 * 911 * Must be called holding kvm->slots_lock for write. 912 */ 913 int __kvm_set_memory_region(struct kvm *kvm, 914 const struct kvm_userspace_memory_region *mem) 915 { 916 int r; 917 gfn_t base_gfn; 918 unsigned long npages; 919 struct kvm_memory_slot *slot; 920 struct kvm_memory_slot old, new; 921 struct kvm_memslots *slots = NULL, *old_memslots; 922 int as_id, id; 923 enum kvm_mr_change change; 924 925 r = check_memory_region_flags(mem); 926 if (r) 927 goto out; 928 929 r = -EINVAL; 930 as_id = mem->slot >> 16; 931 id = (u16)mem->slot; 932 933 /* General sanity checks */ 934 if (mem->memory_size & (PAGE_SIZE - 1)) 935 goto out; 936 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 937 goto out; 938 /* We can read the guest memory with __xxx_user() later on. */ 939 if ((id < KVM_USER_MEM_SLOTS) && 940 ((mem->userspace_addr & (PAGE_SIZE - 1)) || 941 !access_ok(VERIFY_WRITE, 942 (void __user *)(unsigned long)mem->userspace_addr, 943 mem->memory_size))) 944 goto out; 945 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM) 946 goto out; 947 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 948 goto out; 949 950 slot = id_to_memslot(__kvm_memslots(kvm, as_id), id); 951 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 952 npages = mem->memory_size >> PAGE_SHIFT; 953 954 if (npages > KVM_MEM_MAX_NR_PAGES) 955 goto out; 956 957 new = old = *slot; 958 959 new.id = id; 960 new.base_gfn = base_gfn; 961 new.npages = npages; 962 new.flags = mem->flags; 963 964 if (npages) { 965 if (!old.npages) 966 change = KVM_MR_CREATE; 967 else { /* Modify an existing slot. */ 968 if ((mem->userspace_addr != old.userspace_addr) || 969 (npages != old.npages) || 970 ((new.flags ^ old.flags) & KVM_MEM_READONLY)) 971 goto out; 972 973 if (base_gfn != old.base_gfn) 974 change = KVM_MR_MOVE; 975 else if (new.flags != old.flags) 976 change = KVM_MR_FLAGS_ONLY; 977 else { /* Nothing to change. 
*/ 978 r = 0; 979 goto out; 980 } 981 } 982 } else { 983 if (!old.npages) 984 goto out; 985 986 change = KVM_MR_DELETE; 987 new.base_gfn = 0; 988 new.flags = 0; 989 } 990 991 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { 992 /* Check for overlaps */ 993 r = -EEXIST; 994 kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) { 995 if (slot->id == id) 996 continue; 997 if (!((base_gfn + npages <= slot->base_gfn) || 998 (base_gfn >= slot->base_gfn + slot->npages))) 999 goto out; 1000 } 1001 } 1002 1003 /* Free page dirty bitmap if unneeded */ 1004 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 1005 new.dirty_bitmap = NULL; 1006 1007 r = -ENOMEM; 1008 if (change == KVM_MR_CREATE) { 1009 new.userspace_addr = mem->userspace_addr; 1010 1011 if (kvm_arch_create_memslot(kvm, &new, npages)) 1012 goto out_free; 1013 } 1014 1015 /* Allocate page dirty bitmap if needed */ 1016 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 1017 if (kvm_create_dirty_bitmap(&new) < 0) 1018 goto out_free; 1019 } 1020 1021 slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 1022 if (!slots) 1023 goto out_free; 1024 memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots)); 1025 1026 if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) { 1027 slot = id_to_memslot(slots, id); 1028 slot->flags |= KVM_MEMSLOT_INVALID; 1029 1030 old_memslots = install_new_memslots(kvm, as_id, slots); 1031 1032 /* From this point no new shadow pages pointing to a deleted, 1033 * or moved, memslot will be created. 1034 * 1035 * validation of sp->gfn happens in: 1036 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) 1037 * - kvm_is_visible_gfn (mmu_check_roots) 1038 */ 1039 kvm_arch_flush_shadow_memslot(kvm, slot); 1040 1041 /* 1042 * We can re-use the old_memslots from above, the only difference 1043 * from the currently installed memslots is the invalid flag. This 1044 * will get overwritten by update_memslots anyway. 
1045 */ 1046 slots = old_memslots; 1047 } 1048 1049 r = kvm_arch_prepare_memory_region(kvm, &new, mem, change); 1050 if (r) 1051 goto out_slots; 1052 1053 /* actual memory is freed via old in kvm_free_memslot below */ 1054 if (change == KVM_MR_DELETE) { 1055 new.dirty_bitmap = NULL; 1056 memset(&new.arch, 0, sizeof(new.arch)); 1057 } 1058 1059 update_memslots(slots, &new); 1060 old_memslots = install_new_memslots(kvm, as_id, slots); 1061 1062 kvm_arch_commit_memory_region(kvm, mem, &old, &new, change); 1063 1064 kvm_free_memslot(kvm, &old, &new); 1065 kvfree(old_memslots); 1066 return 0; 1067 1068 out_slots: 1069 kvfree(slots); 1070 out_free: 1071 kvm_free_memslot(kvm, &new, &old); 1072 out: 1073 return r; 1074 } 1075 EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 1076 1077 int kvm_set_memory_region(struct kvm *kvm, 1078 const struct kvm_userspace_memory_region *mem) 1079 { 1080 int r; 1081 1082 mutex_lock(&kvm->slots_lock); 1083 r = __kvm_set_memory_region(kvm, mem); 1084 mutex_unlock(&kvm->slots_lock); 1085 return r; 1086 } 1087 EXPORT_SYMBOL_GPL(kvm_set_memory_region); 1088 1089 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 1090 struct kvm_userspace_memory_region *mem) 1091 { 1092 if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) 1093 return -EINVAL; 1094 1095 return kvm_set_memory_region(kvm, mem); 1096 } 1097 1098 int kvm_get_dirty_log(struct kvm *kvm, 1099 struct kvm_dirty_log *log, int *is_dirty) 1100 { 1101 struct kvm_memslots *slots; 1102 struct kvm_memory_slot *memslot; 1103 int i, as_id, id; 1104 unsigned long n; 1105 unsigned long any = 0; 1106 1107 as_id = log->slot >> 16; 1108 id = (u16)log->slot; 1109 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1110 return -EINVAL; 1111 1112 slots = __kvm_memslots(kvm, as_id); 1113 memslot = id_to_memslot(slots, id); 1114 if (!memslot->dirty_bitmap) 1115 return -ENOENT; 1116 1117 n = kvm_dirty_bitmap_bytes(memslot); 1118 1119 for (i = 0; !any && i < n/sizeof(long); ++i) 1120 any = memslot->dirty_bitmap[i]; 1121 1122 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 1123 return -EFAULT; 1124 1125 if (any) 1126 *is_dirty = 1; 1127 return 0; 1128 } 1129 EXPORT_SYMBOL_GPL(kvm_get_dirty_log); 1130 1131 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 1132 /** 1133 * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages 1134 * are dirty write protect them for next write. 1135 * @kvm: pointer to kvm instance 1136 * @log: slot id and address to which we copy the log 1137 * @is_dirty: flag set if any page is dirty 1138 * 1139 * We need to keep it in mind that VCPU threads can write to the bitmap 1140 * concurrently. So, to avoid losing track of dirty pages we keep the 1141 * following order: 1142 * 1143 * 1. Take a snapshot of the bit and clear it if needed. 1144 * 2. Write protect the corresponding page. 1145 * 3. Copy the snapshot to the userspace. 1146 * 4. Upon return caller flushes TLB's if needed. 1147 * 1148 * Between 2 and 4, the guest may write to the page using the remaining TLB 1149 * entry. This is not a problem because the page is reported dirty using 1150 * the snapshot taken before and step 4 ensures that writes done after 1151 * exiting to userspace will be logged for the next call. 
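 *
 * From userspace the sequence is typically (an illustrative sketch only;
 * names are made up, error handling omitted):
 *
 *	struct kvm_dirty_log log = {
 *		.slot = slot_id,		// optionally | (as_id << 16)
 *		.dirty_bitmap = user_buffer,	// one bit per page in the slot
 *	};
 *	ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
 *	// pages whose bit is set were written since the previous call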
 *
 */
int kvm_get_dirty_log_protect(struct kvm *kvm,
			struct kvm_dirty_log *log, bool *is_dirty)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int i, as_id, id;
	unsigned long n;
	unsigned long *dirty_bitmap;
	unsigned long *dirty_bitmap_buffer;

	as_id = log->slot >> 16;
	id = (u16)log->slot;
	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
		return -EINVAL;

	slots = __kvm_memslots(kvm, as_id);
	memslot = id_to_memslot(slots, id);

	dirty_bitmap = memslot->dirty_bitmap;
	if (!dirty_bitmap)
		return -ENOENT;

	n = kvm_dirty_bitmap_bytes(memslot);

	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
	memset(dirty_bitmap_buffer, 0, n);

	spin_lock(&kvm->mmu_lock);
	*is_dirty = false;
	for (i = 0; i < n / sizeof(long); i++) {
		unsigned long mask;
		gfn_t offset;

		if (!dirty_bitmap[i])
			continue;

		*is_dirty = true;

		mask = xchg(&dirty_bitmap[i], 0);
		dirty_bitmap_buffer[i] = mask;

		if (mask) {
			offset = i * BITS_PER_LONG;
			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
								offset, mask);
		}
	}

	spin_unlock(&kvm->mmu_lock);
	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
		return -EFAULT;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
#endif

bool kvm_largepages_enabled(void)
{
	return largepages_enabled;
}

void kvm_disable_largepages(void)
{
	largepages_enabled = false;
}
EXPORT_SYMBOL_GPL(kvm_disable_largepages);

struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
}
EXPORT_SYMBOL_GPL(gfn_to_memslot);

struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
}

bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);

	if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS ||
	      memslot->flags & KVM_MEMSLOT_INVALID)
		return false;

	return true;
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);

unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
{
	struct vm_area_struct *vma;
	unsigned long addr, size;

	size = PAGE_SIZE;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return PAGE_SIZE;

	down_read(&current->mm->mmap_sem);
	vma = find_vma(current->mm, addr);
	if (!vma)
		goto out;

	size = vma_kernel_pagesize(vma);

out:
	up_read(&current->mm->mmap_sem);

	return size;
}

static bool memslot_is_readonly(struct kvm_memory_slot *slot)
{
	return slot->flags & KVM_MEM_READONLY;
}

static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
				       gfn_t *nr_pages, bool write)
{
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return KVM_HVA_ERR_BAD;

	if (memslot_is_readonly(slot) && write)
		return KVM_HVA_ERR_RO_BAD;

	if (nr_pages)
		*nr_pages = slot->npages - (gfn - slot->base_gfn);

	return __gfn_to_hva_memslot(slot, gfn);
}

static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
				     gfn_t *nr_pages)
{
	return __gfn_to_hva_many(slot, gfn, nr_pages, true);
}

unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
					gfn_t gfn)
{
	return gfn_to_hva_many(slot, gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);

unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);

unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);

/*
 * If writable is set to false, the hva returned by this function is only
 * allowed to be read.
 */
unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
				      gfn_t gfn, bool *writable)
{
	unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);

	if (!kvm_is_error_hva(hva) && writable)
		*writable = !memslot_is_readonly(slot);

	return hva;
}

unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
{
	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);

	return gfn_to_hva_memslot_prot(slot, gfn, writable);
}

unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
{
	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);

	return gfn_to_hva_memslot_prot(slot, gfn, writable);
}

static inline int check_user_page_hwpoison(unsigned long addr)
{
	int rc, flags = FOLL_HWPOISON | FOLL_WRITE;

	rc = get_user_pages(addr, 1, flags, NULL, NULL);
	return rc == -EHWPOISON;
}

/*
 * The fast path to get the writable pfn which will be stored in @pfn,
 * true indicates success, otherwise false is returned. It's also the
 * only part that runs if we can be in atomic context.
 */
static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
			    bool *writable, kvm_pfn_t *pfn)
{
	struct page *page[1];
	int npages;

	/*
	 * Fast pin a writable pfn only if it is a write fault request
	 * or the caller allows to map a writable pfn for a read fault
	 * request.
	 */
	if (!(write_fault || writable))
		return false;

	npages = __get_user_pages_fast(addr, 1, 1, page);
	if (npages == 1) {
		*pfn = page_to_pfn(page[0]);

		if (writable)
			*writable = true;
		return true;
	}

	return false;
}

/*
 * The slow path to get the pfn of the specified host virtual address,
 * 1 indicates success, -errno is returned if error is detected.
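 * When @async is non-NULL the pin is attempted with FOLL_NOWAIT, so the
 * call does not block on swap-in I/O; a return value <= 0 then lets
 * hva_to_pfn() set *async and the caller can schedule an asynchronous
 * page fault instead of waiting here (a summary of the code below, not
 * an additional guarantee).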
1384 */ 1385 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, 1386 bool *writable, kvm_pfn_t *pfn) 1387 { 1388 unsigned int flags = FOLL_HWPOISON; 1389 struct page *page; 1390 int npages = 0; 1391 1392 might_sleep(); 1393 1394 if (writable) 1395 *writable = write_fault; 1396 1397 if (write_fault) 1398 flags |= FOLL_WRITE; 1399 if (async) 1400 flags |= FOLL_NOWAIT; 1401 1402 npages = get_user_pages_unlocked(addr, 1, &page, flags); 1403 if (npages != 1) 1404 return npages; 1405 1406 /* map read fault as writable if possible */ 1407 if (unlikely(!write_fault) && writable) { 1408 struct page *wpage; 1409 1410 if (__get_user_pages_fast(addr, 1, 1, &wpage) == 1) { 1411 *writable = true; 1412 put_page(page); 1413 page = wpage; 1414 } 1415 } 1416 *pfn = page_to_pfn(page); 1417 return npages; 1418 } 1419 1420 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) 1421 { 1422 if (unlikely(!(vma->vm_flags & VM_READ))) 1423 return false; 1424 1425 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE)))) 1426 return false; 1427 1428 return true; 1429 } 1430 1431 static int hva_to_pfn_remapped(struct vm_area_struct *vma, 1432 unsigned long addr, bool *async, 1433 bool write_fault, bool *writable, 1434 kvm_pfn_t *p_pfn) 1435 { 1436 unsigned long pfn; 1437 int r; 1438 1439 r = follow_pfn(vma, addr, &pfn); 1440 if (r) { 1441 /* 1442 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does 1443 * not call the fault handler, so do it here. 1444 */ 1445 bool unlocked = false; 1446 r = fixup_user_fault(current, current->mm, addr, 1447 (write_fault ? FAULT_FLAG_WRITE : 0), 1448 &unlocked); 1449 if (unlocked) 1450 return -EAGAIN; 1451 if (r) 1452 return r; 1453 1454 r = follow_pfn(vma, addr, &pfn); 1455 if (r) 1456 return r; 1457 1458 } 1459 1460 if (writable) 1461 *writable = true; 1462 1463 /* 1464 * Get a reference here because callers of *hva_to_pfn* and 1465 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the 1466 * returned pfn. This is only needed if the VMA has VM_MIXEDMAP 1467 * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will 1468 * simply do nothing for reserved pfns. 1469 * 1470 * Whoever called remap_pfn_range is also going to call e.g. 1471 * unmap_mapping_range before the underlying pages are freed, 1472 * causing a call to our MMU notifier. 1473 */ 1474 kvm_get_pfn(pfn); 1475 1476 *p_pfn = pfn; 1477 return 0; 1478 } 1479 1480 /* 1481 * Pin guest page in memory and return its pfn. 1482 * @addr: host virtual address which maps memory to the guest 1483 * @atomic: whether this function can sleep 1484 * @async: whether this function need to wait IO complete if the 1485 * host page is not in the memory 1486 * @write_fault: whether we should get a writable host page 1487 * @writable: whether it allows to map a writable host page for !@write_fault 1488 * 1489 * The function will map a writable host page for these two cases: 1490 * 1): @write_fault = true 1491 * 2): @write_fault = false && @writable, @writable will tell the caller 1492 * whether the mapping is writable. 
 */
static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
			bool write_fault, bool *writable)
{
	struct vm_area_struct *vma;
	kvm_pfn_t pfn = 0;
	int npages, r;

	/* we can do it either atomically or asynchronously, not both */
	BUG_ON(atomic && async);

	if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
		return pfn;

	if (atomic)
		return KVM_PFN_ERR_FAULT;

	npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
	if (npages == 1)
		return pfn;

	down_read(&current->mm->mmap_sem);
	if (npages == -EHWPOISON ||
	      (!async && check_user_page_hwpoison(addr))) {
		pfn = KVM_PFN_ERR_HWPOISON;
		goto exit;
	}

retry:
	vma = find_vma_intersection(current->mm, addr, addr + 1);

	if (vma == NULL)
		pfn = KVM_PFN_ERR_FAULT;
	else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
		r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
		if (r == -EAGAIN)
			goto retry;
		if (r < 0)
			pfn = KVM_PFN_ERR_FAULT;
	} else {
		if (async && vma_is_valid(vma, write_fault))
			*async = true;
		pfn = KVM_PFN_ERR_FAULT;
	}
exit:
	up_read(&current->mm->mmap_sem);
	return pfn;
}

kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
			       bool atomic, bool *async, bool write_fault,
			       bool *writable)
{
	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);

	if (addr == KVM_HVA_ERR_RO_BAD) {
		if (writable)
			*writable = false;
		return KVM_PFN_ERR_RO_FAULT;
	}

	if (kvm_is_error_hva(addr)) {
		if (writable)
			*writable = false;
		return KVM_PFN_NOSLOT;
	}

	/* Do not map writable pfn in the readonly memslot.
*/ 1561 if (writable && memslot_is_readonly(slot)) { 1562 *writable = false; 1563 writable = NULL; 1564 } 1565 1566 return hva_to_pfn(addr, atomic, async, write_fault, 1567 writable); 1568 } 1569 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); 1570 1571 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, 1572 bool *writable) 1573 { 1574 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, 1575 write_fault, writable); 1576 } 1577 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 1578 1579 kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) 1580 { 1581 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); 1582 } 1583 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); 1584 1585 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) 1586 { 1587 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); 1588 } 1589 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); 1590 1591 kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) 1592 { 1593 return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn); 1594 } 1595 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); 1596 1597 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn) 1598 { 1599 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 1600 } 1601 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic); 1602 1603 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 1604 { 1605 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); 1606 } 1607 EXPORT_SYMBOL_GPL(gfn_to_pfn); 1608 1609 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) 1610 { 1611 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 1612 } 1613 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn); 1614 1615 int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 1616 struct page **pages, int nr_pages) 1617 { 1618 unsigned long addr; 1619 gfn_t entry = 0; 1620 1621 addr = gfn_to_hva_many(slot, gfn, &entry); 1622 if (kvm_is_error_hva(addr)) 1623 return -1; 1624 1625 if (entry < nr_pages) 1626 return 0; 1627 1628 return __get_user_pages_fast(addr, nr_pages, 1, pages); 1629 } 1630 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 1631 1632 static struct page *kvm_pfn_to_page(kvm_pfn_t pfn) 1633 { 1634 if (is_error_noslot_pfn(pfn)) 1635 return KVM_ERR_PTR_BAD_PAGE; 1636 1637 if (kvm_is_reserved_pfn(pfn)) { 1638 WARN_ON(1); 1639 return KVM_ERR_PTR_BAD_PAGE; 1640 } 1641 1642 return pfn_to_page(pfn); 1643 } 1644 1645 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1646 { 1647 kvm_pfn_t pfn; 1648 1649 pfn = gfn_to_pfn(kvm, gfn); 1650 1651 return kvm_pfn_to_page(pfn); 1652 } 1653 EXPORT_SYMBOL_GPL(gfn_to_page); 1654 1655 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn) 1656 { 1657 kvm_pfn_t pfn; 1658 1659 pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn); 1660 1661 return kvm_pfn_to_page(pfn); 1662 } 1663 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page); 1664 1665 void kvm_release_page_clean(struct page *page) 1666 { 1667 WARN_ON(is_error_page(page)); 1668 1669 kvm_release_pfn_clean(page_to_pfn(page)); 1670 } 1671 EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1672 1673 void kvm_release_pfn_clean(kvm_pfn_t pfn) 1674 { 1675 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) 1676 put_page(pfn_to_page(pfn)); 1677 } 1678 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 1679 1680 void kvm_release_page_dirty(struct page *page) 1681 { 1682 WARN_ON(is_error_page(page)); 1683 1684 kvm_release_pfn_dirty(page_to_pfn(page)); 1685 } 1686 EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1687 1688 void 
kvm_release_pfn_dirty(kvm_pfn_t pfn) 1689 { 1690 kvm_set_pfn_dirty(pfn); 1691 kvm_release_pfn_clean(pfn); 1692 } 1693 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); 1694 1695 void kvm_set_pfn_dirty(kvm_pfn_t pfn) 1696 { 1697 if (!kvm_is_reserved_pfn(pfn)) { 1698 struct page *page = pfn_to_page(pfn); 1699 1700 if (!PageReserved(page)) 1701 SetPageDirty(page); 1702 } 1703 } 1704 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 1705 1706 void kvm_set_pfn_accessed(kvm_pfn_t pfn) 1707 { 1708 if (!kvm_is_reserved_pfn(pfn)) 1709 mark_page_accessed(pfn_to_page(pfn)); 1710 } 1711 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 1712 1713 void kvm_get_pfn(kvm_pfn_t pfn) 1714 { 1715 if (!kvm_is_reserved_pfn(pfn)) 1716 get_page(pfn_to_page(pfn)); 1717 } 1718 EXPORT_SYMBOL_GPL(kvm_get_pfn); 1719 1720 static int next_segment(unsigned long len, int offset) 1721 { 1722 if (len > PAGE_SIZE - offset) 1723 return PAGE_SIZE - offset; 1724 else 1725 return len; 1726 } 1727 1728 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, 1729 void *data, int offset, int len) 1730 { 1731 int r; 1732 unsigned long addr; 1733 1734 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 1735 if (kvm_is_error_hva(addr)) 1736 return -EFAULT; 1737 r = __copy_from_user(data, (void __user *)addr + offset, len); 1738 if (r) 1739 return -EFAULT; 1740 return 0; 1741 } 1742 1743 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 1744 int len) 1745 { 1746 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 1747 1748 return __kvm_read_guest_page(slot, gfn, data, offset, len); 1749 } 1750 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 1751 1752 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, 1753 int offset, int len) 1754 { 1755 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1756 1757 return __kvm_read_guest_page(slot, gfn, data, offset, len); 1758 } 1759 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page); 1760 1761 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) 1762 { 1763 gfn_t gfn = gpa >> PAGE_SHIFT; 1764 int seg; 1765 int offset = offset_in_page(gpa); 1766 int ret; 1767 1768 while ((seg = next_segment(len, offset)) != 0) { 1769 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 1770 if (ret < 0) 1771 return ret; 1772 offset = 0; 1773 len -= seg; 1774 data += seg; 1775 ++gfn; 1776 } 1777 return 0; 1778 } 1779 EXPORT_SYMBOL_GPL(kvm_read_guest); 1780 1781 int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) 1782 { 1783 gfn_t gfn = gpa >> PAGE_SHIFT; 1784 int seg; 1785 int offset = offset_in_page(gpa); 1786 int ret; 1787 1788 while ((seg = next_segment(len, offset)) != 0) { 1789 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg); 1790 if (ret < 0) 1791 return ret; 1792 offset = 0; 1793 len -= seg; 1794 data += seg; 1795 ++gfn; 1796 } 1797 return 0; 1798 } 1799 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest); 1800 1801 static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 1802 void *data, int offset, unsigned long len) 1803 { 1804 int r; 1805 unsigned long addr; 1806 1807 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 1808 if (kvm_is_error_hva(addr)) 1809 return -EFAULT; 1810 pagefault_disable(); 1811 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 1812 pagefault_enable(); 1813 if (r) 1814 return -EFAULT; 1815 return 0; 1816 } 1817 1818 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 1819 unsigned long len) 1820 { 1821 gfn_t gfn = gpa >> 
PAGE_SHIFT; 1822 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 1823 int offset = offset_in_page(gpa); 1824 1825 return __kvm_read_guest_atomic(slot, gfn, data, offset, len); 1826 } 1827 EXPORT_SYMBOL_GPL(kvm_read_guest_atomic); 1828 1829 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, 1830 void *data, unsigned long len) 1831 { 1832 gfn_t gfn = gpa >> PAGE_SHIFT; 1833 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1834 int offset = offset_in_page(gpa); 1835 1836 return __kvm_read_guest_atomic(slot, gfn, data, offset, len); 1837 } 1838 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); 1839 1840 static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn, 1841 const void *data, int offset, int len) 1842 { 1843 int r; 1844 unsigned long addr; 1845 1846 addr = gfn_to_hva_memslot(memslot, gfn); 1847 if (kvm_is_error_hva(addr)) 1848 return -EFAULT; 1849 r = __copy_to_user((void __user *)addr + offset, data, len); 1850 if (r) 1851 return -EFAULT; 1852 mark_page_dirty_in_slot(memslot, gfn); 1853 return 0; 1854 } 1855 1856 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, 1857 const void *data, int offset, int len) 1858 { 1859 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 1860 1861 return __kvm_write_guest_page(slot, gfn, data, offset, len); 1862 } 1863 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 1864 1865 int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, 1866 const void *data, int offset, int len) 1867 { 1868 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1869 1870 return __kvm_write_guest_page(slot, gfn, data, offset, len); 1871 } 1872 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page); 1873 1874 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 1875 unsigned long len) 1876 { 1877 gfn_t gfn = gpa >> PAGE_SHIFT; 1878 int seg; 1879 int offset = offset_in_page(gpa); 1880 int ret; 1881 1882 while ((seg = next_segment(len, offset)) != 0) { 1883 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 1884 if (ret < 0) 1885 return ret; 1886 offset = 0; 1887 len -= seg; 1888 data += seg; 1889 ++gfn; 1890 } 1891 return 0; 1892 } 1893 EXPORT_SYMBOL_GPL(kvm_write_guest); 1894 1895 int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, 1896 unsigned long len) 1897 { 1898 gfn_t gfn = gpa >> PAGE_SHIFT; 1899 int seg; 1900 int offset = offset_in_page(gpa); 1901 int ret; 1902 1903 while ((seg = next_segment(len, offset)) != 0) { 1904 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg); 1905 if (ret < 0) 1906 return ret; 1907 offset = 0; 1908 len -= seg; 1909 data += seg; 1910 ++gfn; 1911 } 1912 return 0; 1913 } 1914 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); 1915 1916 static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, 1917 struct gfn_to_hva_cache *ghc, 1918 gpa_t gpa, unsigned long len) 1919 { 1920 int offset = offset_in_page(gpa); 1921 gfn_t start_gfn = gpa >> PAGE_SHIFT; 1922 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; 1923 gfn_t nr_pages_needed = end_gfn - start_gfn + 1; 1924 gfn_t nr_pages_avail; 1925 1926 ghc->gpa = gpa; 1927 ghc->generation = slots->generation; 1928 ghc->len = len; 1929 ghc->memslot = __gfn_to_memslot(slots, start_gfn); 1930 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL); 1931 if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) { 1932 ghc->hva += offset; 1933 } else { 1934 /* 1935 * If the requested region crosses two memslots, we still 1936 * verify that the entire region is valid here. 
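 * (In that case the cached fast path is disabled by setting
 * ghc->memslot to NULL below, and kvm_read_guest_cached() /
 * kvm_write_guest_cached() transparently fall back to the uncached
 * kvm_read_guest() / kvm_write_guest() helpers.)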
1937 */ 1938 while (start_gfn <= end_gfn) { 1939 nr_pages_avail = 0; 1940 ghc->memslot = __gfn_to_memslot(slots, start_gfn); 1941 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, 1942 &nr_pages_avail); 1943 if (kvm_is_error_hva(ghc->hva)) 1944 return -EFAULT; 1945 start_gfn += nr_pages_avail; 1946 } 1947 /* Use the slow path for cross page reads and writes. */ 1948 ghc->memslot = NULL; 1949 } 1950 return 0; 1951 } 1952 1953 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1954 gpa_t gpa, unsigned long len) 1955 { 1956 struct kvm_memslots *slots = kvm_memslots(kvm); 1957 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); 1958 } 1959 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); 1960 1961 int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1962 void *data, int offset, unsigned long len) 1963 { 1964 struct kvm_memslots *slots = kvm_memslots(kvm); 1965 int r; 1966 gpa_t gpa = ghc->gpa + offset; 1967 1968 BUG_ON(len + offset > ghc->len); 1969 1970 if (slots->generation != ghc->generation) 1971 __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); 1972 1973 if (unlikely(!ghc->memslot)) 1974 return kvm_write_guest(kvm, gpa, data, len); 1975 1976 if (kvm_is_error_hva(ghc->hva)) 1977 return -EFAULT; 1978 1979 r = __copy_to_user((void __user *)ghc->hva + offset, data, len); 1980 if (r) 1981 return -EFAULT; 1982 mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT); 1983 1984 return 0; 1985 } 1986 EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); 1987 1988 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1989 void *data, unsigned long len) 1990 { 1991 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); 1992 } 1993 EXPORT_SYMBOL_GPL(kvm_write_guest_cached); 1994 1995 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1996 void *data, unsigned long len) 1997 { 1998 struct kvm_memslots *slots = kvm_memslots(kvm); 1999 int r; 2000 2001 BUG_ON(len > ghc->len); 2002 2003 if (slots->generation != ghc->generation) 2004 __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); 2005 2006 if (unlikely(!ghc->memslot)) 2007 return kvm_read_guest(kvm, ghc->gpa, data, len); 2008 2009 if (kvm_is_error_hva(ghc->hva)) 2010 return -EFAULT; 2011 2012 r = __copy_from_user(data, (void __user *)ghc->hva, len); 2013 if (r) 2014 return -EFAULT; 2015 2016 return 0; 2017 } 2018 EXPORT_SYMBOL_GPL(kvm_read_guest_cached); 2019 2020 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 2021 { 2022 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 2023 2024 return kvm_write_guest_page(kvm, gfn, zero_page, offset, len); 2025 } 2026 EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 2027 2028 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 2029 { 2030 gfn_t gfn = gpa >> PAGE_SHIFT; 2031 int seg; 2032 int offset = offset_in_page(gpa); 2033 int ret; 2034 2035 while ((seg = next_segment(len, offset)) != 0) { 2036 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 2037 if (ret < 0) 2038 return ret; 2039 offset = 0; 2040 len -= seg; 2041 ++gfn; 2042 } 2043 return 0; 2044 } 2045 EXPORT_SYMBOL_GPL(kvm_clear_guest); 2046 2047 static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, 2048 gfn_t gfn) 2049 { 2050 if (memslot && memslot->dirty_bitmap) { 2051 unsigned long rel_gfn = gfn - memslot->base_gfn; 2052 2053 set_bit_le(rel_gfn, memslot->dirty_bitmap); 2054 } 2055 } 2056 2057 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 2058 { 2059 struct 
kvm_memory_slot *memslot; 2060 2061 memslot = gfn_to_memslot(kvm, gfn); 2062 mark_page_dirty_in_slot(memslot, gfn); 2063 } 2064 EXPORT_SYMBOL_GPL(mark_page_dirty); 2065 2066 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) 2067 { 2068 struct kvm_memory_slot *memslot; 2069 2070 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2071 mark_page_dirty_in_slot(memslot, gfn); 2072 } 2073 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); 2074 2075 void kvm_sigset_activate(struct kvm_vcpu *vcpu) 2076 { 2077 if (!vcpu->sigset_active) 2078 return; 2079 2080 /* 2081 * This does a lockless modification of ->real_blocked, which is fine 2082 * because only current can change ->real_blocked and all readers of 2083 * ->real_blocked don't care as long as ->real_blocked is always a subset 2084 * of ->blocked. 2085 */ 2086 sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked); 2087 } 2088 2089 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu) 2090 { 2091 if (!vcpu->sigset_active) 2092 return; 2093 2094 sigprocmask(SIG_SETMASK, &current->real_blocked, NULL); 2095 sigemptyset(&current->real_blocked); 2096 } 2097 2098 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) 2099 { 2100 unsigned int old, val, grow; 2101 2102 old = val = vcpu->halt_poll_ns; 2103 grow = READ_ONCE(halt_poll_ns_grow); 2104 /* 10us base */ 2105 if (val == 0 && grow) 2106 val = 10000; 2107 else 2108 val *= grow; 2109 2110 if (val > halt_poll_ns) 2111 val = halt_poll_ns; 2112 2113 vcpu->halt_poll_ns = val; 2114 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old); 2115 } 2116 2117 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu) 2118 { 2119 unsigned int old, val, shrink; 2120 2121 old = val = vcpu->halt_poll_ns; 2122 shrink = READ_ONCE(halt_poll_ns_shrink); 2123 if (shrink == 0) 2124 val = 0; 2125 else 2126 val /= shrink; 2127 2128 vcpu->halt_poll_ns = val; 2129 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old); 2130 } 2131 2132 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) 2133 { 2134 int ret = -EINTR; 2135 int idx = srcu_read_lock(&vcpu->kvm->srcu); 2136 2137 if (kvm_arch_vcpu_runnable(vcpu)) { 2138 kvm_make_request(KVM_REQ_UNHALT, vcpu); 2139 goto out; 2140 } 2141 if (kvm_cpu_has_pending_timer(vcpu)) 2142 goto out; 2143 if (signal_pending(current)) 2144 goto out; 2145 2146 ret = 0; 2147 out: 2148 srcu_read_unlock(&vcpu->kvm->srcu, idx); 2149 return ret; 2150 } 2151 2152 /* 2153 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 2154 */ 2155 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 2156 { 2157 ktime_t start, cur; 2158 DECLARE_SWAITQUEUE(wait); 2159 bool waited = false; 2160 u64 block_ns; 2161 2162 start = cur = ktime_get(); 2163 if (vcpu->halt_poll_ns) { 2164 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns); 2165 2166 ++vcpu->stat.halt_attempted_poll; 2167 do { 2168 /* 2169 * This sets KVM_REQ_UNHALT if an interrupt 2170 * arrives.
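 *
 * Editorial note, an illustration that is not part of the original comment:
 * the poll window used here adapts per vCPU via grow_halt_poll_ns() and
 * shrink_halt_poll_ns() above.  With the default module parameters
 * (halt_poll_ns_grow == 2, halt_poll_ns_shrink == 0) it evolves roughly as
 *
 *	grow:   0 -> 10000 ns (10us base) -> 20000 -> ... capped at halt_poll_ns
 *	shrink: any value -> 0 (a shrink divisor of 0 disables polling again)
 *
 * so a vCPU that keeps waking up quickly polls for longer, while one long
 * block (or an invalid wakeup) resets it to no polling at all.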
2171 */ 2172 if (kvm_vcpu_check_block(vcpu) < 0) { 2173 ++vcpu->stat.halt_successful_poll; 2174 if (!vcpu_valid_wakeup(vcpu)) 2175 ++vcpu->stat.halt_poll_invalid; 2176 goto out; 2177 } 2178 cur = ktime_get(); 2179 } while (single_task_running() && ktime_before(cur, stop)); 2180 } 2181 2182 kvm_arch_vcpu_blocking(vcpu); 2183 2184 for (;;) { 2185 prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 2186 2187 if (kvm_vcpu_check_block(vcpu) < 0) 2188 break; 2189 2190 waited = true; 2191 schedule(); 2192 } 2193 2194 finish_swait(&vcpu->wq, &wait); 2195 cur = ktime_get(); 2196 2197 kvm_arch_vcpu_unblocking(vcpu); 2198 out: 2199 block_ns = ktime_to_ns(cur) - ktime_to_ns(start); 2200 2201 if (!vcpu_valid_wakeup(vcpu)) 2202 shrink_halt_poll_ns(vcpu); 2203 else if (halt_poll_ns) { 2204 if (block_ns <= vcpu->halt_poll_ns) 2205 ; 2206 /* we had a long block, shrink polling */ 2207 else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns) 2208 shrink_halt_poll_ns(vcpu); 2209 /* we had a short halt and our poll time is too small */ 2210 else if (vcpu->halt_poll_ns < halt_poll_ns && 2211 block_ns < halt_poll_ns) 2212 grow_halt_poll_ns(vcpu); 2213 } else 2214 vcpu->halt_poll_ns = 0; 2215 2216 trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu)); 2217 kvm_arch_vcpu_block_finish(vcpu); 2218 } 2219 EXPORT_SYMBOL_GPL(kvm_vcpu_block); 2220 2221 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) 2222 { 2223 struct swait_queue_head *wqp; 2224 2225 wqp = kvm_arch_vcpu_wq(vcpu); 2226 if (swq_has_sleeper(wqp)) { 2227 swake_up_one(wqp); 2228 ++vcpu->stat.halt_wakeup; 2229 return true; 2230 } 2231 2232 return false; 2233 } 2234 EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); 2235 2236 #ifndef CONFIG_S390 2237 /* 2238 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. 2239 */ 2240 void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 2241 { 2242 int me; 2243 int cpu = vcpu->cpu; 2244 2245 if (kvm_vcpu_wake_up(vcpu)) 2246 return; 2247 2248 me = get_cpu(); 2249 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 2250 if (kvm_arch_vcpu_should_kick(vcpu)) 2251 smp_send_reschedule(cpu); 2252 put_cpu(); 2253 } 2254 EXPORT_SYMBOL_GPL(kvm_vcpu_kick); 2255 #endif /* !CONFIG_S390 */ 2256 2257 int kvm_vcpu_yield_to(struct kvm_vcpu *target) 2258 { 2259 struct pid *pid; 2260 struct task_struct *task = NULL; 2261 int ret = 0; 2262 2263 rcu_read_lock(); 2264 pid = rcu_dereference(target->pid); 2265 if (pid) 2266 task = get_pid_task(pid, PIDTYPE_PID); 2267 rcu_read_unlock(); 2268 if (!task) 2269 return ret; 2270 ret = yield_to(task, 1); 2271 put_task_struct(task); 2272 2273 return ret; 2274 } 2275 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); 2276 2277 /* 2278 * Helper that checks whether a VCPU is eligible for directed yield. 2279 * The most eligible candidate to yield to is decided by the following heuristics: 2280 * 2281 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently 2282 * (preempted lock holder), indicated by @in_spin_loop. 2283 * Set at the beginning and cleared at the end of interception/PLE handler. 2284 * 2285 * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get 2286 * chance last time (mostly it has become eligible now since we have probably 2287 * yielded to lockholder in last iteration. This is done by toggling 2288 * @dy_eligible each time a VCPU is checked for eligibility.) 2289 * 2290 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding 2291 * to preempted lock-holder could result in wrong VCPU selection and CPU 2292 * burning.
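 *
 * (Editorial illustration, not part of the original comment: with
 * CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT the check below alternates for a
 * spinning vCPU.  A vCPU with in_spin_loop == 1 and dy_eligible == 0 is
 * reported "not eligible" and dy_eligible is flipped to 1, so the very next
 * check reports it "eligible" and flips the flag back.)
 *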
Giving priority for a potential lock-holder increases lock 2293 * progress. 2294 * 2295 * Since algorithm is based on heuristics, accessing another VCPU data without 2296 * locking does not harm. It may result in trying to yield to same VCPU, fail 2297 * and continue with next VCPU and so on. 2298 */ 2299 static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) 2300 { 2301 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT 2302 bool eligible; 2303 2304 eligible = !vcpu->spin_loop.in_spin_loop || 2305 vcpu->spin_loop.dy_eligible; 2306 2307 if (vcpu->spin_loop.in_spin_loop) 2308 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); 2309 2310 return eligible; 2311 #else 2312 return true; 2313 #endif 2314 } 2315 2316 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) 2317 { 2318 struct kvm *kvm = me->kvm; 2319 struct kvm_vcpu *vcpu; 2320 int last_boosted_vcpu = me->kvm->last_boosted_vcpu; 2321 int yielded = 0; 2322 int try = 3; 2323 int pass; 2324 int i; 2325 2326 kvm_vcpu_set_in_spin_loop(me, true); 2327 /* 2328 * We boost the priority of a VCPU that is runnable but not 2329 * currently running, because it got preempted by something 2330 * else and called schedule in __vcpu_run. Hopefully that 2331 * VCPU is holding the lock that we need and will release it. 2332 * We approximate round-robin by starting at the last boosted VCPU. 2333 */ 2334 for (pass = 0; pass < 2 && !yielded && try; pass++) { 2335 kvm_for_each_vcpu(i, vcpu, kvm) { 2336 if (!pass && i <= last_boosted_vcpu) { 2337 i = last_boosted_vcpu; 2338 continue; 2339 } else if (pass && i > last_boosted_vcpu) 2340 break; 2341 if (!READ_ONCE(vcpu->preempted)) 2342 continue; 2343 if (vcpu == me) 2344 continue; 2345 if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) 2346 continue; 2347 if (yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(vcpu)) 2348 continue; 2349 if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) 2350 continue; 2351 2352 yielded = kvm_vcpu_yield_to(vcpu); 2353 if (yielded > 0) { 2354 kvm->last_boosted_vcpu = i; 2355 break; 2356 } else if (yielded < 0) { 2357 try--; 2358 if (!try) 2359 break; 2360 } 2361 } 2362 } 2363 kvm_vcpu_set_in_spin_loop(me, false); 2364 2365 /* Ensure vcpu is not eligible during next spinloop */ 2366 kvm_vcpu_set_dy_eligible(me, false); 2367 } 2368 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); 2369 2370 static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf) 2371 { 2372 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data; 2373 struct page *page; 2374 2375 if (vmf->pgoff == 0) 2376 page = virt_to_page(vcpu->run); 2377 #ifdef CONFIG_X86 2378 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 2379 page = virt_to_page(vcpu->arch.pio_data); 2380 #endif 2381 #ifdef CONFIG_KVM_MMIO 2382 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 2383 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 2384 #endif 2385 else 2386 return kvm_arch_vcpu_fault(vcpu, vmf); 2387 get_page(page); 2388 vmf->page = page; 2389 return 0; 2390 } 2391 2392 static const struct vm_operations_struct kvm_vcpu_vm_ops = { 2393 .fault = kvm_vcpu_fault, 2394 }; 2395 2396 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 2397 { 2398 vma->vm_ops = &kvm_vcpu_vm_ops; 2399 return 0; 2400 } 2401 2402 static int kvm_vcpu_release(struct inode *inode, struct file *filp) 2403 { 2404 struct kvm_vcpu *vcpu = filp->private_data; 2405 2406 debugfs_remove_recursive(vcpu->debugfs_dentry); 2407 kvm_put_kvm(vcpu->kvm); 2408 return 0; 2409 } 2410 2411 static struct file_operations kvm_vcpu_fops = { 
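/*
 * Editor's illustrative sketch, not part of the driver: userspace reaches
 * kvm_vcpu_fault() above by mmap()ing the vcpu fd.  Offset 0 yields the
 * shared struct kvm_run, and the total mapping size comes from
 * KVM_GET_VCPU_MMAP_SIZE on the /dev/kvm fd (see kvm_dev_ioctl() further
 * down):
 *
 *	long size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *	struct kvm_run *run = mmap(NULL, size, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu_fd, 0);
 */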
2412 .release = kvm_vcpu_release, 2413 .unlocked_ioctl = kvm_vcpu_ioctl, 2414 .mmap = kvm_vcpu_mmap, 2415 .llseek = noop_llseek, 2416 KVM_COMPAT(kvm_vcpu_compat_ioctl), 2417 }; 2418 2419 /* 2420 * Allocates an inode for the vcpu. 2421 */ 2422 static int create_vcpu_fd(struct kvm_vcpu *vcpu) 2423 { 2424 char name[8 + 1 + ITOA_MAX_LEN + 1]; 2425 2426 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id); 2427 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); 2428 } 2429 2430 static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) 2431 { 2432 char dir_name[ITOA_MAX_LEN * 2]; 2433 int ret; 2434 2435 if (!kvm_arch_has_vcpu_debugfs()) 2436 return 0; 2437 2438 if (!debugfs_initialized()) 2439 return 0; 2440 2441 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id); 2442 vcpu->debugfs_dentry = debugfs_create_dir(dir_name, 2443 vcpu->kvm->debugfs_dentry); 2444 if (!vcpu->debugfs_dentry) 2445 return -ENOMEM; 2446 2447 ret = kvm_arch_create_vcpu_debugfs(vcpu); 2448 if (ret < 0) { 2449 debugfs_remove_recursive(vcpu->debugfs_dentry); 2450 return ret; 2451 } 2452 2453 return 0; 2454 } 2455 2456 /* 2457 * Creates some virtual cpus. Good luck creating more than one. 2458 */ 2459 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) 2460 { 2461 int r; 2462 struct kvm_vcpu *vcpu; 2463 2464 if (id >= KVM_MAX_VCPU_ID) 2465 return -EINVAL; 2466 2467 mutex_lock(&kvm->lock); 2468 if (kvm->created_vcpus == KVM_MAX_VCPUS) { 2469 mutex_unlock(&kvm->lock); 2470 return -EINVAL; 2471 } 2472 2473 kvm->created_vcpus++; 2474 mutex_unlock(&kvm->lock); 2475 2476 vcpu = kvm_arch_vcpu_create(kvm, id); 2477 if (IS_ERR(vcpu)) { 2478 r = PTR_ERR(vcpu); 2479 goto vcpu_decrement; 2480 } 2481 2482 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 2483 2484 r = kvm_arch_vcpu_setup(vcpu); 2485 if (r) 2486 goto vcpu_destroy; 2487 2488 r = kvm_create_vcpu_debugfs(vcpu); 2489 if (r) 2490 goto vcpu_destroy; 2491 2492 mutex_lock(&kvm->lock); 2493 if (kvm_get_vcpu_by_id(kvm, id)) { 2494 r = -EEXIST; 2495 goto unlock_vcpu_destroy; 2496 } 2497 2498 BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); 2499 2500 /* Now it's all set up, let userspace reach it */ 2501 kvm_get_kvm(kvm); 2502 r = create_vcpu_fd(vcpu); 2503 if (r < 0) { 2504 kvm_put_kvm(kvm); 2505 goto unlock_vcpu_destroy; 2506 } 2507 2508 kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; 2509 2510 /* 2511 * Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus 2512 * before kvm->online_vcpu's incremented value. 
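 *
 * Editorial illustration, not part of the original comment: the reader side
 * in kvm_get_vcpu() (include/linux/kvm_host.h) is roughly
 *
 *	num = atomic_read(&kvm->online_vcpus);
 *	smp_rmb();
 *	return (i < num) ? kvm->vcpus[i] : NULL;
 *
 * so a reader that observes the incremented count is guaranteed to also
 * observe the vcpus[] entry stored just above.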
2513 */ 2514 smp_wmb(); 2515 atomic_inc(&kvm->online_vcpus); 2516 2517 mutex_unlock(&kvm->lock); 2518 kvm_arch_vcpu_postcreate(vcpu); 2519 return r; 2520 2521 unlock_vcpu_destroy: 2522 mutex_unlock(&kvm->lock); 2523 debugfs_remove_recursive(vcpu->debugfs_dentry); 2524 vcpu_destroy: 2525 kvm_arch_vcpu_destroy(vcpu); 2526 vcpu_decrement: 2527 mutex_lock(&kvm->lock); 2528 kvm->created_vcpus--; 2529 mutex_unlock(&kvm->lock); 2530 return r; 2531 } 2532 2533 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 2534 { 2535 if (sigset) { 2536 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 2537 vcpu->sigset_active = 1; 2538 vcpu->sigset = *sigset; 2539 } else 2540 vcpu->sigset_active = 0; 2541 return 0; 2542 } 2543 2544 static long kvm_vcpu_ioctl(struct file *filp, 2545 unsigned int ioctl, unsigned long arg) 2546 { 2547 struct kvm_vcpu *vcpu = filp->private_data; 2548 void __user *argp = (void __user *)arg; 2549 int r; 2550 struct kvm_fpu *fpu = NULL; 2551 struct kvm_sregs *kvm_sregs = NULL; 2552 2553 if (vcpu->kvm->mm != current->mm) 2554 return -EIO; 2555 2556 if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) 2557 return -EINVAL; 2558 2559 /* 2560 * Some architectures have vcpu ioctls that are asynchronous to vcpu 2561 * execution; mutex_lock() would break them. 2562 */ 2563 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg); 2564 if (r != -ENOIOCTLCMD) 2565 return r; 2566 2567 if (mutex_lock_killable(&vcpu->mutex)) 2568 return -EINTR; 2569 switch (ioctl) { 2570 case KVM_RUN: { 2571 struct pid *oldpid; 2572 r = -EINVAL; 2573 if (arg) 2574 goto out; 2575 oldpid = rcu_access_pointer(vcpu->pid); 2576 if (unlikely(oldpid != task_pid(current))) { 2577 /* The thread running this VCPU changed. */ 2578 struct pid *newpid; 2579 2580 r = kvm_arch_vcpu_run_pid_change(vcpu); 2581 if (r) 2582 break; 2583 2584 newpid = get_task_pid(current, PIDTYPE_PID); 2585 rcu_assign_pointer(vcpu->pid, newpid); 2586 if (oldpid) 2587 synchronize_rcu(); 2588 put_pid(oldpid); 2589 } 2590 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 2591 trace_kvm_userspace_exit(vcpu->run->exit_reason, r); 2592 break; 2593 } 2594 case KVM_GET_REGS: { 2595 struct kvm_regs *kvm_regs; 2596 2597 r = -ENOMEM; 2598 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 2599 if (!kvm_regs) 2600 goto out; 2601 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 2602 if (r) 2603 goto out_free1; 2604 r = -EFAULT; 2605 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 2606 goto out_free1; 2607 r = 0; 2608 out_free1: 2609 kfree(kvm_regs); 2610 break; 2611 } 2612 case KVM_SET_REGS: { 2613 struct kvm_regs *kvm_regs; 2614 2615 r = -ENOMEM; 2616 kvm_regs = memdup_user(argp, sizeof(*kvm_regs)); 2617 if (IS_ERR(kvm_regs)) { 2618 r = PTR_ERR(kvm_regs); 2619 goto out; 2620 } 2621 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 2622 kfree(kvm_regs); 2623 break; 2624 } 2625 case KVM_GET_SREGS: { 2626 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 2627 r = -ENOMEM; 2628 if (!kvm_sregs) 2629 goto out; 2630 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 2631 if (r) 2632 goto out; 2633 r = -EFAULT; 2634 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 2635 goto out; 2636 r = 0; 2637 break; 2638 } 2639 case KVM_SET_SREGS: { 2640 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); 2641 if (IS_ERR(kvm_sregs)) { 2642 r = PTR_ERR(kvm_sregs); 2643 kvm_sregs = NULL; 2644 goto out; 2645 } 2646 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 2647 break; 2648 } 2649 case KVM_GET_MP_STATE: { 2650 struct 
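/*
 * Editor's illustrative sketch, not part of the driver: the userspace side of
 * the KVM_RUN case above is normally a simple loop on the vcpu fd, with the
 * shared struct kvm_run mapped as described at kvm_vcpu_mmap():
 *
 *	struct kvm_run *run = ...;	// from mmap() of the vcpu fd
 *
 *	for (;;) {
 *		ioctl(vcpu_fd, KVM_RUN, 0);
 *		switch (run->exit_reason) {
 *		case KVM_EXIT_IO:	// emulate the port access
 *			break;
 *		case KVM_EXIT_MMIO:	// emulate the memory access
 *			break;
 *		default:
 *			break;
 *		}
 *	}
 */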
kvm_mp_state mp_state; 2651 2652 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 2653 if (r) 2654 goto out; 2655 r = -EFAULT; 2656 if (copy_to_user(argp, &mp_state, sizeof(mp_state))) 2657 goto out; 2658 r = 0; 2659 break; 2660 } 2661 case KVM_SET_MP_STATE: { 2662 struct kvm_mp_state mp_state; 2663 2664 r = -EFAULT; 2665 if (copy_from_user(&mp_state, argp, sizeof(mp_state))) 2666 goto out; 2667 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 2668 break; 2669 } 2670 case KVM_TRANSLATE: { 2671 struct kvm_translation tr; 2672 2673 r = -EFAULT; 2674 if (copy_from_user(&tr, argp, sizeof(tr))) 2675 goto out; 2676 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 2677 if (r) 2678 goto out; 2679 r = -EFAULT; 2680 if (copy_to_user(argp, &tr, sizeof(tr))) 2681 goto out; 2682 r = 0; 2683 break; 2684 } 2685 case KVM_SET_GUEST_DEBUG: { 2686 struct kvm_guest_debug dbg; 2687 2688 r = -EFAULT; 2689 if (copy_from_user(&dbg, argp, sizeof(dbg))) 2690 goto out; 2691 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 2692 break; 2693 } 2694 case KVM_SET_SIGNAL_MASK: { 2695 struct kvm_signal_mask __user *sigmask_arg = argp; 2696 struct kvm_signal_mask kvm_sigmask; 2697 sigset_t sigset, *p; 2698 2699 p = NULL; 2700 if (argp) { 2701 r = -EFAULT; 2702 if (copy_from_user(&kvm_sigmask, argp, 2703 sizeof(kvm_sigmask))) 2704 goto out; 2705 r = -EINVAL; 2706 if (kvm_sigmask.len != sizeof(sigset)) 2707 goto out; 2708 r = -EFAULT; 2709 if (copy_from_user(&sigset, sigmask_arg->sigset, 2710 sizeof(sigset))) 2711 goto out; 2712 p = &sigset; 2713 } 2714 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); 2715 break; 2716 } 2717 case KVM_GET_FPU: { 2718 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 2719 r = -ENOMEM; 2720 if (!fpu) 2721 goto out; 2722 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); 2723 if (r) 2724 goto out; 2725 r = -EFAULT; 2726 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) 2727 goto out; 2728 r = 0; 2729 break; 2730 } 2731 case KVM_SET_FPU: { 2732 fpu = memdup_user(argp, sizeof(*fpu)); 2733 if (IS_ERR(fpu)) { 2734 r = PTR_ERR(fpu); 2735 fpu = NULL; 2736 goto out; 2737 } 2738 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 2739 break; 2740 } 2741 default: 2742 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 2743 } 2744 out: 2745 mutex_unlock(&vcpu->mutex); 2746 kfree(fpu); 2747 kfree(kvm_sregs); 2748 return r; 2749 } 2750 2751 #ifdef CONFIG_KVM_COMPAT 2752 static long kvm_vcpu_compat_ioctl(struct file *filp, 2753 unsigned int ioctl, unsigned long arg) 2754 { 2755 struct kvm_vcpu *vcpu = filp->private_data; 2756 void __user *argp = compat_ptr(arg); 2757 int r; 2758 2759 if (vcpu->kvm->mm != current->mm) 2760 return -EIO; 2761 2762 switch (ioctl) { 2763 case KVM_SET_SIGNAL_MASK: { 2764 struct kvm_signal_mask __user *sigmask_arg = argp; 2765 struct kvm_signal_mask kvm_sigmask; 2766 sigset_t sigset; 2767 2768 if (argp) { 2769 r = -EFAULT; 2770 if (copy_from_user(&kvm_sigmask, argp, 2771 sizeof(kvm_sigmask))) 2772 goto out; 2773 r = -EINVAL; 2774 if (kvm_sigmask.len != sizeof(compat_sigset_t)) 2775 goto out; 2776 r = -EFAULT; 2777 if (get_compat_sigset(&sigset, (void *)sigmask_arg->sigset)) 2778 goto out; 2779 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 2780 } else 2781 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL); 2782 break; 2783 } 2784 default: 2785 r = kvm_vcpu_ioctl(filp, ioctl, arg); 2786 } 2787 2788 out: 2789 return r; 2790 } 2791 #endif 2792 2793 static int kvm_device_ioctl_attr(struct kvm_device *dev, 2794 int (*accessor)(struct kvm_device *dev, 2795 struct kvm_device_attr *attr), 2796 unsigned long arg) 
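/*
 * Editor's illustrative sketch, not part of the driver: the three attr ioctls
 * dispatched below are driven from userspace through the fd returned by
 * KVM_CREATE_DEVICE; the group/attr values are device specific:
 *
 *	struct kvm_device_attr attr = {
 *		.group = ...,
 *		.attr  = ...,
 *		.addr  = (__u64)(unsigned long)&value,
 *	};
 *
 *	if (!ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attr))
 *		ioctl(dev_fd, KVM_SET_DEVICE_ATTR, &attr);
 */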
2797 { 2798 struct kvm_device_attr attr; 2799 2800 if (!accessor) 2801 return -EPERM; 2802 2803 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) 2804 return -EFAULT; 2805 2806 return accessor(dev, &attr); 2807 } 2808 2809 static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, 2810 unsigned long arg) 2811 { 2812 struct kvm_device *dev = filp->private_data; 2813 2814 switch (ioctl) { 2815 case KVM_SET_DEVICE_ATTR: 2816 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg); 2817 case KVM_GET_DEVICE_ATTR: 2818 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg); 2819 case KVM_HAS_DEVICE_ATTR: 2820 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg); 2821 default: 2822 if (dev->ops->ioctl) 2823 return dev->ops->ioctl(dev, ioctl, arg); 2824 2825 return -ENOTTY; 2826 } 2827 } 2828 2829 static int kvm_device_release(struct inode *inode, struct file *filp) 2830 { 2831 struct kvm_device *dev = filp->private_data; 2832 struct kvm *kvm = dev->kvm; 2833 2834 kvm_put_kvm(kvm); 2835 return 0; 2836 } 2837 2838 static const struct file_operations kvm_device_fops = { 2839 .unlocked_ioctl = kvm_device_ioctl, 2840 .release = kvm_device_release, 2841 KVM_COMPAT(kvm_device_ioctl), 2842 }; 2843 2844 struct kvm_device *kvm_device_from_filp(struct file *filp) 2845 { 2846 if (filp->f_op != &kvm_device_fops) 2847 return NULL; 2848 2849 return filp->private_data; 2850 } 2851 2852 static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { 2853 #ifdef CONFIG_KVM_MPIC 2854 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, 2855 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, 2856 #endif 2857 }; 2858 2859 int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) 2860 { 2861 if (type >= ARRAY_SIZE(kvm_device_ops_table)) 2862 return -ENOSPC; 2863 2864 if (kvm_device_ops_table[type] != NULL) 2865 return -EEXIST; 2866 2867 kvm_device_ops_table[type] = ops; 2868 return 0; 2869 } 2870 2871 void kvm_unregister_device_ops(u32 type) 2872 { 2873 if (kvm_device_ops_table[type] != NULL) 2874 kvm_device_ops_table[type] = NULL; 2875 } 2876 2877 static int kvm_ioctl_create_device(struct kvm *kvm, 2878 struct kvm_create_device *cd) 2879 { 2880 struct kvm_device_ops *ops = NULL; 2881 struct kvm_device *dev; 2882 bool test = cd->flags & KVM_CREATE_DEVICE_TEST; 2883 int ret; 2884 2885 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) 2886 return -ENODEV; 2887 2888 ops = kvm_device_ops_table[cd->type]; 2889 if (ops == NULL) 2890 return -ENODEV; 2891 2892 if (test) 2893 return 0; 2894 2895 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 2896 if (!dev) 2897 return -ENOMEM; 2898 2899 dev->ops = ops; 2900 dev->kvm = kvm; 2901 2902 mutex_lock(&kvm->lock); 2903 ret = ops->create(dev, cd->type); 2904 if (ret < 0) { 2905 mutex_unlock(&kvm->lock); 2906 kfree(dev); 2907 return ret; 2908 } 2909 list_add(&dev->vm_node, &kvm->devices); 2910 mutex_unlock(&kvm->lock); 2911 2912 if (ops->init) 2913 ops->init(dev); 2914 2915 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); 2916 if (ret < 0) { 2917 mutex_lock(&kvm->lock); 2918 list_del(&dev->vm_node); 2919 mutex_unlock(&kvm->lock); 2920 ops->destroy(dev); 2921 return ret; 2922 } 2923 2924 kvm_get_kvm(kvm); 2925 cd->fd = ret; 2926 return 0; 2927 } 2928 2929 static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) 2930 { 2931 switch (arg) { 2932 case KVM_CAP_USER_MEMORY: 2933 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 2934 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 2935 case KVM_CAP_INTERNAL_ERROR_DATA: 2936 #ifdef 
CONFIG_HAVE_KVM_MSI 2937 case KVM_CAP_SIGNAL_MSI: 2938 #endif 2939 #ifdef CONFIG_HAVE_KVM_IRQFD 2940 case KVM_CAP_IRQFD: 2941 case KVM_CAP_IRQFD_RESAMPLE: 2942 #endif 2943 case KVM_CAP_IOEVENTFD_ANY_LENGTH: 2944 case KVM_CAP_CHECK_EXTENSION_VM: 2945 return 1; 2946 #ifdef CONFIG_KVM_MMIO 2947 case KVM_CAP_COALESCED_MMIO: 2948 return KVM_COALESCED_MMIO_PAGE_OFFSET; 2949 #endif 2950 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 2951 case KVM_CAP_IRQ_ROUTING: 2952 return KVM_MAX_IRQ_ROUTES; 2953 #endif 2954 #if KVM_ADDRESS_SPACE_NUM > 1 2955 case KVM_CAP_MULTI_ADDRESS_SPACE: 2956 return KVM_ADDRESS_SPACE_NUM; 2957 #endif 2958 case KVM_CAP_MAX_VCPU_ID: 2959 return KVM_MAX_VCPU_ID; 2960 default: 2961 break; 2962 } 2963 return kvm_vm_ioctl_check_extension(kvm, arg); 2964 } 2965 2966 static long kvm_vm_ioctl(struct file *filp, 2967 unsigned int ioctl, unsigned long arg) 2968 { 2969 struct kvm *kvm = filp->private_data; 2970 void __user *argp = (void __user *)arg; 2971 int r; 2972 2973 if (kvm->mm != current->mm) 2974 return -EIO; 2975 switch (ioctl) { 2976 case KVM_CREATE_VCPU: 2977 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 2978 break; 2979 case KVM_SET_USER_MEMORY_REGION: { 2980 struct kvm_userspace_memory_region kvm_userspace_mem; 2981 2982 r = -EFAULT; 2983 if (copy_from_user(&kvm_userspace_mem, argp, 2984 sizeof(kvm_userspace_mem))) 2985 goto out; 2986 2987 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); 2988 break; 2989 } 2990 case KVM_GET_DIRTY_LOG: { 2991 struct kvm_dirty_log log; 2992 2993 r = -EFAULT; 2994 if (copy_from_user(&log, argp, sizeof(log))) 2995 goto out; 2996 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2997 break; 2998 } 2999 #ifdef CONFIG_KVM_MMIO 3000 case KVM_REGISTER_COALESCED_MMIO: { 3001 struct kvm_coalesced_mmio_zone zone; 3002 3003 r = -EFAULT; 3004 if (copy_from_user(&zone, argp, sizeof(zone))) 3005 goto out; 3006 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 3007 break; 3008 } 3009 case KVM_UNREGISTER_COALESCED_MMIO: { 3010 struct kvm_coalesced_mmio_zone zone; 3011 3012 r = -EFAULT; 3013 if (copy_from_user(&zone, argp, sizeof(zone))) 3014 goto out; 3015 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 3016 break; 3017 } 3018 #endif 3019 case KVM_IRQFD: { 3020 struct kvm_irqfd data; 3021 3022 r = -EFAULT; 3023 if (copy_from_user(&data, argp, sizeof(data))) 3024 goto out; 3025 r = kvm_irqfd(kvm, &data); 3026 break; 3027 } 3028 case KVM_IOEVENTFD: { 3029 struct kvm_ioeventfd data; 3030 3031 r = -EFAULT; 3032 if (copy_from_user(&data, argp, sizeof(data))) 3033 goto out; 3034 r = kvm_ioeventfd(kvm, &data); 3035 break; 3036 } 3037 #ifdef CONFIG_HAVE_KVM_MSI 3038 case KVM_SIGNAL_MSI: { 3039 struct kvm_msi msi; 3040 3041 r = -EFAULT; 3042 if (copy_from_user(&msi, argp, sizeof(msi))) 3043 goto out; 3044 r = kvm_send_userspace_msi(kvm, &msi); 3045 break; 3046 } 3047 #endif 3048 #ifdef __KVM_HAVE_IRQ_LINE 3049 case KVM_IRQ_LINE_STATUS: 3050 case KVM_IRQ_LINE: { 3051 struct kvm_irq_level irq_event; 3052 3053 r = -EFAULT; 3054 if (copy_from_user(&irq_event, argp, sizeof(irq_event))) 3055 goto out; 3056 3057 r = kvm_vm_ioctl_irq_line(kvm, &irq_event, 3058 ioctl == KVM_IRQ_LINE_STATUS); 3059 if (r) 3060 goto out; 3061 3062 r = -EFAULT; 3063 if (ioctl == KVM_IRQ_LINE_STATUS) { 3064 if (copy_to_user(argp, &irq_event, sizeof(irq_event))) 3065 goto out; 3066 } 3067 3068 r = 0; 3069 break; 3070 } 3071 #endif 3072 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 3073 case KVM_SET_GSI_ROUTING: { 3074 struct kvm_irq_routing routing; 3075 struct kvm_irq_routing __user *urouting; 3076 struct 
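/*
 * Editor's illustrative sketch, not part of the driver: the
 * KVM_SET_USER_MEMORY_REGION case handled earlier in this ioctl is how
 * userspace backs guest physical memory with an ordinary mapping, e.g.:
 *
 *	void *ram = mmap(NULL, ram_size, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	struct kvm_userspace_memory_region region = {
 *		.slot            = 0,
 *		.guest_phys_addr = 0,
 *		.memory_size     = ram_size,
 *		.userspace_addr  = (__u64)(unsigned long)ram,
 *	};
 *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
 */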
kvm_irq_routing_entry *entries = NULL; 3077 3078 r = -EFAULT; 3079 if (copy_from_user(&routing, argp, sizeof(routing))) 3080 goto out; 3081 r = -EINVAL; 3082 if (!kvm_arch_can_set_irq_routing(kvm)) 3083 goto out; 3084 if (routing.nr > KVM_MAX_IRQ_ROUTES) 3085 goto out; 3086 if (routing.flags) 3087 goto out; 3088 if (routing.nr) { 3089 r = -ENOMEM; 3090 entries = vmalloc(array_size(sizeof(*entries), 3091 routing.nr)); 3092 if (!entries) 3093 goto out; 3094 r = -EFAULT; 3095 urouting = argp; 3096 if (copy_from_user(entries, urouting->entries, 3097 routing.nr * sizeof(*entries))) 3098 goto out_free_irq_routing; 3099 } 3100 r = kvm_set_irq_routing(kvm, entries, routing.nr, 3101 routing.flags); 3102 out_free_irq_routing: 3103 vfree(entries); 3104 break; 3105 } 3106 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ 3107 case KVM_CREATE_DEVICE: { 3108 struct kvm_create_device cd; 3109 3110 r = -EFAULT; 3111 if (copy_from_user(&cd, argp, sizeof(cd))) 3112 goto out; 3113 3114 r = kvm_ioctl_create_device(kvm, &cd); 3115 if (r) 3116 goto out; 3117 3118 r = -EFAULT; 3119 if (copy_to_user(argp, &cd, sizeof(cd))) 3120 goto out; 3121 3122 r = 0; 3123 break; 3124 } 3125 case KVM_CHECK_EXTENSION: 3126 r = kvm_vm_ioctl_check_extension_generic(kvm, arg); 3127 break; 3128 default: 3129 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 3130 } 3131 out: 3132 return r; 3133 } 3134 3135 #ifdef CONFIG_KVM_COMPAT 3136 struct compat_kvm_dirty_log { 3137 __u32 slot; 3138 __u32 padding1; 3139 union { 3140 compat_uptr_t dirty_bitmap; /* one bit per page */ 3141 __u64 padding2; 3142 }; 3143 }; 3144 3145 static long kvm_vm_compat_ioctl(struct file *filp, 3146 unsigned int ioctl, unsigned long arg) 3147 { 3148 struct kvm *kvm = filp->private_data; 3149 int r; 3150 3151 if (kvm->mm != current->mm) 3152 return -EIO; 3153 switch (ioctl) { 3154 case KVM_GET_DIRTY_LOG: { 3155 struct compat_kvm_dirty_log compat_log; 3156 struct kvm_dirty_log log; 3157 3158 if (copy_from_user(&compat_log, (void __user *)arg, 3159 sizeof(compat_log))) 3160 return -EFAULT; 3161 log.slot = compat_log.slot; 3162 log.padding1 = compat_log.padding1; 3163 log.padding2 = compat_log.padding2; 3164 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); 3165 3166 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 3167 break; 3168 } 3169 default: 3170 r = kvm_vm_ioctl(filp, ioctl, arg); 3171 } 3172 return r; 3173 } 3174 #endif 3175 3176 static struct file_operations kvm_vm_fops = { 3177 .release = kvm_vm_release, 3178 .unlocked_ioctl = kvm_vm_ioctl, 3179 .llseek = noop_llseek, 3180 KVM_COMPAT(kvm_vm_compat_ioctl), 3181 }; 3182 3183 static int kvm_dev_ioctl_create_vm(unsigned long type) 3184 { 3185 int r; 3186 struct kvm *kvm; 3187 struct file *file; 3188 3189 kvm = kvm_create_vm(type); 3190 if (IS_ERR(kvm)) 3191 return PTR_ERR(kvm); 3192 #ifdef CONFIG_KVM_MMIO 3193 r = kvm_coalesced_mmio_init(kvm); 3194 if (r < 0) 3195 goto put_kvm; 3196 #endif 3197 r = get_unused_fd_flags(O_CLOEXEC); 3198 if (r < 0) 3199 goto put_kvm; 3200 3201 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); 3202 if (IS_ERR(file)) { 3203 put_unused_fd(r); 3204 r = PTR_ERR(file); 3205 goto put_kvm; 3206 } 3207 3208 /* 3209 * Don't call kvm_put_kvm anymore at this point; file->f_op is 3210 * already set, with ->release() being kvm_vm_release(). In error 3211 * cases it will be called by the final fput(file) and will take 3212 * care of doing kvm_put_kvm(kvm). 
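 *
 * Editorial illustration, not part of the original comment: from userspace
 * the chain above is reached as
 *
 *	kvm_fd  = open("/dev/kvm", O_RDWR);
 *	vm_fd   = ioctl(kvm_fd, KVM_CREATE_VM, 0);
 *	vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
 *
 * with the VM fd and every vCPU fd each holding their own reference on the
 * underlying kvm structure.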
3213 */ 3214 if (kvm_create_vm_debugfs(kvm, r) < 0) { 3215 put_unused_fd(r); 3216 fput(file); 3217 return -ENOMEM; 3218 } 3219 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm); 3220 3221 fd_install(r, file); 3222 return r; 3223 3224 put_kvm: 3225 kvm_put_kvm(kvm); 3226 return r; 3227 } 3228 3229 static long kvm_dev_ioctl(struct file *filp, 3230 unsigned int ioctl, unsigned long arg) 3231 { 3232 long r = -EINVAL; 3233 3234 switch (ioctl) { 3235 case KVM_GET_API_VERSION: 3236 if (arg) 3237 goto out; 3238 r = KVM_API_VERSION; 3239 break; 3240 case KVM_CREATE_VM: 3241 r = kvm_dev_ioctl_create_vm(arg); 3242 break; 3243 case KVM_CHECK_EXTENSION: 3244 r = kvm_vm_ioctl_check_extension_generic(NULL, arg); 3245 break; 3246 case KVM_GET_VCPU_MMAP_SIZE: 3247 if (arg) 3248 goto out; 3249 r = PAGE_SIZE; /* struct kvm_run */ 3250 #ifdef CONFIG_X86 3251 r += PAGE_SIZE; /* pio data page */ 3252 #endif 3253 #ifdef CONFIG_KVM_MMIO 3254 r += PAGE_SIZE; /* coalesced mmio ring page */ 3255 #endif 3256 break; 3257 case KVM_TRACE_ENABLE: 3258 case KVM_TRACE_PAUSE: 3259 case KVM_TRACE_DISABLE: 3260 r = -EOPNOTSUPP; 3261 break; 3262 default: 3263 return kvm_arch_dev_ioctl(filp, ioctl, arg); 3264 } 3265 out: 3266 return r; 3267 } 3268 3269 static struct file_operations kvm_chardev_ops = { 3270 .unlocked_ioctl = kvm_dev_ioctl, 3271 .llseek = noop_llseek, 3272 KVM_COMPAT(kvm_dev_ioctl), 3273 }; 3274 3275 static struct miscdevice kvm_dev = { 3276 KVM_MINOR, 3277 "kvm", 3278 &kvm_chardev_ops, 3279 }; 3280 3281 static void hardware_enable_nolock(void *junk) 3282 { 3283 int cpu = raw_smp_processor_id(); 3284 int r; 3285 3286 if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) 3287 return; 3288 3289 cpumask_set_cpu(cpu, cpus_hardware_enabled); 3290 3291 r = kvm_arch_hardware_enable(); 3292 3293 if (r) { 3294 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 3295 atomic_inc(&hardware_enable_failed); 3296 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu); 3297 } 3298 } 3299 3300 static int kvm_starting_cpu(unsigned int cpu) 3301 { 3302 raw_spin_lock(&kvm_count_lock); 3303 if (kvm_usage_count) 3304 hardware_enable_nolock(NULL); 3305 raw_spin_unlock(&kvm_count_lock); 3306 return 0; 3307 } 3308 3309 static void hardware_disable_nolock(void *junk) 3310 { 3311 int cpu = raw_smp_processor_id(); 3312 3313 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) 3314 return; 3315 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 3316 kvm_arch_hardware_disable(); 3317 } 3318 3319 static int kvm_dying_cpu(unsigned int cpu) 3320 { 3321 raw_spin_lock(&kvm_count_lock); 3322 if (kvm_usage_count) 3323 hardware_disable_nolock(NULL); 3324 raw_spin_unlock(&kvm_count_lock); 3325 return 0; 3326 } 3327 3328 static void hardware_disable_all_nolock(void) 3329 { 3330 BUG_ON(!kvm_usage_count); 3331 3332 kvm_usage_count--; 3333 if (!kvm_usage_count) 3334 on_each_cpu(hardware_disable_nolock, NULL, 1); 3335 } 3336 3337 static void hardware_disable_all(void) 3338 { 3339 raw_spin_lock(&kvm_count_lock); 3340 hardware_disable_all_nolock(); 3341 raw_spin_unlock(&kvm_count_lock); 3342 } 3343 3344 static int hardware_enable_all(void) 3345 { 3346 int r = 0; 3347 3348 raw_spin_lock(&kvm_count_lock); 3349 3350 kvm_usage_count++; 3351 if (kvm_usage_count == 1) { 3352 atomic_set(&hardware_enable_failed, 0); 3353 on_each_cpu(hardware_enable_nolock, NULL, 1); 3354 3355 if (atomic_read(&hardware_enable_failed)) { 3356 hardware_disable_all_nolock(); 3357 r = -EBUSY; 3358 } 3359 } 3360 3361 raw_spin_unlock(&kvm_count_lock); 3362 3363 return r; 3364 } 3365 3366 static 
int kvm_reboot(struct notifier_block *notifier, unsigned long val, 3367 void *v) 3368 { 3369 /* 3370 * Some (well, at least mine) BIOSes hang on reboot if 3371 * in vmx root mode. 3372 * 3373 * And Intel TXT required VMX off for all cpu when system shutdown. 3374 */ 3375 pr_info("kvm: exiting hardware virtualization\n"); 3376 kvm_rebooting = true; 3377 on_each_cpu(hardware_disable_nolock, NULL, 1); 3378 return NOTIFY_OK; 3379 } 3380 3381 static struct notifier_block kvm_reboot_notifier = { 3382 .notifier_call = kvm_reboot, 3383 .priority = 0, 3384 }; 3385 3386 static void kvm_io_bus_destroy(struct kvm_io_bus *bus) 3387 { 3388 int i; 3389 3390 for (i = 0; i < bus->dev_count; i++) { 3391 struct kvm_io_device *pos = bus->range[i].dev; 3392 3393 kvm_iodevice_destructor(pos); 3394 } 3395 kfree(bus); 3396 } 3397 3398 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1, 3399 const struct kvm_io_range *r2) 3400 { 3401 gpa_t addr1 = r1->addr; 3402 gpa_t addr2 = r2->addr; 3403 3404 if (addr1 < addr2) 3405 return -1; 3406 3407 /* If r2->len == 0, match the exact address. If r2->len != 0, 3408 * accept any overlapping write. Any order is acceptable for 3409 * overlapping ranges, because kvm_io_bus_get_first_dev ensures 3410 * we process all of them. 3411 */ 3412 if (r2->len) { 3413 addr1 += r1->len; 3414 addr2 += r2->len; 3415 } 3416 3417 if (addr1 > addr2) 3418 return 1; 3419 3420 return 0; 3421 } 3422 3423 static int kvm_io_bus_sort_cmp(const void *p1, const void *p2) 3424 { 3425 return kvm_io_bus_cmp(p1, p2); 3426 } 3427 3428 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, 3429 gpa_t addr, int len) 3430 { 3431 struct kvm_io_range *range, key; 3432 int off; 3433 3434 key = (struct kvm_io_range) { 3435 .addr = addr, 3436 .len = len, 3437 }; 3438 3439 range = bsearch(&key, bus->range, bus->dev_count, 3440 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp); 3441 if (range == NULL) 3442 return -ENOENT; 3443 3444 off = range - bus->range; 3445 3446 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0) 3447 off--; 3448 3449 return off; 3450 } 3451 3452 static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 3453 struct kvm_io_range *range, const void *val) 3454 { 3455 int idx; 3456 3457 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 3458 if (idx < 0) 3459 return -EOPNOTSUPP; 3460 3461 while (idx < bus->dev_count && 3462 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 3463 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr, 3464 range->len, val)) 3465 return idx; 3466 idx++; 3467 } 3468 3469 return -EOPNOTSUPP; 3470 } 3471 3472 /* kvm_io_bus_write - called under kvm->slots_lock */ 3473 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 3474 int len, const void *val) 3475 { 3476 struct kvm_io_bus *bus; 3477 struct kvm_io_range range; 3478 int r; 3479 3480 range = (struct kvm_io_range) { 3481 .addr = addr, 3482 .len = len, 3483 }; 3484 3485 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 3486 if (!bus) 3487 return -ENOMEM; 3488 r = __kvm_io_bus_write(vcpu, bus, &range, val); 3489 return r < 0 ? 
r : 0; 3490 } 3491 3492 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */ 3493 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, 3494 gpa_t addr, int len, const void *val, long cookie) 3495 { 3496 struct kvm_io_bus *bus; 3497 struct kvm_io_range range; 3498 3499 range = (struct kvm_io_range) { 3500 .addr = addr, 3501 .len = len, 3502 }; 3503 3504 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 3505 if (!bus) 3506 return -ENOMEM; 3507 3508 /* First try the device referenced by cookie. */ 3509 if ((cookie >= 0) && (cookie < bus->dev_count) && 3510 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0)) 3511 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len, 3512 val)) 3513 return cookie; 3514 3515 /* 3516 * cookie contained garbage; fall back to search and return the 3517 * correct cookie value. 3518 */ 3519 return __kvm_io_bus_write(vcpu, bus, &range, val); 3520 } 3521 3522 static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 3523 struct kvm_io_range *range, void *val) 3524 { 3525 int idx; 3526 3527 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 3528 if (idx < 0) 3529 return -EOPNOTSUPP; 3530 3531 while (idx < bus->dev_count && 3532 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 3533 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr, 3534 range->len, val)) 3535 return idx; 3536 idx++; 3537 } 3538 3539 return -EOPNOTSUPP; 3540 } 3541 EXPORT_SYMBOL_GPL(kvm_io_bus_write); 3542 3543 /* kvm_io_bus_read - called under kvm->slots_lock */ 3544 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 3545 int len, void *val) 3546 { 3547 struct kvm_io_bus *bus; 3548 struct kvm_io_range range; 3549 int r; 3550 3551 range = (struct kvm_io_range) { 3552 .addr = addr, 3553 .len = len, 3554 }; 3555 3556 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 3557 if (!bus) 3558 return -ENOMEM; 3559 r = __kvm_io_bus_read(vcpu, bus, &range, val); 3560 return r < 0 ? r : 0; 3561 } 3562 3563 3564 /* Caller must hold slots_lock. */ 3565 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 3566 int len, struct kvm_io_device *dev) 3567 { 3568 int i; 3569 struct kvm_io_bus *new_bus, *bus; 3570 struct kvm_io_range range; 3571 3572 bus = kvm_get_bus(kvm, bus_idx); 3573 if (!bus) 3574 return -ENOMEM; 3575 3576 /* exclude ioeventfd which is limited by maximum fd */ 3577 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) 3578 return -ENOSPC; 3579 3580 new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) * 3581 sizeof(struct kvm_io_range)), GFP_KERNEL); 3582 if (!new_bus) 3583 return -ENOMEM; 3584 3585 range = (struct kvm_io_range) { 3586 .addr = addr, 3587 .len = len, 3588 .dev = dev, 3589 }; 3590 3591 for (i = 0; i < bus->dev_count; i++) 3592 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0) 3593 break; 3594 3595 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); 3596 new_bus->dev_count++; 3597 new_bus->range[i] = range; 3598 memcpy(new_bus->range + i + 1, bus->range + i, 3599 (bus->dev_count - i) * sizeof(struct kvm_io_range)); 3600 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 3601 synchronize_srcu_expedited(&kvm->srcu); 3602 kfree(bus); 3603 3604 return 0; 3605 } 3606 3607 /* Caller must hold slots_lock. 
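 *
 * Editorial note, not part of the original comment: like the register path
 * above, the unregister path below never modifies a bus array in place.  It
 * builds a new array and publishes it with
 *
 *	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
 *	synchronize_srcu_expedited(&kvm->srcu);
 *	kfree(bus);
 *
 * so SRCU readers see either the complete old array or the complete new one.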
*/ 3608 void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, 3609 struct kvm_io_device *dev) 3610 { 3611 int i; 3612 struct kvm_io_bus *new_bus, *bus; 3613 3614 bus = kvm_get_bus(kvm, bus_idx); 3615 if (!bus) 3616 return; 3617 3618 for (i = 0; i < bus->dev_count; i++) 3619 if (bus->range[i].dev == dev) { 3620 break; 3621 } 3622 3623 if (i == bus->dev_count) 3624 return; 3625 3626 new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) * 3627 sizeof(struct kvm_io_range)), GFP_KERNEL); 3628 if (!new_bus) { 3629 pr_err("kvm: failed to shrink bus, removing it completely\n"); 3630 goto broken; 3631 } 3632 3633 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); 3634 new_bus->dev_count--; 3635 memcpy(new_bus->range + i, bus->range + i + 1, 3636 (new_bus->dev_count - i) * sizeof(struct kvm_io_range)); 3637 3638 broken: 3639 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 3640 synchronize_srcu_expedited(&kvm->srcu); 3641 kfree(bus); 3642 return; 3643 } 3644 3645 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, 3646 gpa_t addr) 3647 { 3648 struct kvm_io_bus *bus; 3649 int dev_idx, srcu_idx; 3650 struct kvm_io_device *iodev = NULL; 3651 3652 srcu_idx = srcu_read_lock(&kvm->srcu); 3653 3654 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); 3655 if (!bus) 3656 goto out_unlock; 3657 3658 dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1); 3659 if (dev_idx < 0) 3660 goto out_unlock; 3661 3662 iodev = bus->range[dev_idx].dev; 3663 3664 out_unlock: 3665 srcu_read_unlock(&kvm->srcu, srcu_idx); 3666 3667 return iodev; 3668 } 3669 EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev); 3670 3671 static int kvm_debugfs_open(struct inode *inode, struct file *file, 3672 int (*get)(void *, u64 *), int (*set)(void *, u64), 3673 const char *fmt) 3674 { 3675 struct kvm_stat_data *stat_data = (struct kvm_stat_data *) 3676 inode->i_private; 3677 3678 /* The debugfs files are a reference to the kvm struct which 3679 * is still valid when kvm_destroy_vm is called. 3680 * To avoid the race between open and the removal of the debugfs 3681 * directory we test against the users count. 
3682 */ 3683 if (!refcount_inc_not_zero(&stat_data->kvm->users_count)) 3684 return -ENOENT; 3685 3686 if (simple_attr_open(inode, file, get, set, fmt)) { 3687 kvm_put_kvm(stat_data->kvm); 3688 return -ENOMEM; 3689 } 3690 3691 return 0; 3692 } 3693 3694 static int kvm_debugfs_release(struct inode *inode, struct file *file) 3695 { 3696 struct kvm_stat_data *stat_data = (struct kvm_stat_data *) 3697 inode->i_private; 3698 3699 simple_attr_release(inode, file); 3700 kvm_put_kvm(stat_data->kvm); 3701 3702 return 0; 3703 } 3704 3705 static int vm_stat_get_per_vm(void *data, u64 *val) 3706 { 3707 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 3708 3709 *val = *(ulong *)((void *)stat_data->kvm + stat_data->offset); 3710 3711 return 0; 3712 } 3713 3714 static int vm_stat_clear_per_vm(void *data, u64 val) 3715 { 3716 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 3717 3718 if (val) 3719 return -EINVAL; 3720 3721 *(ulong *)((void *)stat_data->kvm + stat_data->offset) = 0; 3722 3723 return 0; 3724 } 3725 3726 static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file) 3727 { 3728 __simple_attr_check_format("%llu\n", 0ull); 3729 return kvm_debugfs_open(inode, file, vm_stat_get_per_vm, 3730 vm_stat_clear_per_vm, "%llu\n"); 3731 } 3732 3733 static const struct file_operations vm_stat_get_per_vm_fops = { 3734 .owner = THIS_MODULE, 3735 .open = vm_stat_get_per_vm_open, 3736 .release = kvm_debugfs_release, 3737 .read = simple_attr_read, 3738 .write = simple_attr_write, 3739 .llseek = no_llseek, 3740 }; 3741 3742 static int vcpu_stat_get_per_vm(void *data, u64 *val) 3743 { 3744 int i; 3745 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 3746 struct kvm_vcpu *vcpu; 3747 3748 *val = 0; 3749 3750 kvm_for_each_vcpu(i, vcpu, stat_data->kvm) 3751 *val += *(u64 *)((void *)vcpu + stat_data->offset); 3752 3753 return 0; 3754 } 3755 3756 static int vcpu_stat_clear_per_vm(void *data, u64 val) 3757 { 3758 int i; 3759 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 3760 struct kvm_vcpu *vcpu; 3761 3762 if (val) 3763 return -EINVAL; 3764 3765 kvm_for_each_vcpu(i, vcpu, stat_data->kvm) 3766 *(u64 *)((void *)vcpu + stat_data->offset) = 0; 3767 3768 return 0; 3769 } 3770 3771 static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file) 3772 { 3773 __simple_attr_check_format("%llu\n", 0ull); 3774 return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm, 3775 vcpu_stat_clear_per_vm, "%llu\n"); 3776 } 3777 3778 static const struct file_operations vcpu_stat_get_per_vm_fops = { 3779 .owner = THIS_MODULE, 3780 .open = vcpu_stat_get_per_vm_open, 3781 .release = kvm_debugfs_release, 3782 .read = simple_attr_read, 3783 .write = simple_attr_write, 3784 .llseek = no_llseek, 3785 }; 3786 3787 static const struct file_operations *stat_fops_per_vm[] = { 3788 [KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops, 3789 [KVM_STAT_VM] = &vm_stat_get_per_vm_fops, 3790 }; 3791 3792 static int vm_stat_get(void *_offset, u64 *val) 3793 { 3794 unsigned offset = (long)_offset; 3795 struct kvm *kvm; 3796 struct kvm_stat_data stat_tmp = {.offset = offset}; 3797 u64 tmp_val; 3798 3799 *val = 0; 3800 spin_lock(&kvm_lock); 3801 list_for_each_entry(kvm, &vm_list, vm_list) { 3802 stat_tmp.kvm = kvm; 3803 vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val); 3804 *val += tmp_val; 3805 } 3806 spin_unlock(&kvm_lock); 3807 return 0; 3808 } 3809 3810 static int vm_stat_clear(void *_offset, u64 val) 3811 { 3812 unsigned offset = (long)_offset; 3813 struct kvm *kvm; 3814 struct 
kvm_stat_data stat_tmp = {.offset = offset}; 3815 3816 if (val) 3817 return -EINVAL; 3818 3819 spin_lock(&kvm_lock); 3820 list_for_each_entry(kvm, &vm_list, vm_list) { 3821 stat_tmp.kvm = kvm; 3822 vm_stat_clear_per_vm((void *)&stat_tmp, 0); 3823 } 3824 spin_unlock(&kvm_lock); 3825 3826 return 0; 3827 } 3828 3829 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n"); 3830 3831 static int vcpu_stat_get(void *_offset, u64 *val) 3832 { 3833 unsigned offset = (long)_offset; 3834 struct kvm *kvm; 3835 struct kvm_stat_data stat_tmp = {.offset = offset}; 3836 u64 tmp_val; 3837 3838 *val = 0; 3839 spin_lock(&kvm_lock); 3840 list_for_each_entry(kvm, &vm_list, vm_list) { 3841 stat_tmp.kvm = kvm; 3842 vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val); 3843 *val += tmp_val; 3844 } 3845 spin_unlock(&kvm_lock); 3846 return 0; 3847 } 3848 3849 static int vcpu_stat_clear(void *_offset, u64 val) 3850 { 3851 unsigned offset = (long)_offset; 3852 struct kvm *kvm; 3853 struct kvm_stat_data stat_tmp = {.offset = offset}; 3854 3855 if (val) 3856 return -EINVAL; 3857 3858 spin_lock(&kvm_lock); 3859 list_for_each_entry(kvm, &vm_list, vm_list) { 3860 stat_tmp.kvm = kvm; 3861 vcpu_stat_clear_per_vm((void *)&stat_tmp, 0); 3862 } 3863 spin_unlock(&kvm_lock); 3864 3865 return 0; 3866 } 3867 3868 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear, 3869 "%llu\n"); 3870 3871 static const struct file_operations *stat_fops[] = { 3872 [KVM_STAT_VCPU] = &vcpu_stat_fops, 3873 [KVM_STAT_VM] = &vm_stat_fops, 3874 }; 3875 3876 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) 3877 { 3878 struct kobj_uevent_env *env; 3879 unsigned long long created, active; 3880 3881 if (!kvm_dev.this_device || !kvm) 3882 return; 3883 3884 spin_lock(&kvm_lock); 3885 if (type == KVM_EVENT_CREATE_VM) { 3886 kvm_createvm_count++; 3887 kvm_active_vms++; 3888 } else if (type == KVM_EVENT_DESTROY_VM) { 3889 kvm_active_vms--; 3890 } 3891 created = kvm_createvm_count; 3892 active = kvm_active_vms; 3893 spin_unlock(&kvm_lock); 3894 3895 env = kzalloc(sizeof(*env), GFP_KERNEL); 3896 if (!env) 3897 return; 3898 3899 add_uevent_var(env, "CREATED=%llu", created); 3900 add_uevent_var(env, "COUNT=%llu", active); 3901 3902 if (type == KVM_EVENT_CREATE_VM) { 3903 add_uevent_var(env, "EVENT=create"); 3904 kvm->userspace_pid = task_pid_nr(current); 3905 } else if (type == KVM_EVENT_DESTROY_VM) { 3906 add_uevent_var(env, "EVENT=destroy"); 3907 } 3908 add_uevent_var(env, "PID=%d", kvm->userspace_pid); 3909 3910 if (kvm->debugfs_dentry) { 3911 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL); 3912 3913 if (p) { 3914 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX); 3915 if (!IS_ERR(tmp)) 3916 add_uevent_var(env, "STATS_PATH=%s", tmp); 3917 kfree(p); 3918 } 3919 } 3920 /* no need for checks, since we are adding at most only 5 keys */ 3921 env->envp[env->envp_idx++] = NULL; 3922 kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp); 3923 kfree(env); 3924 } 3925 3926 static void kvm_init_debug(void) 3927 { 3928 struct kvm_stats_debugfs_item *p; 3929 3930 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); 3931 3932 kvm_debugfs_num_entries = 0; 3933 for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) { 3934 debugfs_create_file(p->name, 0644, kvm_debugfs_dir, 3935 (void *)(long)p->offset, 3936 stat_fops[p->kind]); 3937 } 3938 } 3939 3940 static int kvm_suspend(void) 3941 { 3942 if (kvm_usage_count) 3943 hardware_disable_nolock(NULL); 3944 return 0; 3945 } 3946 3947 static 
void kvm_resume(void) 3948 { 3949 if (kvm_usage_count) { 3950 WARN_ON(raw_spin_is_locked(&kvm_count_lock)); 3951 hardware_enable_nolock(NULL); 3952 } 3953 } 3954 3955 static struct syscore_ops kvm_syscore_ops = { 3956 .suspend = kvm_suspend, 3957 .resume = kvm_resume, 3958 }; 3959 3960 static inline 3961 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) 3962 { 3963 return container_of(pn, struct kvm_vcpu, preempt_notifier); 3964 } 3965 3966 static void kvm_sched_in(struct preempt_notifier *pn, int cpu) 3967 { 3968 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 3969 3970 if (vcpu->preempted) 3971 vcpu->preempted = false; 3972 3973 kvm_arch_sched_in(vcpu, cpu); 3974 3975 kvm_arch_vcpu_load(vcpu, cpu); 3976 } 3977 3978 static void kvm_sched_out(struct preempt_notifier *pn, 3979 struct task_struct *next) 3980 { 3981 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 3982 3983 if (current->state == TASK_RUNNING) 3984 vcpu->preempted = true; 3985 kvm_arch_vcpu_put(vcpu); 3986 } 3987 3988 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, 3989 struct module *module) 3990 { 3991 int r; 3992 int cpu; 3993 3994 r = kvm_arch_init(opaque); 3995 if (r) 3996 goto out_fail; 3997 3998 /* 3999 * kvm_arch_init makes sure there's at most one caller 4000 * for architectures that support multiple implementations, 4001 * like intel and amd on x86. 4002 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating 4003 * conflicts in case kvm is already setup for another implementation. 4004 */ 4005 r = kvm_irqfd_init(); 4006 if (r) 4007 goto out_irqfd; 4008 4009 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { 4010 r = -ENOMEM; 4011 goto out_free_0; 4012 } 4013 4014 r = kvm_arch_hardware_setup(); 4015 if (r < 0) 4016 goto out_free_0a; 4017 4018 for_each_online_cpu(cpu) { 4019 smp_call_function_single(cpu, 4020 kvm_arch_check_processor_compat, 4021 &r, 1); 4022 if (r < 0) 4023 goto out_free_1; 4024 } 4025 4026 r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting", 4027 kvm_starting_cpu, kvm_dying_cpu); 4028 if (r) 4029 goto out_free_2; 4030 register_reboot_notifier(&kvm_reboot_notifier); 4031 4032 /* A kmem cache lets us meet the alignment requirements of fx_save. 
*/ 4033 if (!vcpu_align) 4034 vcpu_align = __alignof__(struct kvm_vcpu); 4035 kvm_vcpu_cache = 4036 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align, 4037 SLAB_ACCOUNT, 4038 offsetof(struct kvm_vcpu, arch), 4039 sizeof_field(struct kvm_vcpu, arch), 4040 NULL); 4041 if (!kvm_vcpu_cache) { 4042 r = -ENOMEM; 4043 goto out_free_3; 4044 } 4045 4046 r = kvm_async_pf_init(); 4047 if (r) 4048 goto out_free; 4049 4050 kvm_chardev_ops.owner = module; 4051 kvm_vm_fops.owner = module; 4052 kvm_vcpu_fops.owner = module; 4053 4054 r = misc_register(&kvm_dev); 4055 if (r) { 4056 pr_err("kvm: misc device register failed\n"); 4057 goto out_unreg; 4058 } 4059 4060 register_syscore_ops(&kvm_syscore_ops); 4061 4062 kvm_preempt_ops.sched_in = kvm_sched_in; 4063 kvm_preempt_ops.sched_out = kvm_sched_out; 4064 4065 kvm_init_debug(); 4066 4067 r = kvm_vfio_ops_init(); 4068 WARN_ON(r); 4069 4070 return 0; 4071 4072 out_unreg: 4073 kvm_async_pf_deinit(); 4074 out_free: 4075 kmem_cache_destroy(kvm_vcpu_cache); 4076 out_free_3: 4077 unregister_reboot_notifier(&kvm_reboot_notifier); 4078 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); 4079 out_free_2: 4080 out_free_1: 4081 kvm_arch_hardware_unsetup(); 4082 out_free_0a: 4083 free_cpumask_var(cpus_hardware_enabled); 4084 out_free_0: 4085 kvm_irqfd_exit(); 4086 out_irqfd: 4087 kvm_arch_exit(); 4088 out_fail: 4089 return r; 4090 } 4091 EXPORT_SYMBOL_GPL(kvm_init); 4092 4093 void kvm_exit(void) 4094 { 4095 debugfs_remove_recursive(kvm_debugfs_dir); 4096 misc_deregister(&kvm_dev); 4097 kmem_cache_destroy(kvm_vcpu_cache); 4098 kvm_async_pf_deinit(); 4099 unregister_syscore_ops(&kvm_syscore_ops); 4100 unregister_reboot_notifier(&kvm_reboot_notifier); 4101 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); 4102 on_each_cpu(hardware_disable_nolock, NULL, 1); 4103 kvm_arch_hardware_unsetup(); 4104 kvm_arch_exit(); 4105 kvm_irqfd_exit(); 4106 free_cpumask_var(cpus_hardware_enabled); 4107 kvm_vfio_ops_exit(); 4108 } 4109 EXPORT_SYMBOL_GPL(kvm_exit); 4110
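/*
 * Editor's illustrative sketch, not part of the driver: kvm_init()/kvm_exit()
 * are called from the architecture modules' init and exit paths.  The VMX
 * names below are only an example of such a caller:
 *
 *	static int __init vmx_init(void)
 *	{
 *		return kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
 *				__alignof__(struct vcpu_vmx), THIS_MODULE);
 *	}
 *
 *	static void __exit vmx_exit(void)
 *	{
 *		kvm_exit();
 *	}
 *
 *	module_init(vmx_init);
 *	module_exit(vmx_exit);
 */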