1 /* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * This module enables machines with Intel VT-x extensions to run virtual 5 * machines without emulation or binary translation. 6 * 7 * Copyright (C) 2006 Qumranet, Inc. 8 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 9 * 10 * Authors: 11 * Avi Kivity <avi@qumranet.com> 12 * Yaniv Kamay <yaniv@qumranet.com> 13 * 14 * This work is licensed under the terms of the GNU GPL, version 2. See 15 * the COPYING file in the top-level directory. 16 * 17 */ 18 19 #include <kvm/iodev.h> 20 21 #include <linux/kvm_host.h> 22 #include <linux/kvm.h> 23 #include <linux/module.h> 24 #include <linux/errno.h> 25 #include <linux/percpu.h> 26 #include <linux/mm.h> 27 #include <linux/miscdevice.h> 28 #include <linux/vmalloc.h> 29 #include <linux/reboot.h> 30 #include <linux/debugfs.h> 31 #include <linux/highmem.h> 32 #include <linux/file.h> 33 #include <linux/syscore_ops.h> 34 #include <linux/cpu.h> 35 #include <linux/sched/signal.h> 36 #include <linux/sched/mm.h> 37 #include <linux/sched/stat.h> 38 #include <linux/cpumask.h> 39 #include <linux/smp.h> 40 #include <linux/anon_inodes.h> 41 #include <linux/profile.h> 42 #include <linux/kvm_para.h> 43 #include <linux/pagemap.h> 44 #include <linux/mman.h> 45 #include <linux/swap.h> 46 #include <linux/bitops.h> 47 #include <linux/spinlock.h> 48 #include <linux/compat.h> 49 #include <linux/srcu.h> 50 #include <linux/hugetlb.h> 51 #include <linux/slab.h> 52 #include <linux/sort.h> 53 #include <linux/bsearch.h> 54 55 #include <asm/processor.h> 56 #include <asm/io.h> 57 #include <asm/ioctl.h> 58 #include <linux/uaccess.h> 59 #include <asm/pgtable.h> 60 61 #include "coalesced_mmio.h" 62 #include "async_pf.h" 63 #include "vfio.h" 64 65 #define CREATE_TRACE_POINTS 66 #include <trace/events/kvm.h> 67 68 /* Worst case buffer size needed for holding an integer. */ 69 #define ITOA_MAX_LEN 12 70 71 MODULE_AUTHOR("Qumranet"); 72 MODULE_LICENSE("GPL"); 73 74 /* Architectures should define their poll value according to the halt latency */ 75 unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT; 76 module_param(halt_poll_ns, uint, 0644); 77 EXPORT_SYMBOL_GPL(halt_poll_ns); 78 79 /* Default doubles per-vcpu halt_poll_ns. */ 80 unsigned int halt_poll_ns_grow = 2; 81 module_param(halt_poll_ns_grow, uint, 0644); 82 EXPORT_SYMBOL_GPL(halt_poll_ns_grow); 83 84 /* Default resets per-vcpu halt_poll_ns . 
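* (With the default shrink value of 0, shrink_halt_poll_ns() resets the per-vcpu halt_poll_ns to 0 rather than dividing it; see kvm_vcpu_block() below.)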
*/ 85 unsigned int halt_poll_ns_shrink; 86 module_param(halt_poll_ns_shrink, uint, 0644); 87 EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); 88 89 /* 90 * Ordering of locks: 91 * 92 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock 93 */ 94 95 DEFINE_SPINLOCK(kvm_lock); 96 static DEFINE_RAW_SPINLOCK(kvm_count_lock); 97 LIST_HEAD(vm_list); 98 99 static cpumask_var_t cpus_hardware_enabled; 100 static int kvm_usage_count; 101 static atomic_t hardware_enable_failed; 102 103 struct kmem_cache *kvm_vcpu_cache; 104 EXPORT_SYMBOL_GPL(kvm_vcpu_cache); 105 106 static __read_mostly struct preempt_ops kvm_preempt_ops; 107 108 struct dentry *kvm_debugfs_dir; 109 EXPORT_SYMBOL_GPL(kvm_debugfs_dir); 110 111 static int kvm_debugfs_num_entries; 112 static const struct file_operations *stat_fops_per_vm[]; 113 114 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 115 unsigned long arg); 116 #ifdef CONFIG_KVM_COMPAT 117 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, 118 unsigned long arg); 119 #define KVM_COMPAT(c) .compat_ioctl = (c) 120 #else 121 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl, 122 unsigned long arg) { return -EINVAL; } 123 #define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl 124 #endif 125 static int hardware_enable_all(void); 126 static void hardware_disable_all(void); 127 128 static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 129 130 static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn); 131 132 __visible bool kvm_rebooting; 133 EXPORT_SYMBOL_GPL(kvm_rebooting); 134 135 static bool largepages_enabled = true; 136 137 #define KVM_EVENT_CREATE_VM 0 138 #define KVM_EVENT_DESTROY_VM 1 139 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); 140 static unsigned long long kvm_createvm_count; 141 static unsigned long long kvm_active_vms; 142 143 __weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, 144 unsigned long start, unsigned long end, bool blockable) 145 { 146 return 0; 147 } 148 149 bool kvm_is_reserved_pfn(kvm_pfn_t pfn) 150 { 151 if (pfn_valid(pfn)) 152 return PageReserved(pfn_to_page(pfn)); 153 154 return true; 155 } 156 157 /* 158 * Switches to specified vcpu, until a matching vcpu_put() 159 */ 160 void vcpu_load(struct kvm_vcpu *vcpu) 161 { 162 int cpu = get_cpu(); 163 preempt_notifier_register(&vcpu->preempt_notifier); 164 kvm_arch_vcpu_load(vcpu, cpu); 165 put_cpu(); 166 } 167 EXPORT_SYMBOL_GPL(vcpu_load); 168 169 void vcpu_put(struct kvm_vcpu *vcpu) 170 { 171 preempt_disable(); 172 kvm_arch_vcpu_put(vcpu); 173 preempt_notifier_unregister(&vcpu->preempt_notifier); 174 preempt_enable(); 175 } 176 EXPORT_SYMBOL_GPL(vcpu_put); 177 178 /* TODO: merge with kvm_arch_vcpu_should_kick */ 179 static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req) 180 { 181 int mode = kvm_vcpu_exiting_guest_mode(vcpu); 182 183 /* 184 * We need to wait for the VCPU to reenable interrupts and get out of 185 * READING_SHADOW_PAGE_TABLES mode. 186 */ 187 if (req & KVM_REQUEST_WAIT) 188 return mode != OUTSIDE_GUEST_MODE; 189 190 /* 191 * Need to kick a running VCPU, but otherwise there is nothing to do. 
192 */ 193 return mode == IN_GUEST_MODE; 194 } 195 196 static void ack_flush(void *_completed) 197 { 198 } 199 200 static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait) 201 { 202 if (unlikely(!cpus)) 203 cpus = cpu_online_mask; 204 205 if (cpumask_empty(cpus)) 206 return false; 207 208 smp_call_function_many(cpus, ack_flush, NULL, wait); 209 return true; 210 } 211 212 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, 213 unsigned long *vcpu_bitmap, cpumask_var_t tmp) 214 { 215 int i, cpu, me; 216 struct kvm_vcpu *vcpu; 217 bool called; 218 219 me = get_cpu(); 220 221 kvm_for_each_vcpu(i, vcpu, kvm) { 222 if (vcpu_bitmap && !test_bit(i, vcpu_bitmap)) 223 continue; 224 225 kvm_make_request(req, vcpu); 226 cpu = vcpu->cpu; 227 228 if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu)) 229 continue; 230 231 if (tmp != NULL && cpu != -1 && cpu != me && 232 kvm_request_needs_ipi(vcpu, req)) 233 __cpumask_set_cpu(cpu, tmp); 234 } 235 236 called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT)); 237 put_cpu(); 238 239 return called; 240 } 241 242 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) 243 { 244 cpumask_var_t cpus; 245 bool called; 246 247 zalloc_cpumask_var(&cpus, GFP_ATOMIC); 248 249 called = kvm_make_vcpus_request_mask(kvm, req, NULL, cpus); 250 251 free_cpumask_var(cpus); 252 return called; 253 } 254 255 #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL 256 void kvm_flush_remote_tlbs(struct kvm *kvm) 257 { 258 /* 259 * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in 260 * kvm_make_all_cpus_request. 261 */ 262 long dirty_count = smp_load_acquire(&kvm->tlbs_dirty); 263 264 /* 265 * We want to publish modifications to the page tables before reading 266 * mode. Pairs with a memory barrier in arch-specific code. 267 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest 268 * and smp_mb in walk_shadow_page_lockless_begin/end. 269 * - powerpc: smp_mb in kvmppc_prepare_to_enter. 270 * 271 * There is already an smp_mb__after_atomic() before 272 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that 273 * barrier here. 
274 */ 275 if (!kvm_arch_flush_remote_tlb(kvm) 276 || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 277 ++kvm->stat.remote_tlb_flush; 278 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); 279 } 280 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); 281 #endif 282 283 void kvm_reload_remote_mmus(struct kvm *kvm) 284 { 285 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 286 } 287 288 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 289 { 290 struct page *page; 291 int r; 292 293 mutex_init(&vcpu->mutex); 294 vcpu->cpu = -1; 295 vcpu->kvm = kvm; 296 vcpu->vcpu_id = id; 297 vcpu->pid = NULL; 298 init_swait_queue_head(&vcpu->wq); 299 kvm_async_pf_vcpu_init(vcpu); 300 301 vcpu->pre_pcpu = -1; 302 INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); 303 304 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 305 if (!page) { 306 r = -ENOMEM; 307 goto fail; 308 } 309 vcpu->run = page_address(page); 310 311 kvm_vcpu_set_in_spin_loop(vcpu, false); 312 kvm_vcpu_set_dy_eligible(vcpu, false); 313 vcpu->preempted = false; 314 315 r = kvm_arch_vcpu_init(vcpu); 316 if (r < 0) 317 goto fail_free_run; 318 return 0; 319 320 fail_free_run: 321 free_page((unsigned long)vcpu->run); 322 fail: 323 return r; 324 } 325 EXPORT_SYMBOL_GPL(kvm_vcpu_init); 326 327 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) 328 { 329 /* 330 * no need for rcu_read_lock as VCPU_RUN is the only place that 331 * will change the vcpu->pid pointer and on uninit all file 332 * descriptors are already gone. 333 */ 334 put_pid(rcu_dereference_protected(vcpu->pid, 1)); 335 kvm_arch_vcpu_uninit(vcpu); 336 free_page((unsigned long)vcpu->run); 337 } 338 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); 339 340 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 341 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) 342 { 343 return container_of(mn, struct kvm, mmu_notifier); 344 } 345 346 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, 347 struct mm_struct *mm, 348 unsigned long address, 349 pte_t pte) 350 { 351 struct kvm *kvm = mmu_notifier_to_kvm(mn); 352 int idx; 353 354 idx = srcu_read_lock(&kvm->srcu); 355 spin_lock(&kvm->mmu_lock); 356 kvm->mmu_notifier_seq++; 357 kvm_set_spte_hva(kvm, address, pte); 358 spin_unlock(&kvm->mmu_lock); 359 srcu_read_unlock(&kvm->srcu, idx); 360 } 361 362 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, 363 struct mm_struct *mm, 364 unsigned long start, 365 unsigned long end, 366 bool blockable) 367 { 368 struct kvm *kvm = mmu_notifier_to_kvm(mn); 369 int need_tlb_flush = 0, idx; 370 int ret; 371 372 idx = srcu_read_lock(&kvm->srcu); 373 spin_lock(&kvm->mmu_lock); 374 /* 375 * The count increase must become visible at unlock time as no 376 * spte can be established without taking the mmu_lock and 377 * count is also read inside the mmu_lock critical section. 
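* (mmu_notifier_retry(), called from the page fault path under mmu_lock, checks mmu_notifier_count together with mmu_notifier_seq to detect a concurrent invalidation and retry the fault.)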
378 */ 379 kvm->mmu_notifier_count++; 380 need_tlb_flush = kvm_unmap_hva_range(kvm, start, end); 381 need_tlb_flush |= kvm->tlbs_dirty; 382 /* we've to flush the tlb before the pages can be freed */ 383 if (need_tlb_flush) 384 kvm_flush_remote_tlbs(kvm); 385 386 spin_unlock(&kvm->mmu_lock); 387 388 ret = kvm_arch_mmu_notifier_invalidate_range(kvm, start, end, blockable); 389 390 srcu_read_unlock(&kvm->srcu, idx); 391 392 return ret; 393 } 394 395 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 396 struct mm_struct *mm, 397 unsigned long start, 398 unsigned long end) 399 { 400 struct kvm *kvm = mmu_notifier_to_kvm(mn); 401 402 spin_lock(&kvm->mmu_lock); 403 /* 404 * This sequence increase will notify the kvm page fault that 405 * the page that is going to be mapped in the spte could have 406 * been freed. 407 */ 408 kvm->mmu_notifier_seq++; 409 smp_wmb(); 410 /* 411 * The above sequence increase must be visible before the 412 * below count decrease, which is ensured by the smp_wmb above 413 * in conjunction with the smp_rmb in mmu_notifier_retry(). 414 */ 415 kvm->mmu_notifier_count--; 416 spin_unlock(&kvm->mmu_lock); 417 418 BUG_ON(kvm->mmu_notifier_count < 0); 419 } 420 421 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, 422 struct mm_struct *mm, 423 unsigned long start, 424 unsigned long end) 425 { 426 struct kvm *kvm = mmu_notifier_to_kvm(mn); 427 int young, idx; 428 429 idx = srcu_read_lock(&kvm->srcu); 430 spin_lock(&kvm->mmu_lock); 431 432 young = kvm_age_hva(kvm, start, end); 433 if (young) 434 kvm_flush_remote_tlbs(kvm); 435 436 spin_unlock(&kvm->mmu_lock); 437 srcu_read_unlock(&kvm->srcu, idx); 438 439 return young; 440 } 441 442 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, 443 struct mm_struct *mm, 444 unsigned long start, 445 unsigned long end) 446 { 447 struct kvm *kvm = mmu_notifier_to_kvm(mn); 448 int young, idx; 449 450 idx = srcu_read_lock(&kvm->srcu); 451 spin_lock(&kvm->mmu_lock); 452 /* 453 * Even though we do not flush TLB, this will still adversely 454 * affect performance on pre-Haswell Intel EPT, where there is 455 * no EPT Access Bit to clear so that we have to tear down EPT 456 * tables instead. If we find this unacceptable, we can always 457 * add a parameter to kvm_age_hva so that it effectively doesn't 458 * do anything on clear_young. 459 * 460 * Also note that currently we never issue secondary TLB flushes 461 * from clear_young, leaving this job up to the regular system 462 * cadence. If we find this inaccurate, we might come up with a 463 * more sophisticated heuristic later. 
464 */ 465 young = kvm_age_hva(kvm, start, end); 466 spin_unlock(&kvm->mmu_lock); 467 srcu_read_unlock(&kvm->srcu, idx); 468 469 return young; 470 } 471 472 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, 473 struct mm_struct *mm, 474 unsigned long address) 475 { 476 struct kvm *kvm = mmu_notifier_to_kvm(mn); 477 int young, idx; 478 479 idx = srcu_read_lock(&kvm->srcu); 480 spin_lock(&kvm->mmu_lock); 481 young = kvm_test_age_hva(kvm, address); 482 spin_unlock(&kvm->mmu_lock); 483 srcu_read_unlock(&kvm->srcu, idx); 484 485 return young; 486 } 487 488 static void kvm_mmu_notifier_release(struct mmu_notifier *mn, 489 struct mm_struct *mm) 490 { 491 struct kvm *kvm = mmu_notifier_to_kvm(mn); 492 int idx; 493 494 idx = srcu_read_lock(&kvm->srcu); 495 kvm_arch_flush_shadow_all(kvm); 496 srcu_read_unlock(&kvm->srcu, idx); 497 } 498 499 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { 500 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 501 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 502 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 503 .clear_young = kvm_mmu_notifier_clear_young, 504 .test_young = kvm_mmu_notifier_test_young, 505 .change_pte = kvm_mmu_notifier_change_pte, 506 .release = kvm_mmu_notifier_release, 507 }; 508 509 static int kvm_init_mmu_notifier(struct kvm *kvm) 510 { 511 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; 512 return mmu_notifier_register(&kvm->mmu_notifier, current->mm); 513 } 514 515 #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ 516 517 static int kvm_init_mmu_notifier(struct kvm *kvm) 518 { 519 return 0; 520 } 521 522 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ 523 524 static struct kvm_memslots *kvm_alloc_memslots(void) 525 { 526 int i; 527 struct kvm_memslots *slots; 528 529 slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 530 if (!slots) 531 return NULL; 532 533 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) 534 slots->id_to_index[i] = slots->memslots[i].id = i; 535 536 return slots; 537 } 538 539 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) 540 { 541 if (!memslot->dirty_bitmap) 542 return; 543 544 kvfree(memslot->dirty_bitmap); 545 memslot->dirty_bitmap = NULL; 546 } 547 548 /* 549 * Free any memory in @free but not in @dont. 
550 */ 551 static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, 552 struct kvm_memory_slot *dont) 553 { 554 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 555 kvm_destroy_dirty_bitmap(free); 556 557 kvm_arch_free_memslot(kvm, free, dont); 558 559 free->npages = 0; 560 } 561 562 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots) 563 { 564 struct kvm_memory_slot *memslot; 565 566 if (!slots) 567 return; 568 569 kvm_for_each_memslot(memslot, slots) 570 kvm_free_memslot(kvm, memslot, NULL); 571 572 kvfree(slots); 573 } 574 575 static void kvm_destroy_vm_debugfs(struct kvm *kvm) 576 { 577 int i; 578 579 if (!kvm->debugfs_dentry) 580 return; 581 582 debugfs_remove_recursive(kvm->debugfs_dentry); 583 584 if (kvm->debugfs_stat_data) { 585 for (i = 0; i < kvm_debugfs_num_entries; i++) 586 kfree(kvm->debugfs_stat_data[i]); 587 kfree(kvm->debugfs_stat_data); 588 } 589 } 590 591 static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) 592 { 593 char dir_name[ITOA_MAX_LEN * 2]; 594 struct kvm_stat_data *stat_data; 595 struct kvm_stats_debugfs_item *p; 596 597 if (!debugfs_initialized()) 598 return 0; 599 600 snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd); 601 kvm->debugfs_dentry = debugfs_create_dir(dir_name, kvm_debugfs_dir); 602 603 kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries, 604 sizeof(*kvm->debugfs_stat_data), 605 GFP_KERNEL); 606 if (!kvm->debugfs_stat_data) 607 return -ENOMEM; 608 609 for (p = debugfs_entries; p->name; p++) { 610 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL); 611 if (!stat_data) 612 return -ENOMEM; 613 614 stat_data->kvm = kvm; 615 stat_data->offset = p->offset; 616 kvm->debugfs_stat_data[p - debugfs_entries] = stat_data; 617 debugfs_create_file(p->name, 0644, kvm->debugfs_dentry, 618 stat_data, stat_fops_per_vm[p->kind]); 619 } 620 return 0; 621 } 622 623 static struct kvm *kvm_create_vm(unsigned long type) 624 { 625 int r, i; 626 struct kvm *kvm = kvm_arch_alloc_vm(); 627 628 if (!kvm) 629 return ERR_PTR(-ENOMEM); 630 631 spin_lock_init(&kvm->mmu_lock); 632 mmgrab(current->mm); 633 kvm->mm = current->mm; 634 kvm_eventfd_init(kvm); 635 mutex_init(&kvm->lock); 636 mutex_init(&kvm->irq_lock); 637 mutex_init(&kvm->slots_lock); 638 refcount_set(&kvm->users_count, 1); 639 INIT_LIST_HEAD(&kvm->devices); 640 641 r = kvm_arch_init_vm(kvm, type); 642 if (r) 643 goto out_err_no_disable; 644 645 r = hardware_enable_all(); 646 if (r) 647 goto out_err_no_disable; 648 649 #ifdef CONFIG_HAVE_KVM_IRQFD 650 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); 651 #endif 652 653 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); 654 655 r = -ENOMEM; 656 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 657 struct kvm_memslots *slots = kvm_alloc_memslots(); 658 if (!slots) 659 goto out_err_no_srcu; 660 /* 661 * Generations must be different for each address space. 662 * Init kvm generation close to the maximum to easily test the 663 * code of handling generation number wrap-around. 
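* (The expression i * 2 - 150 below is negative, so the unsigned generation starts close to its maximum value and wraps after a relatively small number of updates.)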
664 */ 665 slots->generation = i * 2 - 150; 666 rcu_assign_pointer(kvm->memslots[i], slots); 667 } 668 669 if (init_srcu_struct(&kvm->srcu)) 670 goto out_err_no_srcu; 671 if (init_srcu_struct(&kvm->irq_srcu)) 672 goto out_err_no_irq_srcu; 673 for (i = 0; i < KVM_NR_BUSES; i++) { 674 rcu_assign_pointer(kvm->buses[i], 675 kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL)); 676 if (!kvm->buses[i]) 677 goto out_err; 678 } 679 680 r = kvm_init_mmu_notifier(kvm); 681 if (r) 682 goto out_err; 683 684 spin_lock(&kvm_lock); 685 list_add(&kvm->vm_list, &vm_list); 686 spin_unlock(&kvm_lock); 687 688 preempt_notifier_inc(); 689 690 return kvm; 691 692 out_err: 693 cleanup_srcu_struct(&kvm->irq_srcu); 694 out_err_no_irq_srcu: 695 cleanup_srcu_struct(&kvm->srcu); 696 out_err_no_srcu: 697 hardware_disable_all(); 698 out_err_no_disable: 699 refcount_set(&kvm->users_count, 0); 700 for (i = 0; i < KVM_NR_BUSES; i++) 701 kfree(kvm_get_bus(kvm, i)); 702 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 703 kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); 704 kvm_arch_free_vm(kvm); 705 mmdrop(current->mm); 706 return ERR_PTR(r); 707 } 708 709 static void kvm_destroy_devices(struct kvm *kvm) 710 { 711 struct kvm_device *dev, *tmp; 712 713 /* 714 * We do not need to take the kvm->lock here, because nobody else 715 * has a reference to the struct kvm at this point and therefore 716 * cannot access the devices list anyhow. 717 */ 718 list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) { 719 list_del(&dev->vm_node); 720 dev->ops->destroy(dev); 721 } 722 } 723 724 static void kvm_destroy_vm(struct kvm *kvm) 725 { 726 int i; 727 struct mm_struct *mm = kvm->mm; 728 729 kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); 730 kvm_destroy_vm_debugfs(kvm); 731 kvm_arch_sync_events(kvm); 732 spin_lock(&kvm_lock); 733 list_del(&kvm->vm_list); 734 spin_unlock(&kvm_lock); 735 kvm_free_irq_routing(kvm); 736 for (i = 0; i < KVM_NR_BUSES; i++) { 737 struct kvm_io_bus *bus = kvm_get_bus(kvm, i); 738 739 if (bus) 740 kvm_io_bus_destroy(bus); 741 kvm->buses[i] = NULL; 742 } 743 kvm_coalesced_mmio_free(kvm); 744 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 745 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 746 #else 747 kvm_arch_flush_shadow_all(kvm); 748 #endif 749 kvm_arch_destroy_vm(kvm); 750 kvm_destroy_devices(kvm); 751 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 752 kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); 753 cleanup_srcu_struct(&kvm->irq_srcu); 754 cleanup_srcu_struct(&kvm->srcu); 755 kvm_arch_free_vm(kvm); 756 preempt_notifier_dec(); 757 hardware_disable_all(); 758 mmdrop(mm); 759 } 760 761 void kvm_get_kvm(struct kvm *kvm) 762 { 763 refcount_inc(&kvm->users_count); 764 } 765 EXPORT_SYMBOL_GPL(kvm_get_kvm); 766 767 void kvm_put_kvm(struct kvm *kvm) 768 { 769 if (refcount_dec_and_test(&kvm->users_count)) 770 kvm_destroy_vm(kvm); 771 } 772 EXPORT_SYMBOL_GPL(kvm_put_kvm); 773 774 775 static int kvm_vm_release(struct inode *inode, struct file *filp) 776 { 777 struct kvm *kvm = filp->private_data; 778 779 kvm_irqfd_release(kvm); 780 781 kvm_put_kvm(kvm); 782 return 0; 783 } 784 785 /* 786 * Allocation size is twice as large as the actual dirty bitmap size. 787 * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed. 
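* (The address space of a slot is selected by the high 16 bits of its slot id; see as_id in __kvm_set_memory_region() below.)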
788 */ 789 static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) 790 { 791 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); 792 793 memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL); 794 if (!memslot->dirty_bitmap) 795 return -ENOMEM; 796 797 return 0; 798 } 799 800 /* 801 * Insert memslot and re-sort memslots based on their GFN, 802 * so binary search could be used to lookup GFN. 803 * Sorting algorithm takes advantage of having initially 804 * sorted array and known changed memslot position. 805 */ 806 static void update_memslots(struct kvm_memslots *slots, 807 struct kvm_memory_slot *new, 808 enum kvm_mr_change change) 809 { 810 int id = new->id; 811 int i = slots->id_to_index[id]; 812 struct kvm_memory_slot *mslots = slots->memslots; 813 814 WARN_ON(mslots[i].id != id); 815 switch (change) { 816 case KVM_MR_CREATE: 817 slots->used_slots++; 818 WARN_ON(mslots[i].npages || !new->npages); 819 break; 820 case KVM_MR_DELETE: 821 slots->used_slots--; 822 WARN_ON(new->npages || !mslots[i].npages); 823 break; 824 default: 825 break; 826 } 827 828 while (i < KVM_MEM_SLOTS_NUM - 1 && 829 new->base_gfn <= mslots[i + 1].base_gfn) { 830 if (!mslots[i + 1].npages) 831 break; 832 mslots[i] = mslots[i + 1]; 833 slots->id_to_index[mslots[i].id] = i; 834 i++; 835 } 836 837 /* 838 * The ">=" is needed when creating a slot with base_gfn == 0, 839 * so that it moves before all those with base_gfn == npages == 0. 840 * 841 * On the other hand, if new->npages is zero, the above loop has 842 * already left i pointing to the beginning of the empty part of 843 * mslots, and the ">=" would move the hole backwards in this 844 * case---which is wrong. So skip the loop when deleting a slot. 845 */ 846 if (new->npages) { 847 while (i > 0 && 848 new->base_gfn >= mslots[i - 1].base_gfn) { 849 mslots[i] = mslots[i - 1]; 850 slots->id_to_index[mslots[i].id] = i; 851 i--; 852 } 853 } else 854 WARN_ON_ONCE(i != slots->used_slots); 855 856 mslots[i] = *new; 857 slots->id_to_index[mslots[i].id] = i; 858 } 859 860 static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem) 861 { 862 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; 863 864 #ifdef __KVM_HAVE_READONLY_MEM 865 valid_flags |= KVM_MEM_READONLY; 866 #endif 867 868 if (mem->flags & ~valid_flags) 869 return -EINVAL; 870 871 return 0; 872 } 873 874 static struct kvm_memslots *install_new_memslots(struct kvm *kvm, 875 int as_id, struct kvm_memslots *slots) 876 { 877 struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id); 878 879 /* 880 * Set the low bit in the generation, which disables SPTE caching 881 * until the end of synchronize_srcu_expedited. 882 */ 883 WARN_ON(old_memslots->generation & 1); 884 slots->generation = old_memslots->generation + 1; 885 886 rcu_assign_pointer(kvm->memslots[as_id], slots); 887 synchronize_srcu_expedited(&kvm->srcu); 888 889 /* 890 * Increment the new memslot generation a second time. This prevents 891 * vm exits that race with memslot updates from caching a memslot 892 * generation that will (potentially) be valid forever. 893 * 894 * Generations must be unique even across address spaces. We do not need 895 * a global counter for that, instead the generation space is evenly split 896 * across address spaces. For example, with two address spaces, address 897 * space 0 will use generations 0, 4, 8, ... while * address space 1 will 898 * use generations 2, 6, 10, 14, ... 
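* (The second half of the allocation is used as a transfer buffer; see kvm_second_dirty_bitmap() in kvm_get_dirty_log_protect() below.)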
899 */ 900 slots->generation += KVM_ADDRESS_SPACE_NUM * 2 - 1; 901 902 kvm_arch_memslots_updated(kvm, slots); 903 904 return old_memslots; 905 } 906 907 /* 908 * Allocate some memory and give it an address in the guest physical address 909 * space. 910 * 911 * Discontiguous memory is allowed, mostly for framebuffers. 912 * 913 * Must be called holding kvm->slots_lock for write. 914 */ 915 int __kvm_set_memory_region(struct kvm *kvm, 916 const struct kvm_userspace_memory_region *mem) 917 { 918 int r; 919 gfn_t base_gfn; 920 unsigned long npages; 921 struct kvm_memory_slot *slot; 922 struct kvm_memory_slot old, new; 923 struct kvm_memslots *slots = NULL, *old_memslots; 924 int as_id, id; 925 enum kvm_mr_change change; 926 927 r = check_memory_region_flags(mem); 928 if (r) 929 goto out; 930 931 r = -EINVAL; 932 as_id = mem->slot >> 16; 933 id = (u16)mem->slot; 934 935 /* General sanity checks */ 936 if (mem->memory_size & (PAGE_SIZE - 1)) 937 goto out; 938 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 939 goto out; 940 /* We can read the guest memory with __xxx_user() later on. */ 941 if ((id < KVM_USER_MEM_SLOTS) && 942 ((mem->userspace_addr & (PAGE_SIZE - 1)) || 943 !access_ok(VERIFY_WRITE, 944 (void __user *)(unsigned long)mem->userspace_addr, 945 mem->memory_size))) 946 goto out; 947 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM) 948 goto out; 949 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 950 goto out; 951 952 slot = id_to_memslot(__kvm_memslots(kvm, as_id), id); 953 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 954 npages = mem->memory_size >> PAGE_SHIFT; 955 956 if (npages > KVM_MEM_MAX_NR_PAGES) 957 goto out; 958 959 new = old = *slot; 960 961 new.id = id; 962 new.base_gfn = base_gfn; 963 new.npages = npages; 964 new.flags = mem->flags; 965 966 if (npages) { 967 if (!old.npages) 968 change = KVM_MR_CREATE; 969 else { /* Modify an existing slot. */ 970 if ((mem->userspace_addr != old.userspace_addr) || 971 (npages != old.npages) || 972 ((new.flags ^ old.flags) & KVM_MEM_READONLY)) 973 goto out; 974 975 if (base_gfn != old.base_gfn) 976 change = KVM_MR_MOVE; 977 else if (new.flags != old.flags) 978 change = KVM_MR_FLAGS_ONLY; 979 else { /* Nothing to change. 
*/ 980 r = 0; 981 goto out; 982 } 983 } 984 } else { 985 if (!old.npages) 986 goto out; 987 988 change = KVM_MR_DELETE; 989 new.base_gfn = 0; 990 new.flags = 0; 991 } 992 993 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { 994 /* Check for overlaps */ 995 r = -EEXIST; 996 kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) { 997 if (slot->id == id) 998 continue; 999 if (!((base_gfn + npages <= slot->base_gfn) || 1000 (base_gfn >= slot->base_gfn + slot->npages))) 1001 goto out; 1002 } 1003 } 1004 1005 /* Free page dirty bitmap if unneeded */ 1006 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 1007 new.dirty_bitmap = NULL; 1008 1009 r = -ENOMEM; 1010 if (change == KVM_MR_CREATE) { 1011 new.userspace_addr = mem->userspace_addr; 1012 1013 if (kvm_arch_create_memslot(kvm, &new, npages)) 1014 goto out_free; 1015 } 1016 1017 /* Allocate page dirty bitmap if needed */ 1018 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 1019 if (kvm_create_dirty_bitmap(&new) < 0) 1020 goto out_free; 1021 } 1022 1023 slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 1024 if (!slots) 1025 goto out_free; 1026 memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots)); 1027 1028 if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) { 1029 slot = id_to_memslot(slots, id); 1030 slot->flags |= KVM_MEMSLOT_INVALID; 1031 1032 old_memslots = install_new_memslots(kvm, as_id, slots); 1033 1034 /* From this point no new shadow pages pointing to a deleted, 1035 * or moved, memslot will be created. 1036 * 1037 * validation of sp->gfn happens in: 1038 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) 1039 * - kvm_is_visible_gfn (mmu_check_roots) 1040 */ 1041 kvm_arch_flush_shadow_memslot(kvm, slot); 1042 1043 /* 1044 * We can re-use the old_memslots from above, the only difference 1045 * from the currently installed memslots is the invalid flag. This 1046 * will get overwritten by update_memslots anyway. 
1047 */ 1048 slots = old_memslots; 1049 } 1050 1051 r = kvm_arch_prepare_memory_region(kvm, &new, mem, change); 1052 if (r) 1053 goto out_slots; 1054 1055 /* actual memory is freed via old in kvm_free_memslot below */ 1056 if (change == KVM_MR_DELETE) { 1057 new.dirty_bitmap = NULL; 1058 memset(&new.arch, 0, sizeof(new.arch)); 1059 } 1060 1061 update_memslots(slots, &new, change); 1062 old_memslots = install_new_memslots(kvm, as_id, slots); 1063 1064 kvm_arch_commit_memory_region(kvm, mem, &old, &new, change); 1065 1066 kvm_free_memslot(kvm, &old, &new); 1067 kvfree(old_memslots); 1068 return 0; 1069 1070 out_slots: 1071 kvfree(slots); 1072 out_free: 1073 kvm_free_memslot(kvm, &new, &old); 1074 out: 1075 return r; 1076 } 1077 EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 1078 1079 int kvm_set_memory_region(struct kvm *kvm, 1080 const struct kvm_userspace_memory_region *mem) 1081 { 1082 int r; 1083 1084 mutex_lock(&kvm->slots_lock); 1085 r = __kvm_set_memory_region(kvm, mem); 1086 mutex_unlock(&kvm->slots_lock); 1087 return r; 1088 } 1089 EXPORT_SYMBOL_GPL(kvm_set_memory_region); 1090 1091 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 1092 struct kvm_userspace_memory_region *mem) 1093 { 1094 if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) 1095 return -EINVAL; 1096 1097 return kvm_set_memory_region(kvm, mem); 1098 } 1099 1100 int kvm_get_dirty_log(struct kvm *kvm, 1101 struct kvm_dirty_log *log, int *is_dirty) 1102 { 1103 struct kvm_memslots *slots; 1104 struct kvm_memory_slot *memslot; 1105 int i, as_id, id; 1106 unsigned long n; 1107 unsigned long any = 0; 1108 1109 as_id = log->slot >> 16; 1110 id = (u16)log->slot; 1111 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1112 return -EINVAL; 1113 1114 slots = __kvm_memslots(kvm, as_id); 1115 memslot = id_to_memslot(slots, id); 1116 if (!memslot->dirty_bitmap) 1117 return -ENOENT; 1118 1119 n = kvm_dirty_bitmap_bytes(memslot); 1120 1121 for (i = 0; !any && i < n/sizeof(long); ++i) 1122 any = memslot->dirty_bitmap[i]; 1123 1124 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 1125 return -EFAULT; 1126 1127 if (any) 1128 *is_dirty = 1; 1129 return 0; 1130 } 1131 EXPORT_SYMBOL_GPL(kvm_get_dirty_log); 1132 1133 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 1134 /** 1135 * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages 1136 * are dirty write protect them for next write. 1137 * @kvm: pointer to kvm instance 1138 * @log: slot id and address to which we copy the log 1139 * @is_dirty: flag set if any page is dirty 1140 * 1141 * We need to keep it in mind that VCPU threads can write to the bitmap 1142 * concurrently. So, to avoid losing track of dirty pages we keep the 1143 * following order: 1144 * 1145 * 1. Take a snapshot of the bit and clear it if needed. 1146 * 2. Write protect the corresponding page. 1147 * 3. Copy the snapshot to the userspace. 1148 * 4. Upon return caller flushes TLB's if needed. 1149 * 1150 * Between 2 and 4, the guest may write to the page using the remaining TLB 1151 * entry. This is not a problem because the page is reported dirty using 1152 * the snapshot taken before and step 4 ensures that writes done after 1153 * exiting to userspace will be logged for the next call. 
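* (The TLB flush in step 4 is left to the caller, e.g. x86's kvm_vm_ioctl_get_dirty_log(), which flushes only when dirty pages were reported.)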
1154 * 1155 */ 1156 int kvm_get_dirty_log_protect(struct kvm *kvm, 1157 struct kvm_dirty_log *log, bool *is_dirty) 1158 { 1159 struct kvm_memslots *slots; 1160 struct kvm_memory_slot *memslot; 1161 int i, as_id, id; 1162 unsigned long n; 1163 unsigned long *dirty_bitmap; 1164 unsigned long *dirty_bitmap_buffer; 1165 1166 as_id = log->slot >> 16; 1167 id = (u16)log->slot; 1168 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1169 return -EINVAL; 1170 1171 slots = __kvm_memslots(kvm, as_id); 1172 memslot = id_to_memslot(slots, id); 1173 1174 dirty_bitmap = memslot->dirty_bitmap; 1175 if (!dirty_bitmap) 1176 return -ENOENT; 1177 1178 n = kvm_dirty_bitmap_bytes(memslot); 1179 1180 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); 1181 memset(dirty_bitmap_buffer, 0, n); 1182 1183 spin_lock(&kvm->mmu_lock); 1184 *is_dirty = false; 1185 for (i = 0; i < n / sizeof(long); i++) { 1186 unsigned long mask; 1187 gfn_t offset; 1188 1189 if (!dirty_bitmap[i]) 1190 continue; 1191 1192 *is_dirty = true; 1193 1194 mask = xchg(&dirty_bitmap[i], 0); 1195 dirty_bitmap_buffer[i] = mask; 1196 1197 if (mask) { 1198 offset = i * BITS_PER_LONG; 1199 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, 1200 offset, mask); 1201 } 1202 } 1203 1204 spin_unlock(&kvm->mmu_lock); 1205 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) 1206 return -EFAULT; 1207 return 0; 1208 } 1209 EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); 1210 #endif 1211 1212 bool kvm_largepages_enabled(void) 1213 { 1214 return largepages_enabled; 1215 } 1216 1217 void kvm_disable_largepages(void) 1218 { 1219 largepages_enabled = false; 1220 } 1221 EXPORT_SYMBOL_GPL(kvm_disable_largepages); 1222 1223 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 1224 { 1225 return __gfn_to_memslot(kvm_memslots(kvm), gfn); 1226 } 1227 EXPORT_SYMBOL_GPL(gfn_to_memslot); 1228 1229 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn) 1230 { 1231 return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn); 1232 } 1233 1234 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) 1235 { 1236 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn); 1237 1238 if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS || 1239 memslot->flags & KVM_MEMSLOT_INVALID) 1240 return false; 1241 1242 return true; 1243 } 1244 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); 1245 1246 unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn) 1247 { 1248 struct vm_area_struct *vma; 1249 unsigned long addr, size; 1250 1251 size = PAGE_SIZE; 1252 1253 addr = gfn_to_hva(kvm, gfn); 1254 if (kvm_is_error_hva(addr)) 1255 return PAGE_SIZE; 1256 1257 down_read(&current->mm->mmap_sem); 1258 vma = find_vma(current->mm, addr); 1259 if (!vma) 1260 goto out; 1261 1262 size = vma_kernel_pagesize(vma); 1263 1264 out: 1265 up_read(&current->mm->mmap_sem); 1266 1267 return size; 1268 } 1269 1270 static bool memslot_is_readonly(struct kvm_memory_slot *slot) 1271 { 1272 return slot->flags & KVM_MEM_READONLY; 1273 } 1274 1275 static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, 1276 gfn_t *nr_pages, bool write) 1277 { 1278 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 1279 return KVM_HVA_ERR_BAD; 1280 1281 if (memslot_is_readonly(slot) && write) 1282 return KVM_HVA_ERR_RO_BAD; 1283 1284 if (nr_pages) 1285 *nr_pages = slot->npages - (gfn - slot->base_gfn); 1286 1287 return __gfn_to_hva_memslot(slot, gfn); 1288 } 1289 1290 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, 1291 gfn_t *nr_pages) 1292 { 1293
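/* Writable lookup: write == true, so read-only memslots yield KVM_HVA_ERR_RO_BAD. */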
return __gfn_to_hva_many(slot, gfn, nr_pages, true); 1294 } 1295 1296 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, 1297 gfn_t gfn) 1298 { 1299 return gfn_to_hva_many(slot, gfn, NULL); 1300 } 1301 EXPORT_SYMBOL_GPL(gfn_to_hva_memslot); 1302 1303 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 1304 { 1305 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); 1306 } 1307 EXPORT_SYMBOL_GPL(gfn_to_hva); 1308 1309 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn) 1310 { 1311 return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL); 1312 } 1313 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva); 1314 1315 /* 1316 * Return the hva of a @gfn and the R/W attribute if possible. 1317 * 1318 * @slot: the kvm_memory_slot which contains @gfn 1319 * @gfn: the gfn to be translated 1320 * @writable: used to return the read/write attribute of the @slot if the hva 1321 * is valid and @writable is not NULL 1322 */ 1323 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, 1324 gfn_t gfn, bool *writable) 1325 { 1326 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false); 1327 1328 if (!kvm_is_error_hva(hva) && writable) 1329 *writable = !memslot_is_readonly(slot); 1330 1331 return hva; 1332 } 1333 1334 unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) 1335 { 1336 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 1337 1338 return gfn_to_hva_memslot_prot(slot, gfn, writable); 1339 } 1340 1341 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable) 1342 { 1343 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1344 1345 return gfn_to_hva_memslot_prot(slot, gfn, writable); 1346 } 1347 1348 static inline int check_user_page_hwpoison(unsigned long addr) 1349 { 1350 int rc, flags = FOLL_HWPOISON | FOLL_WRITE; 1351 1352 rc = get_user_pages(addr, 1, flags, NULL, NULL); 1353 return rc == -EHWPOISON; 1354 } 1355 1356 /* 1357 * The fast path to get the writable pfn which will be stored in @pfn, 1358 * true indicates success, otherwise false is returned. It's also the 1359 * only part that runs if we can be in atomic context. 1360 */ 1361 static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, 1362 bool *writable, kvm_pfn_t *pfn) 1363 { 1364 struct page *page[1]; 1365 int npages; 1366 1367 /* 1368 * Fast pin a writable pfn only if it is a write fault request 1369 * or the caller allows to map a writable pfn for a read fault 1370 * request. 1371 */ 1372 if (!(write_fault || writable)) 1373 return false; 1374 1375 npages = __get_user_pages_fast(addr, 1, 1, page); 1376 if (npages == 1) { 1377 *pfn = page_to_pfn(page[0]); 1378 1379 if (writable) 1380 *writable = true; 1381 return true; 1382 } 1383 1384 return false; 1385 } 1386 1387 /* 1388 * The slow path to get the pfn of the specified host virtual address, 1389 * 1 indicates success, -errno is returned if error is detected.
1390 */ 1391 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, 1392 bool *writable, kvm_pfn_t *pfn) 1393 { 1394 unsigned int flags = FOLL_HWPOISON; 1395 struct page *page; 1396 int npages = 0; 1397 1398 might_sleep(); 1399 1400 if (writable) 1401 *writable = write_fault; 1402 1403 if (write_fault) 1404 flags |= FOLL_WRITE; 1405 if (async) 1406 flags |= FOLL_NOWAIT; 1407 1408 npages = get_user_pages_unlocked(addr, 1, &page, flags); 1409 if (npages != 1) 1410 return npages; 1411 1412 /* map read fault as writable if possible */ 1413 if (unlikely(!write_fault) && writable) { 1414 struct page *wpage; 1415 1416 if (__get_user_pages_fast(addr, 1, 1, &wpage) == 1) { 1417 *writable = true; 1418 put_page(page); 1419 page = wpage; 1420 } 1421 } 1422 *pfn = page_to_pfn(page); 1423 return npages; 1424 } 1425 1426 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) 1427 { 1428 if (unlikely(!(vma->vm_flags & VM_READ))) 1429 return false; 1430 1431 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE)))) 1432 return false; 1433 1434 return true; 1435 } 1436 1437 static int hva_to_pfn_remapped(struct vm_area_struct *vma, 1438 unsigned long addr, bool *async, 1439 bool write_fault, bool *writable, 1440 kvm_pfn_t *p_pfn) 1441 { 1442 unsigned long pfn; 1443 int r; 1444 1445 r = follow_pfn(vma, addr, &pfn); 1446 if (r) { 1447 /* 1448 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does 1449 * not call the fault handler, so do it here. 1450 */ 1451 bool unlocked = false; 1452 r = fixup_user_fault(current, current->mm, addr, 1453 (write_fault ? FAULT_FLAG_WRITE : 0), 1454 &unlocked); 1455 if (unlocked) 1456 return -EAGAIN; 1457 if (r) 1458 return r; 1459 1460 r = follow_pfn(vma, addr, &pfn); 1461 if (r) 1462 return r; 1463 1464 } 1465 1466 if (writable) 1467 *writable = true; 1468 1469 /* 1470 * Get a reference here because callers of *hva_to_pfn* and 1471 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the 1472 * returned pfn. This is only needed if the VMA has VM_MIXEDMAP 1473 * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will 1474 * simply do nothing for reserved pfns. 1475 * 1476 * Whoever called remap_pfn_range is also going to call e.g. 1477 * unmap_mapping_range before the underlying pages are freed, 1478 * causing a call to our MMU notifier. 1479 */ 1480 kvm_get_pfn(pfn); 1481 1482 *p_pfn = pfn; 1483 return 0; 1484 } 1485 1486 /* 1487 * Pin guest page in memory and return its pfn. 1488 * @addr: host virtual address which maps memory to the guest 1489 * @atomic: whether this function can sleep 1490 * @async: whether this function need to wait IO complete if the 1491 * host page is not in the memory 1492 * @write_fault: whether we should get a writable host page 1493 * @writable: whether it allows to map a writable host page for !@write_fault 1494 * 1495 * The function will map a writable host page for these two cases: 1496 * 1): @write_fault = true 1497 * 2): @write_fault = false && @writable, @writable will tell the caller 1498 * whether the mapping is writable. 
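* (hva_to_pfn_fast() is tried first; when sleeping is allowed the code falls back to hva_to_pfn_slow(), and VM_IO or VM_PFNMAP vmas are handled by hva_to_pfn_remapped().)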
*/ 1500 static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, 1501 bool write_fault, bool *writable) 1502 { 1503 struct vm_area_struct *vma; 1504 kvm_pfn_t pfn = 0; 1505 int npages, r; 1506 1507 /* we can do it either atomically or asynchronously, not both */ 1508 BUG_ON(atomic && async); 1509 1510 if (hva_to_pfn_fast(addr, write_fault, writable, &pfn)) 1511 return pfn; 1512 1513 if (atomic) 1514 return KVM_PFN_ERR_FAULT; 1515 1516 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn); 1517 if (npages == 1) 1518 return pfn; 1519 1520 down_read(&current->mm->mmap_sem); 1521 if (npages == -EHWPOISON || 1522 (!async && check_user_page_hwpoison(addr))) { 1523 pfn = KVM_PFN_ERR_HWPOISON; 1524 goto exit; 1525 } 1526 1527 retry: 1528 vma = find_vma_intersection(current->mm, addr, addr + 1); 1529 1530 if (vma == NULL) 1531 pfn = KVM_PFN_ERR_FAULT; 1532 else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { 1533 r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn); 1534 if (r == -EAGAIN) 1535 goto retry; 1536 if (r < 0) 1537 pfn = KVM_PFN_ERR_FAULT; 1538 } else { 1539 if (async && vma_is_valid(vma, write_fault)) 1540 *async = true; 1541 pfn = KVM_PFN_ERR_FAULT; 1542 } 1543 exit: 1544 up_read(&current->mm->mmap_sem); 1545 return pfn; 1546 } 1547 1548 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, 1549 bool atomic, bool *async, bool write_fault, 1550 bool *writable) 1551 { 1552 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); 1553 1554 if (addr == KVM_HVA_ERR_RO_BAD) { 1555 if (writable) 1556 *writable = false; 1557 return KVM_PFN_ERR_RO_FAULT; 1558 } 1559 1560 if (kvm_is_error_hva(addr)) { 1561 if (writable) 1562 *writable = false; 1563 return KVM_PFN_NOSLOT; 1564 } 1565 1566 /* Do not map writable pfn in the readonly memslot.
*/ 1567 if (writable && memslot_is_readonly(slot)) { 1568 *writable = false; 1569 writable = NULL; 1570 } 1571 1572 return hva_to_pfn(addr, atomic, async, write_fault, 1573 writable); 1574 } 1575 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); 1576 1577 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, 1578 bool *writable) 1579 { 1580 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, 1581 write_fault, writable); 1582 } 1583 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 1584 1585 kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) 1586 { 1587 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); 1588 } 1589 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); 1590 1591 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) 1592 { 1593 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); 1594 } 1595 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); 1596 1597 kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) 1598 { 1599 return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn); 1600 } 1601 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); 1602 1603 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn) 1604 { 1605 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 1606 } 1607 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic); 1608 1609 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 1610 { 1611 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); 1612 } 1613 EXPORT_SYMBOL_GPL(gfn_to_pfn); 1614 1615 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) 1616 { 1617 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 1618 } 1619 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn); 1620 1621 int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 1622 struct page **pages, int nr_pages) 1623 { 1624 unsigned long addr; 1625 gfn_t entry = 0; 1626 1627 addr = gfn_to_hva_many(slot, gfn, &entry); 1628 if (kvm_is_error_hva(addr)) 1629 return -1; 1630 1631 if (entry < nr_pages) 1632 return 0; 1633 1634 return __get_user_pages_fast(addr, nr_pages, 1, pages); 1635 } 1636 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 1637 1638 static struct page *kvm_pfn_to_page(kvm_pfn_t pfn) 1639 { 1640 if (is_error_noslot_pfn(pfn)) 1641 return KVM_ERR_PTR_BAD_PAGE; 1642 1643 if (kvm_is_reserved_pfn(pfn)) { 1644 WARN_ON(1); 1645 return KVM_ERR_PTR_BAD_PAGE; 1646 } 1647 1648 return pfn_to_page(pfn); 1649 } 1650 1651 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1652 { 1653 kvm_pfn_t pfn; 1654 1655 pfn = gfn_to_pfn(kvm, gfn); 1656 1657 return kvm_pfn_to_page(pfn); 1658 } 1659 EXPORT_SYMBOL_GPL(gfn_to_page); 1660 1661 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn) 1662 { 1663 kvm_pfn_t pfn; 1664 1665 pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn); 1666 1667 return kvm_pfn_to_page(pfn); 1668 } 1669 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page); 1670 1671 void kvm_release_page_clean(struct page *page) 1672 { 1673 WARN_ON(is_error_page(page)); 1674 1675 kvm_release_pfn_clean(page_to_pfn(page)); 1676 } 1677 EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1678 1679 void kvm_release_pfn_clean(kvm_pfn_t pfn) 1680 { 1681 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) 1682 put_page(pfn_to_page(pfn)); 1683 } 1684 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 1685 1686 void kvm_release_page_dirty(struct page *page) 1687 { 1688 WARN_ON(is_error_page(page)); 1689 1690 kvm_release_pfn_dirty(page_to_pfn(page)); 1691 } 1692 EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1693 1694 void 
kvm_release_pfn_dirty(kvm_pfn_t pfn) 1695 { 1696 kvm_set_pfn_dirty(pfn); 1697 kvm_release_pfn_clean(pfn); 1698 } 1699 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); 1700 1701 void kvm_set_pfn_dirty(kvm_pfn_t pfn) 1702 { 1703 if (!kvm_is_reserved_pfn(pfn)) { 1704 struct page *page = pfn_to_page(pfn); 1705 1706 if (!PageReserved(page)) 1707 SetPageDirty(page); 1708 } 1709 } 1710 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 1711 1712 void kvm_set_pfn_accessed(kvm_pfn_t pfn) 1713 { 1714 if (!kvm_is_reserved_pfn(pfn)) 1715 mark_page_accessed(pfn_to_page(pfn)); 1716 } 1717 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 1718 1719 void kvm_get_pfn(kvm_pfn_t pfn) 1720 { 1721 if (!kvm_is_reserved_pfn(pfn)) 1722 get_page(pfn_to_page(pfn)); 1723 } 1724 EXPORT_SYMBOL_GPL(kvm_get_pfn); 1725 1726 static int next_segment(unsigned long len, int offset) 1727 { 1728 if (len > PAGE_SIZE - offset) 1729 return PAGE_SIZE - offset; 1730 else 1731 return len; 1732 } 1733 1734 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, 1735 void *data, int offset, int len) 1736 { 1737 int r; 1738 unsigned long addr; 1739 1740 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 1741 if (kvm_is_error_hva(addr)) 1742 return -EFAULT; 1743 r = __copy_from_user(data, (void __user *)addr + offset, len); 1744 if (r) 1745 return -EFAULT; 1746 return 0; 1747 } 1748 1749 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 1750 int len) 1751 { 1752 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 1753 1754 return __kvm_read_guest_page(slot, gfn, data, offset, len); 1755 } 1756 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 1757 1758 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, 1759 int offset, int len) 1760 { 1761 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1762 1763 return __kvm_read_guest_page(slot, gfn, data, offset, len); 1764 } 1765 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page); 1766 1767 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) 1768 { 1769 gfn_t gfn = gpa >> PAGE_SHIFT; 1770 int seg; 1771 int offset = offset_in_page(gpa); 1772 int ret; 1773 1774 while ((seg = next_segment(len, offset)) != 0) { 1775 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 1776 if (ret < 0) 1777 return ret; 1778 offset = 0; 1779 len -= seg; 1780 data += seg; 1781 ++gfn; 1782 } 1783 return 0; 1784 } 1785 EXPORT_SYMBOL_GPL(kvm_read_guest); 1786 1787 int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) 1788 { 1789 gfn_t gfn = gpa >> PAGE_SHIFT; 1790 int seg; 1791 int offset = offset_in_page(gpa); 1792 int ret; 1793 1794 while ((seg = next_segment(len, offset)) != 0) { 1795 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg); 1796 if (ret < 0) 1797 return ret; 1798 offset = 0; 1799 len -= seg; 1800 data += seg; 1801 ++gfn; 1802 } 1803 return 0; 1804 } 1805 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest); 1806 1807 static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 1808 void *data, int offset, unsigned long len) 1809 { 1810 int r; 1811 unsigned long addr; 1812 1813 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 1814 if (kvm_is_error_hva(addr)) 1815 return -EFAULT; 1816 pagefault_disable(); 1817 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 1818 pagefault_enable(); 1819 if (r) 1820 return -EFAULT; 1821 return 0; 1822 } 1823 1824 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 1825 unsigned long len) 1826 { 1827 gfn_t gfn = gpa >> 
PAGE_SHIFT; 1828 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 1829 int offset = offset_in_page(gpa); 1830 1831 return __kvm_read_guest_atomic(slot, gfn, data, offset, len); 1832 } 1833 EXPORT_SYMBOL_GPL(kvm_read_guest_atomic); 1834 1835 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, 1836 void *data, unsigned long len) 1837 { 1838 gfn_t gfn = gpa >> PAGE_SHIFT; 1839 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1840 int offset = offset_in_page(gpa); 1841 1842 return __kvm_read_guest_atomic(slot, gfn, data, offset, len); 1843 } 1844 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); 1845 1846 static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn, 1847 const void *data, int offset, int len) 1848 { 1849 int r; 1850 unsigned long addr; 1851 1852 addr = gfn_to_hva_memslot(memslot, gfn); 1853 if (kvm_is_error_hva(addr)) 1854 return -EFAULT; 1855 r = __copy_to_user((void __user *)addr + offset, data, len); 1856 if (r) 1857 return -EFAULT; 1858 mark_page_dirty_in_slot(memslot, gfn); 1859 return 0; 1860 } 1861 1862 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, 1863 const void *data, int offset, int len) 1864 { 1865 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 1866 1867 return __kvm_write_guest_page(slot, gfn, data, offset, len); 1868 } 1869 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 1870 1871 int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, 1872 const void *data, int offset, int len) 1873 { 1874 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1875 1876 return __kvm_write_guest_page(slot, gfn, data, offset, len); 1877 } 1878 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page); 1879 1880 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 1881 unsigned long len) 1882 { 1883 gfn_t gfn = gpa >> PAGE_SHIFT; 1884 int seg; 1885 int offset = offset_in_page(gpa); 1886 int ret; 1887 1888 while ((seg = next_segment(len, offset)) != 0) { 1889 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 1890 if (ret < 0) 1891 return ret; 1892 offset = 0; 1893 len -= seg; 1894 data += seg; 1895 ++gfn; 1896 } 1897 return 0; 1898 } 1899 EXPORT_SYMBOL_GPL(kvm_write_guest); 1900 1901 int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, 1902 unsigned long len) 1903 { 1904 gfn_t gfn = gpa >> PAGE_SHIFT; 1905 int seg; 1906 int offset = offset_in_page(gpa); 1907 int ret; 1908 1909 while ((seg = next_segment(len, offset)) != 0) { 1910 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg); 1911 if (ret < 0) 1912 return ret; 1913 offset = 0; 1914 len -= seg; 1915 data += seg; 1916 ++gfn; 1917 } 1918 return 0; 1919 } 1920 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); 1921 1922 static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, 1923 struct gfn_to_hva_cache *ghc, 1924 gpa_t gpa, unsigned long len) 1925 { 1926 int offset = offset_in_page(gpa); 1927 gfn_t start_gfn = gpa >> PAGE_SHIFT; 1928 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; 1929 gfn_t nr_pages_needed = end_gfn - start_gfn + 1; 1930 gfn_t nr_pages_avail; 1931 1932 ghc->gpa = gpa; 1933 ghc->generation = slots->generation; 1934 ghc->len = len; 1935 ghc->memslot = __gfn_to_memslot(slots, start_gfn); 1936 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, NULL); 1937 if (!kvm_is_error_hva(ghc->hva) && nr_pages_needed <= 1) { 1938 ghc->hva += offset; 1939 } else { 1940 /* 1941 * If the requested region crosses two memslots, we still 1942 * verify that the entire region is valid here. 
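* (For such cross-slot regions ghc->memslot is cleared, so kvm_read_guest_cached() and kvm_write_guest_offset_cached() fall back to the uncached kvm_read_guest()/kvm_write_guest() helpers.)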
1943 */ 1944 while (start_gfn <= end_gfn) { 1945 nr_pages_avail = 0; 1946 ghc->memslot = __gfn_to_memslot(slots, start_gfn); 1947 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, 1948 &nr_pages_avail); 1949 if (kvm_is_error_hva(ghc->hva)) 1950 return -EFAULT; 1951 start_gfn += nr_pages_avail; 1952 } 1953 /* Use the slow path for cross page reads and writes. */ 1954 ghc->memslot = NULL; 1955 } 1956 return 0; 1957 } 1958 1959 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1960 gpa_t gpa, unsigned long len) 1961 { 1962 struct kvm_memslots *slots = kvm_memslots(kvm); 1963 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); 1964 } 1965 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); 1966 1967 int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1968 void *data, int offset, unsigned long len) 1969 { 1970 struct kvm_memslots *slots = kvm_memslots(kvm); 1971 int r; 1972 gpa_t gpa = ghc->gpa + offset; 1973 1974 BUG_ON(len + offset > ghc->len); 1975 1976 if (slots->generation != ghc->generation) 1977 __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); 1978 1979 if (unlikely(!ghc->memslot)) 1980 return kvm_write_guest(kvm, gpa, data, len); 1981 1982 if (kvm_is_error_hva(ghc->hva)) 1983 return -EFAULT; 1984 1985 r = __copy_to_user((void __user *)ghc->hva + offset, data, len); 1986 if (r) 1987 return -EFAULT; 1988 mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT); 1989 1990 return 0; 1991 } 1992 EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); 1993 1994 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 1995 void *data, unsigned long len) 1996 { 1997 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); 1998 } 1999 EXPORT_SYMBOL_GPL(kvm_write_guest_cached); 2000 2001 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2002 void *data, unsigned long len) 2003 { 2004 struct kvm_memslots *slots = kvm_memslots(kvm); 2005 int r; 2006 2007 BUG_ON(len > ghc->len); 2008 2009 if (slots->generation != ghc->generation) 2010 __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); 2011 2012 if (unlikely(!ghc->memslot)) 2013 return kvm_read_guest(kvm, ghc->gpa, data, len); 2014 2015 if (kvm_is_error_hva(ghc->hva)) 2016 return -EFAULT; 2017 2018 r = __copy_from_user(data, (void __user *)ghc->hva, len); 2019 if (r) 2020 return -EFAULT; 2021 2022 return 0; 2023 } 2024 EXPORT_SYMBOL_GPL(kvm_read_guest_cached); 2025 2026 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 2027 { 2028 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 2029 2030 return kvm_write_guest_page(kvm, gfn, zero_page, offset, len); 2031 } 2032 EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 2033 2034 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 2035 { 2036 gfn_t gfn = gpa >> PAGE_SHIFT; 2037 int seg; 2038 int offset = offset_in_page(gpa); 2039 int ret; 2040 2041 while ((seg = next_segment(len, offset)) != 0) { 2042 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 2043 if (ret < 0) 2044 return ret; 2045 offset = 0; 2046 len -= seg; 2047 ++gfn; 2048 } 2049 return 0; 2050 } 2051 EXPORT_SYMBOL_GPL(kvm_clear_guest); 2052 2053 static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, 2054 gfn_t gfn) 2055 { 2056 if (memslot && memslot->dirty_bitmap) { 2057 unsigned long rel_gfn = gfn - memslot->base_gfn; 2058 2059 set_bit_le(rel_gfn, memslot->dirty_bitmap); 2060 } 2061 } 2062 2063 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 2064 { 2065 struct 
kvm_memory_slot *memslot; 2066 2067 memslot = gfn_to_memslot(kvm, gfn); 2068 mark_page_dirty_in_slot(memslot, gfn); 2069 } 2070 EXPORT_SYMBOL_GPL(mark_page_dirty); 2071 2072 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) 2073 { 2074 struct kvm_memory_slot *memslot; 2075 2076 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2077 mark_page_dirty_in_slot(memslot, gfn); 2078 } 2079 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); 2080 2081 void kvm_sigset_activate(struct kvm_vcpu *vcpu) 2082 { 2083 if (!vcpu->sigset_active) 2084 return; 2085 2086 /* 2087 * This does a lockless modification of ->real_blocked, which is fine 2088 * because only current can change ->real_blocked, and all readers of 2089 * ->real_blocked don't care as long as ->real_blocked is always a subset 2090 * of ->blocked. 2091 */ 2092 sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked); 2093 } 2094 2095 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu) 2096 { 2097 if (!vcpu->sigset_active) 2098 return; 2099 2100 sigprocmask(SIG_SETMASK, &current->real_blocked, NULL); 2101 sigemptyset(&current->real_blocked); 2102 } 2103 2104 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) 2105 { 2106 unsigned int old, val, grow; 2107 2108 old = val = vcpu->halt_poll_ns; 2109 grow = READ_ONCE(halt_poll_ns_grow); 2110 /* 10us base */ 2111 if (val == 0 && grow) 2112 val = 10000; 2113 else 2114 val *= grow; 2115 2116 if (val > halt_poll_ns) 2117 val = halt_poll_ns; 2118 2119 vcpu->halt_poll_ns = val; 2120 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old); 2121 } 2122 2123 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu) 2124 { 2125 unsigned int old, val, shrink; 2126 2127 old = val = vcpu->halt_poll_ns; 2128 shrink = READ_ONCE(halt_poll_ns_shrink); 2129 if (shrink == 0) 2130 val = 0; 2131 else 2132 val /= shrink; 2133 2134 vcpu->halt_poll_ns = val; 2135 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old); 2136 } 2137 2138 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) 2139 { 2140 int ret = -EINTR; 2141 int idx = srcu_read_lock(&vcpu->kvm->srcu); 2142 2143 if (kvm_arch_vcpu_runnable(vcpu)) { 2144 kvm_make_request(KVM_REQ_UNHALT, vcpu); 2145 goto out; 2146 } 2147 if (kvm_cpu_has_pending_timer(vcpu)) 2148 goto out; 2149 if (signal_pending(current)) 2150 goto out; 2151 2152 ret = 0; 2153 out: 2154 srcu_read_unlock(&vcpu->kvm->srcu, idx); 2155 return ret; 2156 } 2157 2158 /* 2159 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 2160 */ 2161 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 2162 { 2163 ktime_t start, cur; 2164 DECLARE_SWAITQUEUE(wait); 2165 bool waited = false; 2166 u64 block_ns; 2167 2168 start = cur = ktime_get(); 2169 if (vcpu->halt_poll_ns) { 2170 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns); 2171 2172 ++vcpu->stat.halt_attempted_poll; 2173 do { 2174 /* 2175 * This sets KVM_REQ_UNHALT if an interrupt 2176 * arrives.
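 *
 * Illustrative note, added for exposition (not in the original source):
 * this do/while loop busy-polls for at most vcpu->halt_poll_ns before the
 * vCPU is actually put to sleep below.  With the default module
 * parameters (halt_poll_ns_grow == 2, halt_poll_ns_shrink == 0), a vCPU
 * whose blocks stay short grows its window 0 -> 10000 ns -> 20000 ns ->
 * ..., capped at the global halt_poll_ns, while a single block longer
 * than halt_poll_ns resets the window straight back to 0 in
 * shrink_halt_poll_ns().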
2177 */ 2178 if (kvm_vcpu_check_block(vcpu) < 0) { 2179 ++vcpu->stat.halt_successful_poll; 2180 if (!vcpu_valid_wakeup(vcpu)) 2181 ++vcpu->stat.halt_poll_invalid; 2182 goto out; 2183 } 2184 cur = ktime_get(); 2185 } while (single_task_running() && ktime_before(cur, stop)); 2186 } 2187 2188 kvm_arch_vcpu_blocking(vcpu); 2189 2190 for (;;) { 2191 prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 2192 2193 if (kvm_vcpu_check_block(vcpu) < 0) 2194 break; 2195 2196 waited = true; 2197 schedule(); 2198 } 2199 2200 finish_swait(&vcpu->wq, &wait); 2201 cur = ktime_get(); 2202 2203 kvm_arch_vcpu_unblocking(vcpu); 2204 out: 2205 block_ns = ktime_to_ns(cur) - ktime_to_ns(start); 2206 2207 if (!vcpu_valid_wakeup(vcpu)) 2208 shrink_halt_poll_ns(vcpu); 2209 else if (halt_poll_ns) { 2210 if (block_ns <= vcpu->halt_poll_ns) 2211 ; 2212 /* we had a long block, shrink polling */ 2213 else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns) 2214 shrink_halt_poll_ns(vcpu); 2215 /* we had a short halt and our poll time is too small */ 2216 else if (vcpu->halt_poll_ns < halt_poll_ns && 2217 block_ns < halt_poll_ns) 2218 grow_halt_poll_ns(vcpu); 2219 } else 2220 vcpu->halt_poll_ns = 0; 2221 2222 trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu)); 2223 kvm_arch_vcpu_block_finish(vcpu); 2224 } 2225 EXPORT_SYMBOL_GPL(kvm_vcpu_block); 2226 2227 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) 2228 { 2229 struct swait_queue_head *wqp; 2230 2231 wqp = kvm_arch_vcpu_wq(vcpu); 2232 if (swq_has_sleeper(wqp)) { 2233 swake_up_one(wqp); 2234 ++vcpu->stat.halt_wakeup; 2235 return true; 2236 } 2237 2238 return false; 2239 } 2240 EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); 2241 2242 #ifndef CONFIG_S390 2243 /* 2244 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. 2245 */ 2246 void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 2247 { 2248 int me; 2249 int cpu = vcpu->cpu; 2250 2251 if (kvm_vcpu_wake_up(vcpu)) 2252 return; 2253 2254 me = get_cpu(); 2255 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 2256 if (kvm_arch_vcpu_should_kick(vcpu)) 2257 smp_send_reschedule(cpu); 2258 put_cpu(); 2259 } 2260 EXPORT_SYMBOL_GPL(kvm_vcpu_kick); 2261 #endif /* !CONFIG_S390 */ 2262 2263 int kvm_vcpu_yield_to(struct kvm_vcpu *target) 2264 { 2265 struct pid *pid; 2266 struct task_struct *task = NULL; 2267 int ret = 0; 2268 2269 rcu_read_lock(); 2270 pid = rcu_dereference(target->pid); 2271 if (pid) 2272 task = get_pid_task(pid, PIDTYPE_PID); 2273 rcu_read_unlock(); 2274 if (!task) 2275 return ret; 2276 ret = yield_to(task, 1); 2277 put_task_struct(task); 2278 2279 return ret; 2280 } 2281 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); 2282 2283 /* 2284 * Helper that checks whether a VCPU is eligible for directed yield. 2285 * The most eligible candidate to yield to is decided by the following heuristics: 2286 * 2287 * (a) VCPU which has not done a pl-exit or cpu relax intercept recently 2288 * (preempted lock holder), indicated by @in_spin_loop. 2289 * Set at the beginning and cleared at the end of the interception/PLE handler. 2290 * 2291 * (b) VCPU which has done a pl-exit/cpu relax intercept but did not get a 2292 * chance last time (it has most likely become eligible now, since we probably 2293 * yielded to the lock holder in the last iteration). This is done by toggling 2294 * @dy_eligible each time a VCPU is checked for eligibility. 2295 * 2296 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding 2297 * to a preempted lock-holder could result in wrong VCPU selection and CPU 2298 * burning.
Giving priority for a potential lock-holder increases lock 2299 * progress. 2300 * 2301 * Since algorithm is based on heuristics, accessing another VCPU data without 2302 * locking does not harm. It may result in trying to yield to same VCPU, fail 2303 * and continue with next VCPU and so on. 2304 */ 2305 static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) 2306 { 2307 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT 2308 bool eligible; 2309 2310 eligible = !vcpu->spin_loop.in_spin_loop || 2311 vcpu->spin_loop.dy_eligible; 2312 2313 if (vcpu->spin_loop.in_spin_loop) 2314 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); 2315 2316 return eligible; 2317 #else 2318 return true; 2319 #endif 2320 } 2321 2322 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) 2323 { 2324 struct kvm *kvm = me->kvm; 2325 struct kvm_vcpu *vcpu; 2326 int last_boosted_vcpu = me->kvm->last_boosted_vcpu; 2327 int yielded = 0; 2328 int try = 3; 2329 int pass; 2330 int i; 2331 2332 kvm_vcpu_set_in_spin_loop(me, true); 2333 /* 2334 * We boost the priority of a VCPU that is runnable but not 2335 * currently running, because it got preempted by something 2336 * else and called schedule in __vcpu_run. Hopefully that 2337 * VCPU is holding the lock that we need and will release it. 2338 * We approximate round-robin by starting at the last boosted VCPU. 2339 */ 2340 for (pass = 0; pass < 2 && !yielded && try; pass++) { 2341 kvm_for_each_vcpu(i, vcpu, kvm) { 2342 if (!pass && i <= last_boosted_vcpu) { 2343 i = last_boosted_vcpu; 2344 continue; 2345 } else if (pass && i > last_boosted_vcpu) 2346 break; 2347 if (!READ_ONCE(vcpu->preempted)) 2348 continue; 2349 if (vcpu == me) 2350 continue; 2351 if (swait_active(&vcpu->wq) && !kvm_arch_vcpu_runnable(vcpu)) 2352 continue; 2353 if (yield_to_kernel_mode && !kvm_arch_vcpu_in_kernel(vcpu)) 2354 continue; 2355 if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) 2356 continue; 2357 2358 yielded = kvm_vcpu_yield_to(vcpu); 2359 if (yielded > 0) { 2360 kvm->last_boosted_vcpu = i; 2361 break; 2362 } else if (yielded < 0) { 2363 try--; 2364 if (!try) 2365 break; 2366 } 2367 } 2368 } 2369 kvm_vcpu_set_in_spin_loop(me, false); 2370 2371 /* Ensure vcpu is not eligible during next spinloop */ 2372 kvm_vcpu_set_dy_eligible(me, false); 2373 } 2374 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); 2375 2376 static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf) 2377 { 2378 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data; 2379 struct page *page; 2380 2381 if (vmf->pgoff == 0) 2382 page = virt_to_page(vcpu->run); 2383 #ifdef CONFIG_X86 2384 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 2385 page = virt_to_page(vcpu->arch.pio_data); 2386 #endif 2387 #ifdef CONFIG_KVM_MMIO 2388 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 2389 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 2390 #endif 2391 else 2392 return kvm_arch_vcpu_fault(vcpu, vmf); 2393 get_page(page); 2394 vmf->page = page; 2395 return 0; 2396 } 2397 2398 static const struct vm_operations_struct kvm_vcpu_vm_ops = { 2399 .fault = kvm_vcpu_fault, 2400 }; 2401 2402 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 2403 { 2404 vma->vm_ops = &kvm_vcpu_vm_ops; 2405 return 0; 2406 } 2407 2408 static int kvm_vcpu_release(struct inode *inode, struct file *filp) 2409 { 2410 struct kvm_vcpu *vcpu = filp->private_data; 2411 2412 debugfs_remove_recursive(vcpu->debugfs_dentry); 2413 kvm_put_kvm(vcpu->kvm); 2414 return 0; 2415 } 2416 2417 static struct file_operations kvm_vcpu_fops = { 
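	/*
	 * Note added for exposition: there is intentionally no ->open here.
	 * The vcpu file only comes into existence via anon_inode_getfd() in
	 * create_vcpu_fd() below, and the VM reference taken by its caller
	 * is dropped again in kvm_vcpu_release() above.
	 */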
2418 .release = kvm_vcpu_release, 2419 .unlocked_ioctl = kvm_vcpu_ioctl, 2420 .mmap = kvm_vcpu_mmap, 2421 .llseek = noop_llseek, 2422 KVM_COMPAT(kvm_vcpu_compat_ioctl), 2423 }; 2424 2425 /* 2426 * Allocates an inode for the vcpu. 2427 */ 2428 static int create_vcpu_fd(struct kvm_vcpu *vcpu) 2429 { 2430 char name[8 + 1 + ITOA_MAX_LEN + 1]; 2431 2432 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id); 2433 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); 2434 } 2435 2436 static int kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) 2437 { 2438 char dir_name[ITOA_MAX_LEN * 2]; 2439 int ret; 2440 2441 if (!kvm_arch_has_vcpu_debugfs()) 2442 return 0; 2443 2444 if (!debugfs_initialized()) 2445 return 0; 2446 2447 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id); 2448 vcpu->debugfs_dentry = debugfs_create_dir(dir_name, 2449 vcpu->kvm->debugfs_dentry); 2450 if (!vcpu->debugfs_dentry) 2451 return -ENOMEM; 2452 2453 ret = kvm_arch_create_vcpu_debugfs(vcpu); 2454 if (ret < 0) { 2455 debugfs_remove_recursive(vcpu->debugfs_dentry); 2456 return ret; 2457 } 2458 2459 return 0; 2460 } 2461 2462 /* 2463 * Creates some virtual cpus. Good luck creating more than one. 2464 */ 2465 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) 2466 { 2467 int r; 2468 struct kvm_vcpu *vcpu; 2469 2470 if (id >= KVM_MAX_VCPU_ID) 2471 return -EINVAL; 2472 2473 mutex_lock(&kvm->lock); 2474 if (kvm->created_vcpus == KVM_MAX_VCPUS) { 2475 mutex_unlock(&kvm->lock); 2476 return -EINVAL; 2477 } 2478 2479 kvm->created_vcpus++; 2480 mutex_unlock(&kvm->lock); 2481 2482 vcpu = kvm_arch_vcpu_create(kvm, id); 2483 if (IS_ERR(vcpu)) { 2484 r = PTR_ERR(vcpu); 2485 goto vcpu_decrement; 2486 } 2487 2488 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 2489 2490 r = kvm_arch_vcpu_setup(vcpu); 2491 if (r) 2492 goto vcpu_destroy; 2493 2494 r = kvm_create_vcpu_debugfs(vcpu); 2495 if (r) 2496 goto vcpu_destroy; 2497 2498 mutex_lock(&kvm->lock); 2499 if (kvm_get_vcpu_by_id(kvm, id)) { 2500 r = -EEXIST; 2501 goto unlock_vcpu_destroy; 2502 } 2503 2504 BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); 2505 2506 /* Now it's all set up, let userspace reach it */ 2507 kvm_get_kvm(kvm); 2508 r = create_vcpu_fd(vcpu); 2509 if (r < 0) { 2510 kvm_put_kvm(kvm); 2511 goto unlock_vcpu_destroy; 2512 } 2513 2514 kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; 2515 2516 /* 2517 * Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus 2518 * before kvm->online_vcpu's incremented value. 
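 *
 * A simplified sketch of the reader side, added for exposition (the real
 * kvm_get_vcpu() lives in include/linux/kvm_host.h and may differ in
 * detail):
 *
 *	num = atomic_read(&kvm->online_vcpus);
 *	smp_rmb();                     (pairs with the smp_wmb() below)
 *	vcpu = kvm->vcpus[i];          (with i < num)
 *
 * A reader that observes the incremented online_vcpus count is therefore
 * guaranteed to also observe the vcpus[] entry stored above.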
2519 */ 2520 smp_wmb(); 2521 atomic_inc(&kvm->online_vcpus); 2522 2523 mutex_unlock(&kvm->lock); 2524 kvm_arch_vcpu_postcreate(vcpu); 2525 return r; 2526 2527 unlock_vcpu_destroy: 2528 mutex_unlock(&kvm->lock); 2529 debugfs_remove_recursive(vcpu->debugfs_dentry); 2530 vcpu_destroy: 2531 kvm_arch_vcpu_destroy(vcpu); 2532 vcpu_decrement: 2533 mutex_lock(&kvm->lock); 2534 kvm->created_vcpus--; 2535 mutex_unlock(&kvm->lock); 2536 return r; 2537 } 2538 2539 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 2540 { 2541 if (sigset) { 2542 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 2543 vcpu->sigset_active = 1; 2544 vcpu->sigset = *sigset; 2545 } else 2546 vcpu->sigset_active = 0; 2547 return 0; 2548 } 2549 2550 static long kvm_vcpu_ioctl(struct file *filp, 2551 unsigned int ioctl, unsigned long arg) 2552 { 2553 struct kvm_vcpu *vcpu = filp->private_data; 2554 void __user *argp = (void __user *)arg; 2555 int r; 2556 struct kvm_fpu *fpu = NULL; 2557 struct kvm_sregs *kvm_sregs = NULL; 2558 2559 if (vcpu->kvm->mm != current->mm) 2560 return -EIO; 2561 2562 if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) 2563 return -EINVAL; 2564 2565 /* 2566 * Some architectures have vcpu ioctls that are asynchronous to vcpu 2567 * execution; mutex_lock() would break them. 2568 */ 2569 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg); 2570 if (r != -ENOIOCTLCMD) 2571 return r; 2572 2573 if (mutex_lock_killable(&vcpu->mutex)) 2574 return -EINTR; 2575 switch (ioctl) { 2576 case KVM_RUN: { 2577 struct pid *oldpid; 2578 r = -EINVAL; 2579 if (arg) 2580 goto out; 2581 oldpid = rcu_access_pointer(vcpu->pid); 2582 if (unlikely(oldpid != task_pid(current))) { 2583 /* The thread running this VCPU changed. */ 2584 struct pid *newpid; 2585 2586 r = kvm_arch_vcpu_run_pid_change(vcpu); 2587 if (r) 2588 break; 2589 2590 newpid = get_task_pid(current, PIDTYPE_PID); 2591 rcu_assign_pointer(vcpu->pid, newpid); 2592 if (oldpid) 2593 synchronize_rcu(); 2594 put_pid(oldpid); 2595 } 2596 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 2597 trace_kvm_userspace_exit(vcpu->run->exit_reason, r); 2598 break; 2599 } 2600 case KVM_GET_REGS: { 2601 struct kvm_regs *kvm_regs; 2602 2603 r = -ENOMEM; 2604 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 2605 if (!kvm_regs) 2606 goto out; 2607 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 2608 if (r) 2609 goto out_free1; 2610 r = -EFAULT; 2611 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 2612 goto out_free1; 2613 r = 0; 2614 out_free1: 2615 kfree(kvm_regs); 2616 break; 2617 } 2618 case KVM_SET_REGS: { 2619 struct kvm_regs *kvm_regs; 2620 2621 r = -ENOMEM; 2622 kvm_regs = memdup_user(argp, sizeof(*kvm_regs)); 2623 if (IS_ERR(kvm_regs)) { 2624 r = PTR_ERR(kvm_regs); 2625 goto out; 2626 } 2627 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 2628 kfree(kvm_regs); 2629 break; 2630 } 2631 case KVM_GET_SREGS: { 2632 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 2633 r = -ENOMEM; 2634 if (!kvm_sregs) 2635 goto out; 2636 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 2637 if (r) 2638 goto out; 2639 r = -EFAULT; 2640 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 2641 goto out; 2642 r = 0; 2643 break; 2644 } 2645 case KVM_SET_SREGS: { 2646 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); 2647 if (IS_ERR(kvm_sregs)) { 2648 r = PTR_ERR(kvm_sregs); 2649 kvm_sregs = NULL; 2650 goto out; 2651 } 2652 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 2653 break; 2654 } 2655 case KVM_GET_MP_STATE: { 2656 struct 
kvm_mp_state mp_state; 2657 2658 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 2659 if (r) 2660 goto out; 2661 r = -EFAULT; 2662 if (copy_to_user(argp, &mp_state, sizeof(mp_state))) 2663 goto out; 2664 r = 0; 2665 break; 2666 } 2667 case KVM_SET_MP_STATE: { 2668 struct kvm_mp_state mp_state; 2669 2670 r = -EFAULT; 2671 if (copy_from_user(&mp_state, argp, sizeof(mp_state))) 2672 goto out; 2673 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 2674 break; 2675 } 2676 case KVM_TRANSLATE: { 2677 struct kvm_translation tr; 2678 2679 r = -EFAULT; 2680 if (copy_from_user(&tr, argp, sizeof(tr))) 2681 goto out; 2682 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 2683 if (r) 2684 goto out; 2685 r = -EFAULT; 2686 if (copy_to_user(argp, &tr, sizeof(tr))) 2687 goto out; 2688 r = 0; 2689 break; 2690 } 2691 case KVM_SET_GUEST_DEBUG: { 2692 struct kvm_guest_debug dbg; 2693 2694 r = -EFAULT; 2695 if (copy_from_user(&dbg, argp, sizeof(dbg))) 2696 goto out; 2697 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 2698 break; 2699 } 2700 case KVM_SET_SIGNAL_MASK: { 2701 struct kvm_signal_mask __user *sigmask_arg = argp; 2702 struct kvm_signal_mask kvm_sigmask; 2703 sigset_t sigset, *p; 2704 2705 p = NULL; 2706 if (argp) { 2707 r = -EFAULT; 2708 if (copy_from_user(&kvm_sigmask, argp, 2709 sizeof(kvm_sigmask))) 2710 goto out; 2711 r = -EINVAL; 2712 if (kvm_sigmask.len != sizeof(sigset)) 2713 goto out; 2714 r = -EFAULT; 2715 if (copy_from_user(&sigset, sigmask_arg->sigset, 2716 sizeof(sigset))) 2717 goto out; 2718 p = &sigset; 2719 } 2720 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); 2721 break; 2722 } 2723 case KVM_GET_FPU: { 2724 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 2725 r = -ENOMEM; 2726 if (!fpu) 2727 goto out; 2728 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); 2729 if (r) 2730 goto out; 2731 r = -EFAULT; 2732 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) 2733 goto out; 2734 r = 0; 2735 break; 2736 } 2737 case KVM_SET_FPU: { 2738 fpu = memdup_user(argp, sizeof(*fpu)); 2739 if (IS_ERR(fpu)) { 2740 r = PTR_ERR(fpu); 2741 fpu = NULL; 2742 goto out; 2743 } 2744 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 2745 break; 2746 } 2747 default: 2748 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 2749 } 2750 out: 2751 mutex_unlock(&vcpu->mutex); 2752 kfree(fpu); 2753 kfree(kvm_sregs); 2754 return r; 2755 } 2756 2757 #ifdef CONFIG_KVM_COMPAT 2758 static long kvm_vcpu_compat_ioctl(struct file *filp, 2759 unsigned int ioctl, unsigned long arg) 2760 { 2761 struct kvm_vcpu *vcpu = filp->private_data; 2762 void __user *argp = compat_ptr(arg); 2763 int r; 2764 2765 if (vcpu->kvm->mm != current->mm) 2766 return -EIO; 2767 2768 switch (ioctl) { 2769 case KVM_SET_SIGNAL_MASK: { 2770 struct kvm_signal_mask __user *sigmask_arg = argp; 2771 struct kvm_signal_mask kvm_sigmask; 2772 sigset_t sigset; 2773 2774 if (argp) { 2775 r = -EFAULT; 2776 if (copy_from_user(&kvm_sigmask, argp, 2777 sizeof(kvm_sigmask))) 2778 goto out; 2779 r = -EINVAL; 2780 if (kvm_sigmask.len != sizeof(compat_sigset_t)) 2781 goto out; 2782 r = -EFAULT; 2783 if (get_compat_sigset(&sigset, (void *)sigmask_arg->sigset)) 2784 goto out; 2785 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 2786 } else 2787 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL); 2788 break; 2789 } 2790 default: 2791 r = kvm_vcpu_ioctl(filp, ioctl, arg); 2792 } 2793 2794 out: 2795 return r; 2796 } 2797 #endif 2798 2799 static int kvm_device_ioctl_attr(struct kvm_device *dev, 2800 int (*accessor)(struct kvm_device *dev, 2801 struct kvm_device_attr *attr), 2802 unsigned long arg) 
2803 { 2804 struct kvm_device_attr attr; 2805 2806 if (!accessor) 2807 return -EPERM; 2808 2809 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) 2810 return -EFAULT; 2811 2812 return accessor(dev, &attr); 2813 } 2814 2815 static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, 2816 unsigned long arg) 2817 { 2818 struct kvm_device *dev = filp->private_data; 2819 2820 switch (ioctl) { 2821 case KVM_SET_DEVICE_ATTR: 2822 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg); 2823 case KVM_GET_DEVICE_ATTR: 2824 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg); 2825 case KVM_HAS_DEVICE_ATTR: 2826 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg); 2827 default: 2828 if (dev->ops->ioctl) 2829 return dev->ops->ioctl(dev, ioctl, arg); 2830 2831 return -ENOTTY; 2832 } 2833 } 2834 2835 static int kvm_device_release(struct inode *inode, struct file *filp) 2836 { 2837 struct kvm_device *dev = filp->private_data; 2838 struct kvm *kvm = dev->kvm; 2839 2840 kvm_put_kvm(kvm); 2841 return 0; 2842 } 2843 2844 static const struct file_operations kvm_device_fops = { 2845 .unlocked_ioctl = kvm_device_ioctl, 2846 .release = kvm_device_release, 2847 KVM_COMPAT(kvm_device_ioctl), 2848 }; 2849 2850 struct kvm_device *kvm_device_from_filp(struct file *filp) 2851 { 2852 if (filp->f_op != &kvm_device_fops) 2853 return NULL; 2854 2855 return filp->private_data; 2856 } 2857 2858 static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { 2859 #ifdef CONFIG_KVM_MPIC 2860 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, 2861 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, 2862 #endif 2863 }; 2864 2865 int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) 2866 { 2867 if (type >= ARRAY_SIZE(kvm_device_ops_table)) 2868 return -ENOSPC; 2869 2870 if (kvm_device_ops_table[type] != NULL) 2871 return -EEXIST; 2872 2873 kvm_device_ops_table[type] = ops; 2874 return 0; 2875 } 2876 2877 void kvm_unregister_device_ops(u32 type) 2878 { 2879 if (kvm_device_ops_table[type] != NULL) 2880 kvm_device_ops_table[type] = NULL; 2881 } 2882 2883 static int kvm_ioctl_create_device(struct kvm *kvm, 2884 struct kvm_create_device *cd) 2885 { 2886 struct kvm_device_ops *ops = NULL; 2887 struct kvm_device *dev; 2888 bool test = cd->flags & KVM_CREATE_DEVICE_TEST; 2889 int ret; 2890 2891 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) 2892 return -ENODEV; 2893 2894 ops = kvm_device_ops_table[cd->type]; 2895 if (ops == NULL) 2896 return -ENODEV; 2897 2898 if (test) 2899 return 0; 2900 2901 dev = kzalloc(sizeof(*dev), GFP_KERNEL); 2902 if (!dev) 2903 return -ENOMEM; 2904 2905 dev->ops = ops; 2906 dev->kvm = kvm; 2907 2908 mutex_lock(&kvm->lock); 2909 ret = ops->create(dev, cd->type); 2910 if (ret < 0) { 2911 mutex_unlock(&kvm->lock); 2912 kfree(dev); 2913 return ret; 2914 } 2915 list_add(&dev->vm_node, &kvm->devices); 2916 mutex_unlock(&kvm->lock); 2917 2918 if (ops->init) 2919 ops->init(dev); 2920 2921 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); 2922 if (ret < 0) { 2923 mutex_lock(&kvm->lock); 2924 list_del(&dev->vm_node); 2925 mutex_unlock(&kvm->lock); 2926 ops->destroy(dev); 2927 return ret; 2928 } 2929 2930 kvm_get_kvm(kvm); 2931 cd->fd = ret; 2932 return 0; 2933 } 2934 2935 static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) 2936 { 2937 switch (arg) { 2938 case KVM_CAP_USER_MEMORY: 2939 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 2940 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 2941 case KVM_CAP_INTERNAL_ERROR_DATA: 2942 #ifdef 
CONFIG_HAVE_KVM_MSI 2943 case KVM_CAP_SIGNAL_MSI: 2944 #endif 2945 #ifdef CONFIG_HAVE_KVM_IRQFD 2946 case KVM_CAP_IRQFD: 2947 case KVM_CAP_IRQFD_RESAMPLE: 2948 #endif 2949 case KVM_CAP_IOEVENTFD_ANY_LENGTH: 2950 case KVM_CAP_CHECK_EXTENSION_VM: 2951 return 1; 2952 #ifdef CONFIG_KVM_MMIO 2953 case KVM_CAP_COALESCED_MMIO: 2954 return KVM_COALESCED_MMIO_PAGE_OFFSET; 2955 case KVM_CAP_COALESCED_PIO: 2956 return 1; 2957 #endif 2958 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 2959 case KVM_CAP_IRQ_ROUTING: 2960 return KVM_MAX_IRQ_ROUTES; 2961 #endif 2962 #if KVM_ADDRESS_SPACE_NUM > 1 2963 case KVM_CAP_MULTI_ADDRESS_SPACE: 2964 return KVM_ADDRESS_SPACE_NUM; 2965 #endif 2966 case KVM_CAP_MAX_VCPU_ID: 2967 return KVM_MAX_VCPU_ID; 2968 default: 2969 break; 2970 } 2971 return kvm_vm_ioctl_check_extension(kvm, arg); 2972 } 2973 2974 static long kvm_vm_ioctl(struct file *filp, 2975 unsigned int ioctl, unsigned long arg) 2976 { 2977 struct kvm *kvm = filp->private_data; 2978 void __user *argp = (void __user *)arg; 2979 int r; 2980 2981 if (kvm->mm != current->mm) 2982 return -EIO; 2983 switch (ioctl) { 2984 case KVM_CREATE_VCPU: 2985 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 2986 break; 2987 case KVM_SET_USER_MEMORY_REGION: { 2988 struct kvm_userspace_memory_region kvm_userspace_mem; 2989 2990 r = -EFAULT; 2991 if (copy_from_user(&kvm_userspace_mem, argp, 2992 sizeof(kvm_userspace_mem))) 2993 goto out; 2994 2995 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); 2996 break; 2997 } 2998 case KVM_GET_DIRTY_LOG: { 2999 struct kvm_dirty_log log; 3000 3001 r = -EFAULT; 3002 if (copy_from_user(&log, argp, sizeof(log))) 3003 goto out; 3004 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 3005 break; 3006 } 3007 #ifdef CONFIG_KVM_MMIO 3008 case KVM_REGISTER_COALESCED_MMIO: { 3009 struct kvm_coalesced_mmio_zone zone; 3010 3011 r = -EFAULT; 3012 if (copy_from_user(&zone, argp, sizeof(zone))) 3013 goto out; 3014 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 3015 break; 3016 } 3017 case KVM_UNREGISTER_COALESCED_MMIO: { 3018 struct kvm_coalesced_mmio_zone zone; 3019 3020 r = -EFAULT; 3021 if (copy_from_user(&zone, argp, sizeof(zone))) 3022 goto out; 3023 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 3024 break; 3025 } 3026 #endif 3027 case KVM_IRQFD: { 3028 struct kvm_irqfd data; 3029 3030 r = -EFAULT; 3031 if (copy_from_user(&data, argp, sizeof(data))) 3032 goto out; 3033 r = kvm_irqfd(kvm, &data); 3034 break; 3035 } 3036 case KVM_IOEVENTFD: { 3037 struct kvm_ioeventfd data; 3038 3039 r = -EFAULT; 3040 if (copy_from_user(&data, argp, sizeof(data))) 3041 goto out; 3042 r = kvm_ioeventfd(kvm, &data); 3043 break; 3044 } 3045 #ifdef CONFIG_HAVE_KVM_MSI 3046 case KVM_SIGNAL_MSI: { 3047 struct kvm_msi msi; 3048 3049 r = -EFAULT; 3050 if (copy_from_user(&msi, argp, sizeof(msi))) 3051 goto out; 3052 r = kvm_send_userspace_msi(kvm, &msi); 3053 break; 3054 } 3055 #endif 3056 #ifdef __KVM_HAVE_IRQ_LINE 3057 case KVM_IRQ_LINE_STATUS: 3058 case KVM_IRQ_LINE: { 3059 struct kvm_irq_level irq_event; 3060 3061 r = -EFAULT; 3062 if (copy_from_user(&irq_event, argp, sizeof(irq_event))) 3063 goto out; 3064 3065 r = kvm_vm_ioctl_irq_line(kvm, &irq_event, 3066 ioctl == KVM_IRQ_LINE_STATUS); 3067 if (r) 3068 goto out; 3069 3070 r = -EFAULT; 3071 if (ioctl == KVM_IRQ_LINE_STATUS) { 3072 if (copy_to_user(argp, &irq_event, sizeof(irq_event))) 3073 goto out; 3074 } 3075 3076 r = 0; 3077 break; 3078 } 3079 #endif 3080 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 3081 case KVM_SET_GSI_ROUTING: { 3082 struct kvm_irq_routing routing; 3083 
struct kvm_irq_routing __user *urouting; 3084 struct kvm_irq_routing_entry *entries = NULL; 3085 3086 r = -EFAULT; 3087 if (copy_from_user(&routing, argp, sizeof(routing))) 3088 goto out; 3089 r = -EINVAL; 3090 if (!kvm_arch_can_set_irq_routing(kvm)) 3091 goto out; 3092 if (routing.nr > KVM_MAX_IRQ_ROUTES) 3093 goto out; 3094 if (routing.flags) 3095 goto out; 3096 if (routing.nr) { 3097 r = -ENOMEM; 3098 entries = vmalloc(array_size(sizeof(*entries), 3099 routing.nr)); 3100 if (!entries) 3101 goto out; 3102 r = -EFAULT; 3103 urouting = argp; 3104 if (copy_from_user(entries, urouting->entries, 3105 routing.nr * sizeof(*entries))) 3106 goto out_free_irq_routing; 3107 } 3108 r = kvm_set_irq_routing(kvm, entries, routing.nr, 3109 routing.flags); 3110 out_free_irq_routing: 3111 vfree(entries); 3112 break; 3113 } 3114 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ 3115 case KVM_CREATE_DEVICE: { 3116 struct kvm_create_device cd; 3117 3118 r = -EFAULT; 3119 if (copy_from_user(&cd, argp, sizeof(cd))) 3120 goto out; 3121 3122 r = kvm_ioctl_create_device(kvm, &cd); 3123 if (r) 3124 goto out; 3125 3126 r = -EFAULT; 3127 if (copy_to_user(argp, &cd, sizeof(cd))) 3128 goto out; 3129 3130 r = 0; 3131 break; 3132 } 3133 case KVM_CHECK_EXTENSION: 3134 r = kvm_vm_ioctl_check_extension_generic(kvm, arg); 3135 break; 3136 default: 3137 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 3138 } 3139 out: 3140 return r; 3141 } 3142 3143 #ifdef CONFIG_KVM_COMPAT 3144 struct compat_kvm_dirty_log { 3145 __u32 slot; 3146 __u32 padding1; 3147 union { 3148 compat_uptr_t dirty_bitmap; /* one bit per page */ 3149 __u64 padding2; 3150 }; 3151 }; 3152 3153 static long kvm_vm_compat_ioctl(struct file *filp, 3154 unsigned int ioctl, unsigned long arg) 3155 { 3156 struct kvm *kvm = filp->private_data; 3157 int r; 3158 3159 if (kvm->mm != current->mm) 3160 return -EIO; 3161 switch (ioctl) { 3162 case KVM_GET_DIRTY_LOG: { 3163 struct compat_kvm_dirty_log compat_log; 3164 struct kvm_dirty_log log; 3165 3166 if (copy_from_user(&compat_log, (void __user *)arg, 3167 sizeof(compat_log))) 3168 return -EFAULT; 3169 log.slot = compat_log.slot; 3170 log.padding1 = compat_log.padding1; 3171 log.padding2 = compat_log.padding2; 3172 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); 3173 3174 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 3175 break; 3176 } 3177 default: 3178 r = kvm_vm_ioctl(filp, ioctl, arg); 3179 } 3180 return r; 3181 } 3182 #endif 3183 3184 static struct file_operations kvm_vm_fops = { 3185 .release = kvm_vm_release, 3186 .unlocked_ioctl = kvm_vm_ioctl, 3187 .llseek = noop_llseek, 3188 KVM_COMPAT(kvm_vm_compat_ioctl), 3189 }; 3190 3191 static int kvm_dev_ioctl_create_vm(unsigned long type) 3192 { 3193 int r; 3194 struct kvm *kvm; 3195 struct file *file; 3196 3197 kvm = kvm_create_vm(type); 3198 if (IS_ERR(kvm)) 3199 return PTR_ERR(kvm); 3200 #ifdef CONFIG_KVM_MMIO 3201 r = kvm_coalesced_mmio_init(kvm); 3202 if (r < 0) 3203 goto put_kvm; 3204 #endif 3205 r = get_unused_fd_flags(O_CLOEXEC); 3206 if (r < 0) 3207 goto put_kvm; 3208 3209 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); 3210 if (IS_ERR(file)) { 3211 put_unused_fd(r); 3212 r = PTR_ERR(file); 3213 goto put_kvm; 3214 } 3215 3216 /* 3217 * Don't call kvm_put_kvm anymore at this point; file->f_op is 3218 * already set, with ->release() being kvm_vm_release(). In error 3219 * cases it will be called by the final fput(file) and will take 3220 * care of doing kvm_put_kvm(kvm). 
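 *
 * (Note added for exposition: this is also why the debugfs error path
 * below only does put_unused_fd() + fput() and returns -ENOMEM directly;
 * the fput() ends up in kvm_vm_release() and drops the VM reference.)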
3221 */ 3222 if (kvm_create_vm_debugfs(kvm, r) < 0) { 3223 put_unused_fd(r); 3224 fput(file); 3225 return -ENOMEM; 3226 } 3227 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm); 3228 3229 fd_install(r, file); 3230 return r; 3231 3232 put_kvm: 3233 kvm_put_kvm(kvm); 3234 return r; 3235 } 3236 3237 static long kvm_dev_ioctl(struct file *filp, 3238 unsigned int ioctl, unsigned long arg) 3239 { 3240 long r = -EINVAL; 3241 3242 switch (ioctl) { 3243 case KVM_GET_API_VERSION: 3244 if (arg) 3245 goto out; 3246 r = KVM_API_VERSION; 3247 break; 3248 case KVM_CREATE_VM: 3249 r = kvm_dev_ioctl_create_vm(arg); 3250 break; 3251 case KVM_CHECK_EXTENSION: 3252 r = kvm_vm_ioctl_check_extension_generic(NULL, arg); 3253 break; 3254 case KVM_GET_VCPU_MMAP_SIZE: 3255 if (arg) 3256 goto out; 3257 r = PAGE_SIZE; /* struct kvm_run */ 3258 #ifdef CONFIG_X86 3259 r += PAGE_SIZE; /* pio data page */ 3260 #endif 3261 #ifdef CONFIG_KVM_MMIO 3262 r += PAGE_SIZE; /* coalesced mmio ring page */ 3263 #endif 3264 break; 3265 case KVM_TRACE_ENABLE: 3266 case KVM_TRACE_PAUSE: 3267 case KVM_TRACE_DISABLE: 3268 r = -EOPNOTSUPP; 3269 break; 3270 default: 3271 return kvm_arch_dev_ioctl(filp, ioctl, arg); 3272 } 3273 out: 3274 return r; 3275 } 3276 3277 static struct file_operations kvm_chardev_ops = { 3278 .unlocked_ioctl = kvm_dev_ioctl, 3279 .llseek = noop_llseek, 3280 KVM_COMPAT(kvm_dev_ioctl), 3281 }; 3282 3283 static struct miscdevice kvm_dev = { 3284 KVM_MINOR, 3285 "kvm", 3286 &kvm_chardev_ops, 3287 }; 3288 3289 static void hardware_enable_nolock(void *junk) 3290 { 3291 int cpu = raw_smp_processor_id(); 3292 int r; 3293 3294 if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) 3295 return; 3296 3297 cpumask_set_cpu(cpu, cpus_hardware_enabled); 3298 3299 r = kvm_arch_hardware_enable(); 3300 3301 if (r) { 3302 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 3303 atomic_inc(&hardware_enable_failed); 3304 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu); 3305 } 3306 } 3307 3308 static int kvm_starting_cpu(unsigned int cpu) 3309 { 3310 raw_spin_lock(&kvm_count_lock); 3311 if (kvm_usage_count) 3312 hardware_enable_nolock(NULL); 3313 raw_spin_unlock(&kvm_count_lock); 3314 return 0; 3315 } 3316 3317 static void hardware_disable_nolock(void *junk) 3318 { 3319 int cpu = raw_smp_processor_id(); 3320 3321 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) 3322 return; 3323 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 3324 kvm_arch_hardware_disable(); 3325 } 3326 3327 static int kvm_dying_cpu(unsigned int cpu) 3328 { 3329 raw_spin_lock(&kvm_count_lock); 3330 if (kvm_usage_count) 3331 hardware_disable_nolock(NULL); 3332 raw_spin_unlock(&kvm_count_lock); 3333 return 0; 3334 } 3335 3336 static void hardware_disable_all_nolock(void) 3337 { 3338 BUG_ON(!kvm_usage_count); 3339 3340 kvm_usage_count--; 3341 if (!kvm_usage_count) 3342 on_each_cpu(hardware_disable_nolock, NULL, 1); 3343 } 3344 3345 static void hardware_disable_all(void) 3346 { 3347 raw_spin_lock(&kvm_count_lock); 3348 hardware_disable_all_nolock(); 3349 raw_spin_unlock(&kvm_count_lock); 3350 } 3351 3352 static int hardware_enable_all(void) 3353 { 3354 int r = 0; 3355 3356 raw_spin_lock(&kvm_count_lock); 3357 3358 kvm_usage_count++; 3359 if (kvm_usage_count == 1) { 3360 atomic_set(&hardware_enable_failed, 0); 3361 on_each_cpu(hardware_enable_nolock, NULL, 1); 3362 3363 if (atomic_read(&hardware_enable_failed)) { 3364 hardware_disable_all_nolock(); 3365 r = -EBUSY; 3366 } 3367 } 3368 3369 raw_spin_unlock(&kvm_count_lock); 3370 3371 return r; 3372 } 3373 3374 static 
int kvm_reboot(struct notifier_block *notifier, unsigned long val, 3375 void *v) 3376 { 3377 /* 3378 * Some (well, at least mine) BIOSes hang on reboot if 3379 * in vmx root mode. 3380 * 3381 * And Intel TXT required VMX off for all cpu when system shutdown. 3382 */ 3383 pr_info("kvm: exiting hardware virtualization\n"); 3384 kvm_rebooting = true; 3385 on_each_cpu(hardware_disable_nolock, NULL, 1); 3386 return NOTIFY_OK; 3387 } 3388 3389 static struct notifier_block kvm_reboot_notifier = { 3390 .notifier_call = kvm_reboot, 3391 .priority = 0, 3392 }; 3393 3394 static void kvm_io_bus_destroy(struct kvm_io_bus *bus) 3395 { 3396 int i; 3397 3398 for (i = 0; i < bus->dev_count; i++) { 3399 struct kvm_io_device *pos = bus->range[i].dev; 3400 3401 kvm_iodevice_destructor(pos); 3402 } 3403 kfree(bus); 3404 } 3405 3406 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1, 3407 const struct kvm_io_range *r2) 3408 { 3409 gpa_t addr1 = r1->addr; 3410 gpa_t addr2 = r2->addr; 3411 3412 if (addr1 < addr2) 3413 return -1; 3414 3415 /* If r2->len == 0, match the exact address. If r2->len != 0, 3416 * accept any overlapping write. Any order is acceptable for 3417 * overlapping ranges, because kvm_io_bus_get_first_dev ensures 3418 * we process all of them. 3419 */ 3420 if (r2->len) { 3421 addr1 += r1->len; 3422 addr2 += r2->len; 3423 } 3424 3425 if (addr1 > addr2) 3426 return 1; 3427 3428 return 0; 3429 } 3430 3431 static int kvm_io_bus_sort_cmp(const void *p1, const void *p2) 3432 { 3433 return kvm_io_bus_cmp(p1, p2); 3434 } 3435 3436 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, 3437 gpa_t addr, int len) 3438 { 3439 struct kvm_io_range *range, key; 3440 int off; 3441 3442 key = (struct kvm_io_range) { 3443 .addr = addr, 3444 .len = len, 3445 }; 3446 3447 range = bsearch(&key, bus->range, bus->dev_count, 3448 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp); 3449 if (range == NULL) 3450 return -ENOENT; 3451 3452 off = range - bus->range; 3453 3454 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0) 3455 off--; 3456 3457 return off; 3458 } 3459 3460 static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 3461 struct kvm_io_range *range, const void *val) 3462 { 3463 int idx; 3464 3465 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 3466 if (idx < 0) 3467 return -EOPNOTSUPP; 3468 3469 while (idx < bus->dev_count && 3470 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 3471 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr, 3472 range->len, val)) 3473 return idx; 3474 idx++; 3475 } 3476 3477 return -EOPNOTSUPP; 3478 } 3479 3480 /* kvm_io_bus_write - called under kvm->slots_lock */ 3481 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 3482 int len, const void *val) 3483 { 3484 struct kvm_io_bus *bus; 3485 struct kvm_io_range range; 3486 int r; 3487 3488 range = (struct kvm_io_range) { 3489 .addr = addr, 3490 .len = len, 3491 }; 3492 3493 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 3494 if (!bus) 3495 return -ENOMEM; 3496 r = __kvm_io_bus_write(vcpu, bus, &range, val); 3497 return r < 0 ? 
r : 0; 3498 } 3499 3500 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */ 3501 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, 3502 gpa_t addr, int len, const void *val, long cookie) 3503 { 3504 struct kvm_io_bus *bus; 3505 struct kvm_io_range range; 3506 3507 range = (struct kvm_io_range) { 3508 .addr = addr, 3509 .len = len, 3510 }; 3511 3512 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 3513 if (!bus) 3514 return -ENOMEM; 3515 3516 /* First try the device referenced by cookie. */ 3517 if ((cookie >= 0) && (cookie < bus->dev_count) && 3518 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0)) 3519 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len, 3520 val)) 3521 return cookie; 3522 3523 /* 3524 * cookie contained garbage; fall back to search and return the 3525 * correct cookie value. 3526 */ 3527 return __kvm_io_bus_write(vcpu, bus, &range, val); 3528 } 3529 3530 static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 3531 struct kvm_io_range *range, void *val) 3532 { 3533 int idx; 3534 3535 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 3536 if (idx < 0) 3537 return -EOPNOTSUPP; 3538 3539 while (idx < bus->dev_count && 3540 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 3541 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr, 3542 range->len, val)) 3543 return idx; 3544 idx++; 3545 } 3546 3547 return -EOPNOTSUPP; 3548 } 3549 EXPORT_SYMBOL_GPL(kvm_io_bus_write); 3550 3551 /* kvm_io_bus_read - called under kvm->slots_lock */ 3552 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 3553 int len, void *val) 3554 { 3555 struct kvm_io_bus *bus; 3556 struct kvm_io_range range; 3557 int r; 3558 3559 range = (struct kvm_io_range) { 3560 .addr = addr, 3561 .len = len, 3562 }; 3563 3564 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 3565 if (!bus) 3566 return -ENOMEM; 3567 r = __kvm_io_bus_read(vcpu, bus, &range, val); 3568 return r < 0 ? r : 0; 3569 } 3570 3571 3572 /* Caller must hold slots_lock. */ 3573 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 3574 int len, struct kvm_io_device *dev) 3575 { 3576 int i; 3577 struct kvm_io_bus *new_bus, *bus; 3578 struct kvm_io_range range; 3579 3580 bus = kvm_get_bus(kvm, bus_idx); 3581 if (!bus) 3582 return -ENOMEM; 3583 3584 /* exclude ioeventfd which is limited by maximum fd */ 3585 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) 3586 return -ENOSPC; 3587 3588 new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) * 3589 sizeof(struct kvm_io_range)), GFP_KERNEL); 3590 if (!new_bus) 3591 return -ENOMEM; 3592 3593 range = (struct kvm_io_range) { 3594 .addr = addr, 3595 .len = len, 3596 .dev = dev, 3597 }; 3598 3599 for (i = 0; i < bus->dev_count; i++) 3600 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0) 3601 break; 3602 3603 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); 3604 new_bus->dev_count++; 3605 new_bus->range[i] = range; 3606 memcpy(new_bus->range + i + 1, bus->range + i, 3607 (bus->dev_count - i) * sizeof(struct kvm_io_range)); 3608 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 3609 synchronize_srcu_expedited(&kvm->srcu); 3610 kfree(bus); 3611 3612 return 0; 3613 } 3614 3615 /* Caller must hold slots_lock. 
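 *
 * (Note added for exposition: like kvm_io_bus_register_dev() above, this
 * never modifies a bus in place.  It allocates a smaller copy, publishes
 * it with rcu_assign_pointer() and waits out SRCU readers with
 * synchronize_srcu_expedited() before freeing the old array.)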
*/ 3616 void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, 3617 struct kvm_io_device *dev) 3618 { 3619 int i; 3620 struct kvm_io_bus *new_bus, *bus; 3621 3622 bus = kvm_get_bus(kvm, bus_idx); 3623 if (!bus) 3624 return; 3625 3626 for (i = 0; i < bus->dev_count; i++) 3627 if (bus->range[i].dev == dev) { 3628 break; 3629 } 3630 3631 if (i == bus->dev_count) 3632 return; 3633 3634 new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) * 3635 sizeof(struct kvm_io_range)), GFP_KERNEL); 3636 if (!new_bus) { 3637 pr_err("kvm: failed to shrink bus, removing it completely\n"); 3638 goto broken; 3639 } 3640 3641 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); 3642 new_bus->dev_count--; 3643 memcpy(new_bus->range + i, bus->range + i + 1, 3644 (new_bus->dev_count - i) * sizeof(struct kvm_io_range)); 3645 3646 broken: 3647 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 3648 synchronize_srcu_expedited(&kvm->srcu); 3649 kfree(bus); 3650 return; 3651 } 3652 3653 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, 3654 gpa_t addr) 3655 { 3656 struct kvm_io_bus *bus; 3657 int dev_idx, srcu_idx; 3658 struct kvm_io_device *iodev = NULL; 3659 3660 srcu_idx = srcu_read_lock(&kvm->srcu); 3661 3662 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); 3663 if (!bus) 3664 goto out_unlock; 3665 3666 dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1); 3667 if (dev_idx < 0) 3668 goto out_unlock; 3669 3670 iodev = bus->range[dev_idx].dev; 3671 3672 out_unlock: 3673 srcu_read_unlock(&kvm->srcu, srcu_idx); 3674 3675 return iodev; 3676 } 3677 EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev); 3678 3679 static int kvm_debugfs_open(struct inode *inode, struct file *file, 3680 int (*get)(void *, u64 *), int (*set)(void *, u64), 3681 const char *fmt) 3682 { 3683 struct kvm_stat_data *stat_data = (struct kvm_stat_data *) 3684 inode->i_private; 3685 3686 /* The debugfs files are a reference to the kvm struct which 3687 * is still valid when kvm_destroy_vm is called. 3688 * To avoid the race between open and the removal of the debugfs 3689 * directory we test against the users count. 
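 *
 * (Note added for exposition: refcount_inc_not_zero() fails once
 * users_count has dropped to zero, so a racing open returns -ENOENT
 * instead of resurrecting a VM that is already being torn down.)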
3690 */ 3691 if (!refcount_inc_not_zero(&stat_data->kvm->users_count)) 3692 return -ENOENT; 3693 3694 if (simple_attr_open(inode, file, get, set, fmt)) { 3695 kvm_put_kvm(stat_data->kvm); 3696 return -ENOMEM; 3697 } 3698 3699 return 0; 3700 } 3701 3702 static int kvm_debugfs_release(struct inode *inode, struct file *file) 3703 { 3704 struct kvm_stat_data *stat_data = (struct kvm_stat_data *) 3705 inode->i_private; 3706 3707 simple_attr_release(inode, file); 3708 kvm_put_kvm(stat_data->kvm); 3709 3710 return 0; 3711 } 3712 3713 static int vm_stat_get_per_vm(void *data, u64 *val) 3714 { 3715 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 3716 3717 *val = *(ulong *)((void *)stat_data->kvm + stat_data->offset); 3718 3719 return 0; 3720 } 3721 3722 static int vm_stat_clear_per_vm(void *data, u64 val) 3723 { 3724 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 3725 3726 if (val) 3727 return -EINVAL; 3728 3729 *(ulong *)((void *)stat_data->kvm + stat_data->offset) = 0; 3730 3731 return 0; 3732 } 3733 3734 static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file) 3735 { 3736 __simple_attr_check_format("%llu\n", 0ull); 3737 return kvm_debugfs_open(inode, file, vm_stat_get_per_vm, 3738 vm_stat_clear_per_vm, "%llu\n"); 3739 } 3740 3741 static const struct file_operations vm_stat_get_per_vm_fops = { 3742 .owner = THIS_MODULE, 3743 .open = vm_stat_get_per_vm_open, 3744 .release = kvm_debugfs_release, 3745 .read = simple_attr_read, 3746 .write = simple_attr_write, 3747 .llseek = no_llseek, 3748 }; 3749 3750 static int vcpu_stat_get_per_vm(void *data, u64 *val) 3751 { 3752 int i; 3753 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 3754 struct kvm_vcpu *vcpu; 3755 3756 *val = 0; 3757 3758 kvm_for_each_vcpu(i, vcpu, stat_data->kvm) 3759 *val += *(u64 *)((void *)vcpu + stat_data->offset); 3760 3761 return 0; 3762 } 3763 3764 static int vcpu_stat_clear_per_vm(void *data, u64 val) 3765 { 3766 int i; 3767 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 3768 struct kvm_vcpu *vcpu; 3769 3770 if (val) 3771 return -EINVAL; 3772 3773 kvm_for_each_vcpu(i, vcpu, stat_data->kvm) 3774 *(u64 *)((void *)vcpu + stat_data->offset) = 0; 3775 3776 return 0; 3777 } 3778 3779 static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file) 3780 { 3781 __simple_attr_check_format("%llu\n", 0ull); 3782 return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm, 3783 vcpu_stat_clear_per_vm, "%llu\n"); 3784 } 3785 3786 static const struct file_operations vcpu_stat_get_per_vm_fops = { 3787 .owner = THIS_MODULE, 3788 .open = vcpu_stat_get_per_vm_open, 3789 .release = kvm_debugfs_release, 3790 .read = simple_attr_read, 3791 .write = simple_attr_write, 3792 .llseek = no_llseek, 3793 }; 3794 3795 static const struct file_operations *stat_fops_per_vm[] = { 3796 [KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops, 3797 [KVM_STAT_VM] = &vm_stat_get_per_vm_fops, 3798 }; 3799 3800 static int vm_stat_get(void *_offset, u64 *val) 3801 { 3802 unsigned offset = (long)_offset; 3803 struct kvm *kvm; 3804 struct kvm_stat_data stat_tmp = {.offset = offset}; 3805 u64 tmp_val; 3806 3807 *val = 0; 3808 spin_lock(&kvm_lock); 3809 list_for_each_entry(kvm, &vm_list, vm_list) { 3810 stat_tmp.kvm = kvm; 3811 vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val); 3812 *val += tmp_val; 3813 } 3814 spin_unlock(&kvm_lock); 3815 return 0; 3816 } 3817 3818 static int vm_stat_clear(void *_offset, u64 val) 3819 { 3820 unsigned offset = (long)_offset; 3821 struct kvm *kvm; 3822 struct 
kvm_stat_data stat_tmp = {.offset = offset}; 3823 3824 if (val) 3825 return -EINVAL; 3826 3827 spin_lock(&kvm_lock); 3828 list_for_each_entry(kvm, &vm_list, vm_list) { 3829 stat_tmp.kvm = kvm; 3830 vm_stat_clear_per_vm((void *)&stat_tmp, 0); 3831 } 3832 spin_unlock(&kvm_lock); 3833 3834 return 0; 3835 } 3836 3837 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n"); 3838 3839 static int vcpu_stat_get(void *_offset, u64 *val) 3840 { 3841 unsigned offset = (long)_offset; 3842 struct kvm *kvm; 3843 struct kvm_stat_data stat_tmp = {.offset = offset}; 3844 u64 tmp_val; 3845 3846 *val = 0; 3847 spin_lock(&kvm_lock); 3848 list_for_each_entry(kvm, &vm_list, vm_list) { 3849 stat_tmp.kvm = kvm; 3850 vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val); 3851 *val += tmp_val; 3852 } 3853 spin_unlock(&kvm_lock); 3854 return 0; 3855 } 3856 3857 static int vcpu_stat_clear(void *_offset, u64 val) 3858 { 3859 unsigned offset = (long)_offset; 3860 struct kvm *kvm; 3861 struct kvm_stat_data stat_tmp = {.offset = offset}; 3862 3863 if (val) 3864 return -EINVAL; 3865 3866 spin_lock(&kvm_lock); 3867 list_for_each_entry(kvm, &vm_list, vm_list) { 3868 stat_tmp.kvm = kvm; 3869 vcpu_stat_clear_per_vm((void *)&stat_tmp, 0); 3870 } 3871 spin_unlock(&kvm_lock); 3872 3873 return 0; 3874 } 3875 3876 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear, 3877 "%llu\n"); 3878 3879 static const struct file_operations *stat_fops[] = { 3880 [KVM_STAT_VCPU] = &vcpu_stat_fops, 3881 [KVM_STAT_VM] = &vm_stat_fops, 3882 }; 3883 3884 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) 3885 { 3886 struct kobj_uevent_env *env; 3887 unsigned long long created, active; 3888 3889 if (!kvm_dev.this_device || !kvm) 3890 return; 3891 3892 spin_lock(&kvm_lock); 3893 if (type == KVM_EVENT_CREATE_VM) { 3894 kvm_createvm_count++; 3895 kvm_active_vms++; 3896 } else if (type == KVM_EVENT_DESTROY_VM) { 3897 kvm_active_vms--; 3898 } 3899 created = kvm_createvm_count; 3900 active = kvm_active_vms; 3901 spin_unlock(&kvm_lock); 3902 3903 env = kzalloc(sizeof(*env), GFP_KERNEL); 3904 if (!env) 3905 return; 3906 3907 add_uevent_var(env, "CREATED=%llu", created); 3908 add_uevent_var(env, "COUNT=%llu", active); 3909 3910 if (type == KVM_EVENT_CREATE_VM) { 3911 add_uevent_var(env, "EVENT=create"); 3912 kvm->userspace_pid = task_pid_nr(current); 3913 } else if (type == KVM_EVENT_DESTROY_VM) { 3914 add_uevent_var(env, "EVENT=destroy"); 3915 } 3916 add_uevent_var(env, "PID=%d", kvm->userspace_pid); 3917 3918 if (kvm->debugfs_dentry) { 3919 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL); 3920 3921 if (p) { 3922 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX); 3923 if (!IS_ERR(tmp)) 3924 add_uevent_var(env, "STATS_PATH=%s", tmp); 3925 kfree(p); 3926 } 3927 } 3928 /* no need for checks, since we are adding at most only 5 keys */ 3929 env->envp[env->envp_idx++] = NULL; 3930 kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp); 3931 kfree(env); 3932 } 3933 3934 static void kvm_init_debug(void) 3935 { 3936 struct kvm_stats_debugfs_item *p; 3937 3938 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); 3939 3940 kvm_debugfs_num_entries = 0; 3941 for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) { 3942 debugfs_create_file(p->name, 0644, kvm_debugfs_dir, 3943 (void *)(long)p->offset, 3944 stat_fops[p->kind]); 3945 } 3946 } 3947 3948 static int kvm_suspend(void) 3949 { 3950 if (kvm_usage_count) 3951 hardware_disable_nolock(NULL); 3952 return 0; 3953 } 3954 3955 static 
void kvm_resume(void) 3956 { 3957 if (kvm_usage_count) { 3958 WARN_ON(raw_spin_is_locked(&kvm_count_lock)); 3959 hardware_enable_nolock(NULL); 3960 } 3961 } 3962 3963 static struct syscore_ops kvm_syscore_ops = { 3964 .suspend = kvm_suspend, 3965 .resume = kvm_resume, 3966 }; 3967 3968 static inline 3969 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) 3970 { 3971 return container_of(pn, struct kvm_vcpu, preempt_notifier); 3972 } 3973 3974 static void kvm_sched_in(struct preempt_notifier *pn, int cpu) 3975 { 3976 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 3977 3978 if (vcpu->preempted) 3979 vcpu->preempted = false; 3980 3981 kvm_arch_sched_in(vcpu, cpu); 3982 3983 kvm_arch_vcpu_load(vcpu, cpu); 3984 } 3985 3986 static void kvm_sched_out(struct preempt_notifier *pn, 3987 struct task_struct *next) 3988 { 3989 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 3990 3991 if (current->state == TASK_RUNNING) 3992 vcpu->preempted = true; 3993 kvm_arch_vcpu_put(vcpu); 3994 } 3995 3996 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, 3997 struct module *module) 3998 { 3999 int r; 4000 int cpu; 4001 4002 r = kvm_arch_init(opaque); 4003 if (r) 4004 goto out_fail; 4005 4006 /* 4007 * kvm_arch_init makes sure there's at most one caller 4008 * for architectures that support multiple implementations, 4009 * like intel and amd on x86. 4010 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating 4011 * conflicts in case kvm is already setup for another implementation. 4012 */ 4013 r = kvm_irqfd_init(); 4014 if (r) 4015 goto out_irqfd; 4016 4017 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { 4018 r = -ENOMEM; 4019 goto out_free_0; 4020 } 4021 4022 r = kvm_arch_hardware_setup(); 4023 if (r < 0) 4024 goto out_free_0a; 4025 4026 for_each_online_cpu(cpu) { 4027 smp_call_function_single(cpu, 4028 kvm_arch_check_processor_compat, 4029 &r, 1); 4030 if (r < 0) 4031 goto out_free_1; 4032 } 4033 4034 r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting", 4035 kvm_starting_cpu, kvm_dying_cpu); 4036 if (r) 4037 goto out_free_2; 4038 register_reboot_notifier(&kvm_reboot_notifier); 4039 4040 /* A kmem cache lets us meet the alignment requirements of fx_save. 
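 *
 * (Note added for exposition: kmem_cache_create_usercopy() below also
 * whitelists only the arch-specific part of struct kvm_vcpu for
 * copy_to_user()/copy_from_user(), i.e. the region described by the
 * offsetof()/sizeof_field() arguments.)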
*/ 4041 if (!vcpu_align) 4042 vcpu_align = __alignof__(struct kvm_vcpu); 4043 kvm_vcpu_cache = 4044 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align, 4045 SLAB_ACCOUNT, 4046 offsetof(struct kvm_vcpu, arch), 4047 sizeof_field(struct kvm_vcpu, arch), 4048 NULL); 4049 if (!kvm_vcpu_cache) { 4050 r = -ENOMEM; 4051 goto out_free_3; 4052 } 4053 4054 r = kvm_async_pf_init(); 4055 if (r) 4056 goto out_free; 4057 4058 kvm_chardev_ops.owner = module; 4059 kvm_vm_fops.owner = module; 4060 kvm_vcpu_fops.owner = module; 4061 4062 r = misc_register(&kvm_dev); 4063 if (r) { 4064 pr_err("kvm: misc device register failed\n"); 4065 goto out_unreg; 4066 } 4067 4068 register_syscore_ops(&kvm_syscore_ops); 4069 4070 kvm_preempt_ops.sched_in = kvm_sched_in; 4071 kvm_preempt_ops.sched_out = kvm_sched_out; 4072 4073 kvm_init_debug(); 4074 4075 r = kvm_vfio_ops_init(); 4076 WARN_ON(r); 4077 4078 return 0; 4079 4080 out_unreg: 4081 kvm_async_pf_deinit(); 4082 out_free: 4083 kmem_cache_destroy(kvm_vcpu_cache); 4084 out_free_3: 4085 unregister_reboot_notifier(&kvm_reboot_notifier); 4086 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); 4087 out_free_2: 4088 out_free_1: 4089 kvm_arch_hardware_unsetup(); 4090 out_free_0a: 4091 free_cpumask_var(cpus_hardware_enabled); 4092 out_free_0: 4093 kvm_irqfd_exit(); 4094 out_irqfd: 4095 kvm_arch_exit(); 4096 out_fail: 4097 return r; 4098 } 4099 EXPORT_SYMBOL_GPL(kvm_init); 4100 4101 void kvm_exit(void) 4102 { 4103 debugfs_remove_recursive(kvm_debugfs_dir); 4104 misc_deregister(&kvm_dev); 4105 kmem_cache_destroy(kvm_vcpu_cache); 4106 kvm_async_pf_deinit(); 4107 unregister_syscore_ops(&kvm_syscore_ops); 4108 unregister_reboot_notifier(&kvm_reboot_notifier); 4109 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); 4110 on_each_cpu(hardware_disable_nolock, NULL, 1); 4111 kvm_arch_hardware_unsetup(); 4112 kvm_arch_exit(); 4113 kvm_irqfd_exit(); 4114 free_cpumask_var(cpus_hardware_enabled); 4115 kvm_vfio_ops_exit(); 4116 } 4117 EXPORT_SYMBOL_GPL(kvm_exit); 4118
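
/*
 * Illustrative usage sketch, added for exposition; it is not part of the
 * original file.  It shows how arch code might use the gfn_to_hva_cache
 * helpers defined above to update a guest-visible structure repeatedly
 * without re-resolving the gfn on every write.  "struct my_shared_page",
 * "gpa" and the calling context are hypothetical.
 *
 *	struct gfn_to_hva_cache ghc;
 *	struct my_shared_page page = { .version = 0 };
 *
 *	if (kvm_gfn_to_hva_cache_init(kvm, &ghc, gpa, sizeof(page)))
 *		return -EFAULT;
 *
 *	page.version++;
 *	if (kvm_write_guest_offset_cached(kvm, &ghc, &page.version,
 *					  offsetof(struct my_shared_page, version),
 *					  sizeof(page.version)))
 *		return -EFAULT;
 *
 * As long as the memslot generation has not changed, the cached hva is
 * used directly; otherwise the cache is re-initialized on the fly, and
 * accesses whose initialization spanned a page boundary fall back to the
 * uncached kvm_write_guest()/kvm_read_guest() paths.
 */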