1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Kernel-based Virtual Machine driver for Linux 4 * 5 * This module enables machines with Intel VT-x extensions to run virtual 6 * machines without emulation or binary translation. 7 * 8 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 10 * 11 * Authors: 12 * Avi Kivity <avi@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com> 14 */ 15 16 #include <kvm/iodev.h> 17 18 #include <linux/kvm_host.h> 19 #include <linux/kvm.h> 20 #include <linux/module.h> 21 #include <linux/errno.h> 22 #include <linux/percpu.h> 23 #include <linux/mm.h> 24 #include <linux/miscdevice.h> 25 #include <linux/vmalloc.h> 26 #include <linux/reboot.h> 27 #include <linux/debugfs.h> 28 #include <linux/highmem.h> 29 #include <linux/file.h> 30 #include <linux/syscore_ops.h> 31 #include <linux/cpu.h> 32 #include <linux/sched/signal.h> 33 #include <linux/sched/mm.h> 34 #include <linux/sched/stat.h> 35 #include <linux/cpumask.h> 36 #include <linux/smp.h> 37 #include <linux/anon_inodes.h> 38 #include <linux/profile.h> 39 #include <linux/kvm_para.h> 40 #include <linux/pagemap.h> 41 #include <linux/mman.h> 42 #include <linux/swap.h> 43 #include <linux/bitops.h> 44 #include <linux/spinlock.h> 45 #include <linux/compat.h> 46 #include <linux/srcu.h> 47 #include <linux/hugetlb.h> 48 #include <linux/slab.h> 49 #include <linux/sort.h> 50 #include <linux/bsearch.h> 51 #include <linux/io.h> 52 #include <linux/lockdep.h> 53 #include <linux/kthread.h> 54 55 #include <asm/processor.h> 56 #include <asm/ioctl.h> 57 #include <linux/uaccess.h> 58 #include <asm/pgtable.h> 59 60 #include "coalesced_mmio.h" 61 #include "async_pf.h" 62 #include "vfio.h" 63 64 #define CREATE_TRACE_POINTS 65 #include <trace/events/kvm.h> 66 67 /* Worst case buffer size needed for holding an integer. */ 68 #define ITOA_MAX_LEN 12 69 70 MODULE_AUTHOR("Qumranet"); 71 MODULE_LICENSE("GPL"); 72 73 /* Architectures should define their poll value according to the halt latency */ 74 unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT; 75 module_param(halt_poll_ns, uint, 0644); 76 EXPORT_SYMBOL_GPL(halt_poll_ns); 77 78 /* Default doubles per-vcpu halt_poll_ns. */ 79 unsigned int halt_poll_ns_grow = 2; 80 module_param(halt_poll_ns_grow, uint, 0644); 81 EXPORT_SYMBOL_GPL(halt_poll_ns_grow); 82 83 /* The start value to grow halt_poll_ns from */ 84 unsigned int halt_poll_ns_grow_start = 10000; /* 10us */ 85 module_param(halt_poll_ns_grow_start, uint, 0644); 86 EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start); 87 88 /* Default resets per-vcpu halt_poll_ns . 
*/ 89 unsigned int halt_poll_ns_shrink; 90 module_param(halt_poll_ns_shrink, uint, 0644); 91 EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); 92 93 /* 94 * Ordering of locks: 95 * 96 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock 97 */ 98 99 DEFINE_MUTEX(kvm_lock); 100 static DEFINE_RAW_SPINLOCK(kvm_count_lock); 101 LIST_HEAD(vm_list); 102 103 static cpumask_var_t cpus_hardware_enabled; 104 static int kvm_usage_count; 105 static atomic_t hardware_enable_failed; 106 107 struct kmem_cache *kvm_vcpu_cache; 108 EXPORT_SYMBOL_GPL(kvm_vcpu_cache); 109 110 static __read_mostly struct preempt_ops kvm_preempt_ops; 111 112 struct dentry *kvm_debugfs_dir; 113 EXPORT_SYMBOL_GPL(kvm_debugfs_dir); 114 115 static int kvm_debugfs_num_entries; 116 static const struct file_operations *stat_fops_per_vm[]; 117 118 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 119 unsigned long arg); 120 #ifdef CONFIG_KVM_COMPAT 121 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, 122 unsigned long arg); 123 #define KVM_COMPAT(c) .compat_ioctl = (c) 124 #else 125 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl, 126 unsigned long arg) { return -EINVAL; } 127 #define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl 128 #endif 129 static int hardware_enable_all(void); 130 static void hardware_disable_all(void); 131 132 static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 133 134 static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn); 135 136 __visible bool kvm_rebooting; 137 EXPORT_SYMBOL_GPL(kvm_rebooting); 138 139 static bool largepages_enabled = true; 140 141 #define KVM_EVENT_CREATE_VM 0 142 #define KVM_EVENT_DESTROY_VM 1 143 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); 144 static unsigned long long kvm_createvm_count; 145 static unsigned long long kvm_active_vms; 146 147 __weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, 148 unsigned long start, unsigned long end, bool blockable) 149 { 150 return 0; 151 } 152 153 bool kvm_is_reserved_pfn(kvm_pfn_t pfn) 154 { 155 if (pfn_valid(pfn)) 156 return PageReserved(pfn_to_page(pfn)); 157 158 return true; 159 } 160 161 /* 162 * Switches to specified vcpu, until a matching vcpu_put() 163 */ 164 void vcpu_load(struct kvm_vcpu *vcpu) 165 { 166 int cpu = get_cpu(); 167 preempt_notifier_register(&vcpu->preempt_notifier); 168 kvm_arch_vcpu_load(vcpu, cpu); 169 put_cpu(); 170 } 171 EXPORT_SYMBOL_GPL(vcpu_load); 172 173 void vcpu_put(struct kvm_vcpu *vcpu) 174 { 175 preempt_disable(); 176 kvm_arch_vcpu_put(vcpu); 177 preempt_notifier_unregister(&vcpu->preempt_notifier); 178 preempt_enable(); 179 } 180 EXPORT_SYMBOL_GPL(vcpu_put); 181 182 /* TODO: merge with kvm_arch_vcpu_should_kick */ 183 static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req) 184 { 185 int mode = kvm_vcpu_exiting_guest_mode(vcpu); 186 187 /* 188 * We need to wait for the VCPU to reenable interrupts and get out of 189 * READING_SHADOW_PAGE_TABLES mode. 190 */ 191 if (req & KVM_REQUEST_WAIT) 192 return mode != OUTSIDE_GUEST_MODE; 193 194 /* 195 * Need to kick a running VCPU, but otherwise there is nothing to do. 
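	 * A vcpu that has already left guest mode will notice the pending
	 * request before its next VM entry, so no IPI is needed for it.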
196 */ 197 return mode == IN_GUEST_MODE; 198 } 199 200 static void ack_flush(void *_completed) 201 { 202 } 203 204 static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait) 205 { 206 if (unlikely(!cpus)) 207 cpus = cpu_online_mask; 208 209 if (cpumask_empty(cpus)) 210 return false; 211 212 smp_call_function_many(cpus, ack_flush, NULL, wait); 213 return true; 214 } 215 216 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, 217 unsigned long *vcpu_bitmap, cpumask_var_t tmp) 218 { 219 int i, cpu, me; 220 struct kvm_vcpu *vcpu; 221 bool called; 222 223 me = get_cpu(); 224 225 kvm_for_each_vcpu(i, vcpu, kvm) { 226 if (vcpu_bitmap && !test_bit(i, vcpu_bitmap)) 227 continue; 228 229 kvm_make_request(req, vcpu); 230 cpu = vcpu->cpu; 231 232 if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu)) 233 continue; 234 235 if (tmp != NULL && cpu != -1 && cpu != me && 236 kvm_request_needs_ipi(vcpu, req)) 237 __cpumask_set_cpu(cpu, tmp); 238 } 239 240 called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT)); 241 put_cpu(); 242 243 return called; 244 } 245 246 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) 247 { 248 cpumask_var_t cpus; 249 bool called; 250 251 zalloc_cpumask_var(&cpus, GFP_ATOMIC); 252 253 called = kvm_make_vcpus_request_mask(kvm, req, NULL, cpus); 254 255 free_cpumask_var(cpus); 256 return called; 257 } 258 259 #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL 260 void kvm_flush_remote_tlbs(struct kvm *kvm) 261 { 262 /* 263 * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in 264 * kvm_make_all_cpus_request. 265 */ 266 long dirty_count = smp_load_acquire(&kvm->tlbs_dirty); 267 268 /* 269 * We want to publish modifications to the page tables before reading 270 * mode. Pairs with a memory barrier in arch-specific code. 271 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest 272 * and smp_mb in walk_shadow_page_lockless_begin/end. 273 * - powerpc: smp_mb in kvmppc_prepare_to_enter. 274 * 275 * There is already an smp_mb__after_atomic() before 276 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that 277 * barrier here. 
278 */ 279 if (!kvm_arch_flush_remote_tlb(kvm) 280 || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 281 ++kvm->stat.remote_tlb_flush; 282 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); 283 } 284 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); 285 #endif 286 287 void kvm_reload_remote_mmus(struct kvm *kvm) 288 { 289 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 290 } 291 292 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 293 { 294 struct page *page; 295 int r; 296 297 mutex_init(&vcpu->mutex); 298 vcpu->cpu = -1; 299 vcpu->kvm = kvm; 300 vcpu->vcpu_id = id; 301 vcpu->pid = NULL; 302 init_swait_queue_head(&vcpu->wq); 303 kvm_async_pf_vcpu_init(vcpu); 304 305 vcpu->pre_pcpu = -1; 306 INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); 307 308 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 309 if (!page) { 310 r = -ENOMEM; 311 goto fail; 312 } 313 vcpu->run = page_address(page); 314 315 kvm_vcpu_set_in_spin_loop(vcpu, false); 316 kvm_vcpu_set_dy_eligible(vcpu, false); 317 vcpu->preempted = false; 318 vcpu->ready = false; 319 320 r = kvm_arch_vcpu_init(vcpu); 321 if (r < 0) 322 goto fail_free_run; 323 return 0; 324 325 fail_free_run: 326 free_page((unsigned long)vcpu->run); 327 fail: 328 return r; 329 } 330 EXPORT_SYMBOL_GPL(kvm_vcpu_init); 331 332 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) 333 { 334 /* 335 * no need for rcu_read_lock as VCPU_RUN is the only place that 336 * will change the vcpu->pid pointer and on uninit all file 337 * descriptors are already gone. 338 */ 339 put_pid(rcu_dereference_protected(vcpu->pid, 1)); 340 kvm_arch_vcpu_uninit(vcpu); 341 free_page((unsigned long)vcpu->run); 342 } 343 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); 344 345 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 346 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) 347 { 348 return container_of(mn, struct kvm, mmu_notifier); 349 } 350 351 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, 352 struct mm_struct *mm, 353 unsigned long address, 354 pte_t pte) 355 { 356 struct kvm *kvm = mmu_notifier_to_kvm(mn); 357 int idx; 358 359 idx = srcu_read_lock(&kvm->srcu); 360 spin_lock(&kvm->mmu_lock); 361 kvm->mmu_notifier_seq++; 362 363 if (kvm_set_spte_hva(kvm, address, pte)) 364 kvm_flush_remote_tlbs(kvm); 365 366 spin_unlock(&kvm->mmu_lock); 367 srcu_read_unlock(&kvm->srcu, idx); 368 } 369 370 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, 371 const struct mmu_notifier_range *range) 372 { 373 struct kvm *kvm = mmu_notifier_to_kvm(mn); 374 int need_tlb_flush = 0, idx; 375 int ret; 376 377 idx = srcu_read_lock(&kvm->srcu); 378 spin_lock(&kvm->mmu_lock); 379 /* 380 * The count increase must become visible at unlock time as no 381 * spte can be established without taking the mmu_lock and 382 * count is also read inside the mmu_lock critical section. 
383 */ 384 kvm->mmu_notifier_count++; 385 need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end); 386 need_tlb_flush |= kvm->tlbs_dirty; 387 /* we've to flush the tlb before the pages can be freed */ 388 if (need_tlb_flush) 389 kvm_flush_remote_tlbs(kvm); 390 391 spin_unlock(&kvm->mmu_lock); 392 393 ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start, 394 range->end, 395 mmu_notifier_range_blockable(range)); 396 397 srcu_read_unlock(&kvm->srcu, idx); 398 399 return ret; 400 } 401 402 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 403 const struct mmu_notifier_range *range) 404 { 405 struct kvm *kvm = mmu_notifier_to_kvm(mn); 406 407 spin_lock(&kvm->mmu_lock); 408 /* 409 * This sequence increase will notify the kvm page fault that 410 * the page that is going to be mapped in the spte could have 411 * been freed. 412 */ 413 kvm->mmu_notifier_seq++; 414 smp_wmb(); 415 /* 416 * The above sequence increase must be visible before the 417 * below count decrease, which is ensured by the smp_wmb above 418 * in conjunction with the smp_rmb in mmu_notifier_retry(). 419 */ 420 kvm->mmu_notifier_count--; 421 spin_unlock(&kvm->mmu_lock); 422 423 BUG_ON(kvm->mmu_notifier_count < 0); 424 } 425 426 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, 427 struct mm_struct *mm, 428 unsigned long start, 429 unsigned long end) 430 { 431 struct kvm *kvm = mmu_notifier_to_kvm(mn); 432 int young, idx; 433 434 idx = srcu_read_lock(&kvm->srcu); 435 spin_lock(&kvm->mmu_lock); 436 437 young = kvm_age_hva(kvm, start, end); 438 if (young) 439 kvm_flush_remote_tlbs(kvm); 440 441 spin_unlock(&kvm->mmu_lock); 442 srcu_read_unlock(&kvm->srcu, idx); 443 444 return young; 445 } 446 447 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, 448 struct mm_struct *mm, 449 unsigned long start, 450 unsigned long end) 451 { 452 struct kvm *kvm = mmu_notifier_to_kvm(mn); 453 int young, idx; 454 455 idx = srcu_read_lock(&kvm->srcu); 456 spin_lock(&kvm->mmu_lock); 457 /* 458 * Even though we do not flush TLB, this will still adversely 459 * affect performance on pre-Haswell Intel EPT, where there is 460 * no EPT Access Bit to clear so that we have to tear down EPT 461 * tables instead. If we find this unacceptable, we can always 462 * add a parameter to kvm_age_hva so that it effectively doesn't 463 * do anything on clear_young. 464 * 465 * Also note that currently we never issue secondary TLB flushes 466 * from clear_young, leaving this job up to the regular system 467 * cadence. If we find this inaccurate, we might come up with a 468 * more sophisticated heuristic later. 
469 */ 470 young = kvm_age_hva(kvm, start, end); 471 spin_unlock(&kvm->mmu_lock); 472 srcu_read_unlock(&kvm->srcu, idx); 473 474 return young; 475 } 476 477 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, 478 struct mm_struct *mm, 479 unsigned long address) 480 { 481 struct kvm *kvm = mmu_notifier_to_kvm(mn); 482 int young, idx; 483 484 idx = srcu_read_lock(&kvm->srcu); 485 spin_lock(&kvm->mmu_lock); 486 young = kvm_test_age_hva(kvm, address); 487 spin_unlock(&kvm->mmu_lock); 488 srcu_read_unlock(&kvm->srcu, idx); 489 490 return young; 491 } 492 493 static void kvm_mmu_notifier_release(struct mmu_notifier *mn, 494 struct mm_struct *mm) 495 { 496 struct kvm *kvm = mmu_notifier_to_kvm(mn); 497 int idx; 498 499 idx = srcu_read_lock(&kvm->srcu); 500 kvm_arch_flush_shadow_all(kvm); 501 srcu_read_unlock(&kvm->srcu, idx); 502 } 503 504 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { 505 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 506 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 507 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 508 .clear_young = kvm_mmu_notifier_clear_young, 509 .test_young = kvm_mmu_notifier_test_young, 510 .change_pte = kvm_mmu_notifier_change_pte, 511 .release = kvm_mmu_notifier_release, 512 }; 513 514 static int kvm_init_mmu_notifier(struct kvm *kvm) 515 { 516 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; 517 return mmu_notifier_register(&kvm->mmu_notifier, current->mm); 518 } 519 520 #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ 521 522 static int kvm_init_mmu_notifier(struct kvm *kvm) 523 { 524 return 0; 525 } 526 527 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ 528 529 static struct kvm_memslots *kvm_alloc_memslots(void) 530 { 531 int i; 532 struct kvm_memslots *slots; 533 534 slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT); 535 if (!slots) 536 return NULL; 537 538 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) 539 slots->id_to_index[i] = slots->memslots[i].id = i; 540 541 return slots; 542 } 543 544 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) 545 { 546 if (!memslot->dirty_bitmap) 547 return; 548 549 kvfree(memslot->dirty_bitmap); 550 memslot->dirty_bitmap = NULL; 551 } 552 553 /* 554 * Free any memory in @free but not in @dont. 
555 */ 556 static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, 557 struct kvm_memory_slot *dont) 558 { 559 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 560 kvm_destroy_dirty_bitmap(free); 561 562 kvm_arch_free_memslot(kvm, free, dont); 563 564 free->npages = 0; 565 } 566 567 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots) 568 { 569 struct kvm_memory_slot *memslot; 570 571 if (!slots) 572 return; 573 574 kvm_for_each_memslot(memslot, slots) 575 kvm_free_memslot(kvm, memslot, NULL); 576 577 kvfree(slots); 578 } 579 580 static void kvm_destroy_vm_debugfs(struct kvm *kvm) 581 { 582 int i; 583 584 if (!kvm->debugfs_dentry) 585 return; 586 587 debugfs_remove_recursive(kvm->debugfs_dentry); 588 589 if (kvm->debugfs_stat_data) { 590 for (i = 0; i < kvm_debugfs_num_entries; i++) 591 kfree(kvm->debugfs_stat_data[i]); 592 kfree(kvm->debugfs_stat_data); 593 } 594 } 595 596 static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) 597 { 598 char dir_name[ITOA_MAX_LEN * 2]; 599 struct kvm_stat_data *stat_data; 600 struct kvm_stats_debugfs_item *p; 601 602 if (!debugfs_initialized()) 603 return 0; 604 605 snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd); 606 kvm->debugfs_dentry = debugfs_create_dir(dir_name, kvm_debugfs_dir); 607 608 kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries, 609 sizeof(*kvm->debugfs_stat_data), 610 GFP_KERNEL_ACCOUNT); 611 if (!kvm->debugfs_stat_data) 612 return -ENOMEM; 613 614 for (p = debugfs_entries; p->name; p++) { 615 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT); 616 if (!stat_data) 617 return -ENOMEM; 618 619 stat_data->kvm = kvm; 620 stat_data->offset = p->offset; 621 stat_data->mode = p->mode ? p->mode : 0644; 622 kvm->debugfs_stat_data[p - debugfs_entries] = stat_data; 623 debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry, 624 stat_data, stat_fops_per_vm[p->kind]); 625 } 626 return 0; 627 } 628 629 /* 630 * Called after the VM is otherwise initialized, but just before adding it to 631 * the vm_list. 632 */ 633 int __weak kvm_arch_post_init_vm(struct kvm *kvm) 634 { 635 return 0; 636 } 637 638 /* 639 * Called just after removing the VM from the vm_list, but before doing any 640 * other destruction. 641 */ 642 void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm) 643 { 644 } 645 646 static struct kvm *kvm_create_vm(unsigned long type) 647 { 648 struct kvm *kvm = kvm_arch_alloc_vm(); 649 int r = -ENOMEM; 650 int i; 651 652 if (!kvm) 653 return ERR_PTR(-ENOMEM); 654 655 spin_lock_init(&kvm->mmu_lock); 656 mmgrab(current->mm); 657 kvm->mm = current->mm; 658 kvm_eventfd_init(kvm); 659 mutex_init(&kvm->lock); 660 mutex_init(&kvm->irq_lock); 661 mutex_init(&kvm->slots_lock); 662 INIT_LIST_HEAD(&kvm->devices); 663 664 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); 665 666 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 667 struct kvm_memslots *slots = kvm_alloc_memslots(); 668 669 if (!slots) 670 goto out_err_no_arch_destroy_vm; 671 /* Generations must be different for each address space. 
*/ 672 slots->generation = i; 673 rcu_assign_pointer(kvm->memslots[i], slots); 674 } 675 676 for (i = 0; i < KVM_NR_BUSES; i++) { 677 rcu_assign_pointer(kvm->buses[i], 678 kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT)); 679 if (!kvm->buses[i]) 680 goto out_err_no_arch_destroy_vm; 681 } 682 683 refcount_set(&kvm->users_count, 1); 684 r = kvm_arch_init_vm(kvm, type); 685 if (r) 686 goto out_err_no_arch_destroy_vm; 687 688 r = hardware_enable_all(); 689 if (r) 690 goto out_err_no_disable; 691 692 #ifdef CONFIG_HAVE_KVM_IRQFD 693 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); 694 #endif 695 696 if (init_srcu_struct(&kvm->srcu)) 697 goto out_err_no_srcu; 698 if (init_srcu_struct(&kvm->irq_srcu)) 699 goto out_err_no_irq_srcu; 700 701 r = kvm_init_mmu_notifier(kvm); 702 if (r) 703 goto out_err_no_mmu_notifier; 704 705 r = kvm_arch_post_init_vm(kvm); 706 if (r) 707 goto out_err; 708 709 mutex_lock(&kvm_lock); 710 list_add(&kvm->vm_list, &vm_list); 711 mutex_unlock(&kvm_lock); 712 713 preempt_notifier_inc(); 714 715 return kvm; 716 717 out_err: 718 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 719 if (kvm->mmu_notifier.ops) 720 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); 721 #endif 722 out_err_no_mmu_notifier: 723 cleanup_srcu_struct(&kvm->irq_srcu); 724 out_err_no_irq_srcu: 725 cleanup_srcu_struct(&kvm->srcu); 726 out_err_no_srcu: 727 hardware_disable_all(); 728 out_err_no_disable: 729 kvm_arch_destroy_vm(kvm); 730 WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count)); 731 out_err_no_arch_destroy_vm: 732 for (i = 0; i < KVM_NR_BUSES; i++) 733 kfree(kvm_get_bus(kvm, i)); 734 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 735 kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); 736 kvm_arch_free_vm(kvm); 737 mmdrop(current->mm); 738 return ERR_PTR(r); 739 } 740 741 static void kvm_destroy_devices(struct kvm *kvm) 742 { 743 struct kvm_device *dev, *tmp; 744 745 /* 746 * We do not need to take the kvm->lock here, because nobody else 747 * has a reference to the struct kvm at this point and therefore 748 * cannot access the devices list anyhow. 
749 */ 750 list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) { 751 list_del(&dev->vm_node); 752 dev->ops->destroy(dev); 753 } 754 } 755 756 static void kvm_destroy_vm(struct kvm *kvm) 757 { 758 int i; 759 struct mm_struct *mm = kvm->mm; 760 761 kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); 762 kvm_destroy_vm_debugfs(kvm); 763 kvm_arch_sync_events(kvm); 764 mutex_lock(&kvm_lock); 765 list_del(&kvm->vm_list); 766 mutex_unlock(&kvm_lock); 767 kvm_arch_pre_destroy_vm(kvm); 768 769 kvm_free_irq_routing(kvm); 770 for (i = 0; i < KVM_NR_BUSES; i++) { 771 struct kvm_io_bus *bus = kvm_get_bus(kvm, i); 772 773 if (bus) 774 kvm_io_bus_destroy(bus); 775 kvm->buses[i] = NULL; 776 } 777 kvm_coalesced_mmio_free(kvm); 778 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 779 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 780 #else 781 kvm_arch_flush_shadow_all(kvm); 782 #endif 783 kvm_arch_destroy_vm(kvm); 784 kvm_destroy_devices(kvm); 785 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 786 kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); 787 cleanup_srcu_struct(&kvm->irq_srcu); 788 cleanup_srcu_struct(&kvm->srcu); 789 kvm_arch_free_vm(kvm); 790 preempt_notifier_dec(); 791 hardware_disable_all(); 792 mmdrop(mm); 793 } 794 795 void kvm_get_kvm(struct kvm *kvm) 796 { 797 refcount_inc(&kvm->users_count); 798 } 799 EXPORT_SYMBOL_GPL(kvm_get_kvm); 800 801 void kvm_put_kvm(struct kvm *kvm) 802 { 803 if (refcount_dec_and_test(&kvm->users_count)) 804 kvm_destroy_vm(kvm); 805 } 806 EXPORT_SYMBOL_GPL(kvm_put_kvm); 807 808 809 static int kvm_vm_release(struct inode *inode, struct file *filp) 810 { 811 struct kvm *kvm = filp->private_data; 812 813 kvm_irqfd_release(kvm); 814 815 kvm_put_kvm(kvm); 816 return 0; 817 } 818 819 /* 820 * Allocation size is twice as large as the actual dirty bitmap size. 821 * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed. 822 */ 823 static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) 824 { 825 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); 826 827 memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT); 828 if (!memslot->dirty_bitmap) 829 return -ENOMEM; 830 831 return 0; 832 } 833 834 /* 835 * Insert memslot and re-sort memslots based on their GFN, 836 * so binary search could be used to lookup GFN. 837 * Sorting algorithm takes advantage of having initially 838 * sorted array and known changed memslot position. 839 */ 840 static void update_memslots(struct kvm_memslots *slots, 841 struct kvm_memory_slot *new, 842 enum kvm_mr_change change) 843 { 844 int id = new->id; 845 int i = slots->id_to_index[id]; 846 struct kvm_memory_slot *mslots = slots->memslots; 847 848 WARN_ON(mslots[i].id != id); 849 switch (change) { 850 case KVM_MR_CREATE: 851 slots->used_slots++; 852 WARN_ON(mslots[i].npages || !new->npages); 853 break; 854 case KVM_MR_DELETE: 855 slots->used_slots--; 856 WARN_ON(new->npages || !mslots[i].npages); 857 break; 858 default: 859 break; 860 } 861 862 while (i < KVM_MEM_SLOTS_NUM - 1 && 863 new->base_gfn <= mslots[i + 1].base_gfn) { 864 if (!mslots[i + 1].npages) 865 break; 866 mslots[i] = mslots[i + 1]; 867 slots->id_to_index[mslots[i].id] = i; 868 i++; 869 } 870 871 /* 872 * The ">=" is needed when creating a slot with base_gfn == 0, 873 * so that it moves before all those with base_gfn == npages == 0. 
	 *
	 * On the other hand, if new->npages is zero, the above loop has
	 * already left i pointing to the beginning of the empty part of
	 * mslots, and the ">=" would move the hole backwards in this
	 * case---which is wrong.  So skip the loop when deleting a slot.
	 */
	if (new->npages) {
		while (i > 0 &&
		       new->base_gfn >= mslots[i - 1].base_gfn) {
			mslots[i] = mslots[i - 1];
			slots->id_to_index[mslots[i].id] = i;
			i--;
		}
	} else
		WARN_ON_ONCE(i != slots->used_slots);

	mslots[i] = *new;
	slots->id_to_index[mslots[i].id] = i;
}

static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
{
	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;

#ifdef __KVM_HAVE_READONLY_MEM
	valid_flags |= KVM_MEM_READONLY;
#endif

	if (mem->flags & ~valid_flags)
		return -EINVAL;

	return 0;
}

static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
		int as_id, struct kvm_memslots *slots)
{
	struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
	u64 gen = old_memslots->generation;

	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
	slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

	rcu_assign_pointer(kvm->memslots[as_id], slots);
	synchronize_srcu_expedited(&kvm->srcu);

	/*
	 * Increment the new memslot generation a second time, dropping the
	 * update in-progress flag and incrementing the generation based on
	 * the number of address spaces.  This provides a unique and easily
	 * identifiable generation number while the memslots are in flux.
	 */
	gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

	/*
	 * Generations must be unique even across address spaces.  We do not need
	 * a global counter for that, instead the generation space is evenly split
	 * across address spaces.  For example, with two address spaces, address
	 * space 0 will use generations 0, 2, 4, ... while address space 1 will
	 * use generations 1, 3, 5, ...
	 */
	gen += KVM_ADDRESS_SPACE_NUM;

	kvm_arch_memslots_updated(kvm, gen);

	slots->generation = gen;

	return old_memslots;
}

/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
 *
 * Must be called holding kvm->slots_lock for write.
 */
int __kvm_set_memory_region(struct kvm *kvm,
			    const struct kvm_userspace_memory_region *mem)
{
	int r;
	gfn_t base_gfn;
	unsigned long npages;
	struct kvm_memory_slot *slot;
	struct kvm_memory_slot old, new;
	struct kvm_memslots *slots = NULL, *old_memslots;
	int as_id, id;
	enum kvm_mr_change change;

	r = check_memory_region_flags(mem);
	if (r)
		goto out;

	r = -EINVAL;
	as_id = mem->slot >> 16;
	id = (u16)mem->slot;

	/* General sanity checks */
	if (mem->memory_size & (PAGE_SIZE - 1))
		goto out;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		goto out;
	/* We can read the guest memory with __xxx_user() later on.
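	 * Requiring a page-aligned userspace_addr that passes access_ok()
	 * here is what keeps those later __copy_*_user() accesses safe.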
*/ 978 if ((id < KVM_USER_MEM_SLOTS) && 979 ((mem->userspace_addr & (PAGE_SIZE - 1)) || 980 !access_ok((void __user *)(unsigned long)mem->userspace_addr, 981 mem->memory_size))) 982 goto out; 983 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM) 984 goto out; 985 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 986 goto out; 987 988 slot = id_to_memslot(__kvm_memslots(kvm, as_id), id); 989 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 990 npages = mem->memory_size >> PAGE_SHIFT; 991 992 if (npages > KVM_MEM_MAX_NR_PAGES) 993 goto out; 994 995 new = old = *slot; 996 997 new.id = id; 998 new.base_gfn = base_gfn; 999 new.npages = npages; 1000 new.flags = mem->flags; 1001 1002 if (npages) { 1003 if (!old.npages) 1004 change = KVM_MR_CREATE; 1005 else { /* Modify an existing slot. */ 1006 if ((mem->userspace_addr != old.userspace_addr) || 1007 (npages != old.npages) || 1008 ((new.flags ^ old.flags) & KVM_MEM_READONLY)) 1009 goto out; 1010 1011 if (base_gfn != old.base_gfn) 1012 change = KVM_MR_MOVE; 1013 else if (new.flags != old.flags) 1014 change = KVM_MR_FLAGS_ONLY; 1015 else { /* Nothing to change. */ 1016 r = 0; 1017 goto out; 1018 } 1019 } 1020 } else { 1021 if (!old.npages) 1022 goto out; 1023 1024 change = KVM_MR_DELETE; 1025 new.base_gfn = 0; 1026 new.flags = 0; 1027 } 1028 1029 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { 1030 /* Check for overlaps */ 1031 r = -EEXIST; 1032 kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) { 1033 if (slot->id == id) 1034 continue; 1035 if (!((base_gfn + npages <= slot->base_gfn) || 1036 (base_gfn >= slot->base_gfn + slot->npages))) 1037 goto out; 1038 } 1039 } 1040 1041 /* Free page dirty bitmap if unneeded */ 1042 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 1043 new.dirty_bitmap = NULL; 1044 1045 r = -ENOMEM; 1046 if (change == KVM_MR_CREATE) { 1047 new.userspace_addr = mem->userspace_addr; 1048 1049 if (kvm_arch_create_memslot(kvm, &new, npages)) 1050 goto out_free; 1051 } 1052 1053 /* Allocate page dirty bitmap if needed */ 1054 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 1055 if (kvm_create_dirty_bitmap(&new) < 0) 1056 goto out_free; 1057 } 1058 1059 slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT); 1060 if (!slots) 1061 goto out_free; 1062 memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots)); 1063 1064 if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) { 1065 slot = id_to_memslot(slots, id); 1066 slot->flags |= KVM_MEMSLOT_INVALID; 1067 1068 old_memslots = install_new_memslots(kvm, as_id, slots); 1069 1070 /* From this point no new shadow pages pointing to a deleted, 1071 * or moved, memslot will be created. 1072 * 1073 * validation of sp->gfn happens in: 1074 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) 1075 * - kvm_is_visible_gfn (mmu_check_roots) 1076 */ 1077 kvm_arch_flush_shadow_memslot(kvm, slot); 1078 1079 /* 1080 * We can re-use the old_memslots from above, the only difference 1081 * from the currently installed memslots is the invalid flag. This 1082 * will get overwritten by update_memslots anyway. 
1083 */ 1084 slots = old_memslots; 1085 } 1086 1087 r = kvm_arch_prepare_memory_region(kvm, &new, mem, change); 1088 if (r) 1089 goto out_slots; 1090 1091 /* actual memory is freed via old in kvm_free_memslot below */ 1092 if (change == KVM_MR_DELETE) { 1093 new.dirty_bitmap = NULL; 1094 memset(&new.arch, 0, sizeof(new.arch)); 1095 } 1096 1097 update_memslots(slots, &new, change); 1098 old_memslots = install_new_memslots(kvm, as_id, slots); 1099 1100 kvm_arch_commit_memory_region(kvm, mem, &old, &new, change); 1101 1102 kvm_free_memslot(kvm, &old, &new); 1103 kvfree(old_memslots); 1104 return 0; 1105 1106 out_slots: 1107 kvfree(slots); 1108 out_free: 1109 kvm_free_memslot(kvm, &new, &old); 1110 out: 1111 return r; 1112 } 1113 EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 1114 1115 int kvm_set_memory_region(struct kvm *kvm, 1116 const struct kvm_userspace_memory_region *mem) 1117 { 1118 int r; 1119 1120 mutex_lock(&kvm->slots_lock); 1121 r = __kvm_set_memory_region(kvm, mem); 1122 mutex_unlock(&kvm->slots_lock); 1123 return r; 1124 } 1125 EXPORT_SYMBOL_GPL(kvm_set_memory_region); 1126 1127 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 1128 struct kvm_userspace_memory_region *mem) 1129 { 1130 if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) 1131 return -EINVAL; 1132 1133 return kvm_set_memory_region(kvm, mem); 1134 } 1135 1136 int kvm_get_dirty_log(struct kvm *kvm, 1137 struct kvm_dirty_log *log, int *is_dirty) 1138 { 1139 struct kvm_memslots *slots; 1140 struct kvm_memory_slot *memslot; 1141 int i, as_id, id; 1142 unsigned long n; 1143 unsigned long any = 0; 1144 1145 as_id = log->slot >> 16; 1146 id = (u16)log->slot; 1147 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1148 return -EINVAL; 1149 1150 slots = __kvm_memslots(kvm, as_id); 1151 memslot = id_to_memslot(slots, id); 1152 if (!memslot->dirty_bitmap) 1153 return -ENOENT; 1154 1155 n = kvm_dirty_bitmap_bytes(memslot); 1156 1157 for (i = 0; !any && i < n/sizeof(long); ++i) 1158 any = memslot->dirty_bitmap[i]; 1159 1160 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 1161 return -EFAULT; 1162 1163 if (any) 1164 *is_dirty = 1; 1165 return 0; 1166 } 1167 EXPORT_SYMBOL_GPL(kvm_get_dirty_log); 1168 1169 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 1170 /** 1171 * kvm_get_dirty_log_protect - get a snapshot of dirty pages 1172 * and reenable dirty page tracking for the corresponding pages. 1173 * @kvm: pointer to kvm instance 1174 * @log: slot id and address to which we copy the log 1175 * @flush: true if TLB flush is needed by caller 1176 * 1177 * We need to keep it in mind that VCPU threads can write to the bitmap 1178 * concurrently. So, to avoid losing track of dirty pages we keep the 1179 * following order: 1180 * 1181 * 1. Take a snapshot of the bit and clear it if needed. 1182 * 2. Write protect the corresponding page. 1183 * 3. Copy the snapshot to the userspace. 1184 * 4. Upon return caller flushes TLB's if needed. 1185 * 1186 * Between 2 and 4, the guest may write to the page using the remaining TLB 1187 * entry. This is not a problem because the page is reported dirty using 1188 * the snapshot taken before and step 4 ensures that writes done after 1189 * exiting to userspace will be logged for the next call. 
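 *
 * When kvm->manual_dirty_log_protect is set, the bitmap is copied out
 * without being cleared here; clearing and re-protection are deferred to
 * KVM_CLEAR_DIRTY_LOG and *flush is always returned as false.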
1190 * 1191 */ 1192 int kvm_get_dirty_log_protect(struct kvm *kvm, 1193 struct kvm_dirty_log *log, bool *flush) 1194 { 1195 struct kvm_memslots *slots; 1196 struct kvm_memory_slot *memslot; 1197 int i, as_id, id; 1198 unsigned long n; 1199 unsigned long *dirty_bitmap; 1200 unsigned long *dirty_bitmap_buffer; 1201 1202 as_id = log->slot >> 16; 1203 id = (u16)log->slot; 1204 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1205 return -EINVAL; 1206 1207 slots = __kvm_memslots(kvm, as_id); 1208 memslot = id_to_memslot(slots, id); 1209 1210 dirty_bitmap = memslot->dirty_bitmap; 1211 if (!dirty_bitmap) 1212 return -ENOENT; 1213 1214 n = kvm_dirty_bitmap_bytes(memslot); 1215 *flush = false; 1216 if (kvm->manual_dirty_log_protect) { 1217 /* 1218 * Unlike kvm_get_dirty_log, we always return false in *flush, 1219 * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There 1220 * is some code duplication between this function and 1221 * kvm_get_dirty_log, but hopefully all architecture 1222 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log 1223 * can be eliminated. 1224 */ 1225 dirty_bitmap_buffer = dirty_bitmap; 1226 } else { 1227 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); 1228 memset(dirty_bitmap_buffer, 0, n); 1229 1230 spin_lock(&kvm->mmu_lock); 1231 for (i = 0; i < n / sizeof(long); i++) { 1232 unsigned long mask; 1233 gfn_t offset; 1234 1235 if (!dirty_bitmap[i]) 1236 continue; 1237 1238 *flush = true; 1239 mask = xchg(&dirty_bitmap[i], 0); 1240 dirty_bitmap_buffer[i] = mask; 1241 1242 offset = i * BITS_PER_LONG; 1243 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, 1244 offset, mask); 1245 } 1246 spin_unlock(&kvm->mmu_lock); 1247 } 1248 1249 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) 1250 return -EFAULT; 1251 return 0; 1252 } 1253 EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); 1254 1255 /** 1256 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap 1257 * and reenable dirty page tracking for the corresponding pages. 
1258 * @kvm: pointer to kvm instance 1259 * @log: slot id and address from which to fetch the bitmap of dirty pages 1260 * @flush: true if TLB flush is needed by caller 1261 */ 1262 int kvm_clear_dirty_log_protect(struct kvm *kvm, 1263 struct kvm_clear_dirty_log *log, bool *flush) 1264 { 1265 struct kvm_memslots *slots; 1266 struct kvm_memory_slot *memslot; 1267 int as_id, id; 1268 gfn_t offset; 1269 unsigned long i, n; 1270 unsigned long *dirty_bitmap; 1271 unsigned long *dirty_bitmap_buffer; 1272 1273 as_id = log->slot >> 16; 1274 id = (u16)log->slot; 1275 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1276 return -EINVAL; 1277 1278 if (log->first_page & 63) 1279 return -EINVAL; 1280 1281 slots = __kvm_memslots(kvm, as_id); 1282 memslot = id_to_memslot(slots, id); 1283 1284 dirty_bitmap = memslot->dirty_bitmap; 1285 if (!dirty_bitmap) 1286 return -ENOENT; 1287 1288 n = ALIGN(log->num_pages, BITS_PER_LONG) / 8; 1289 1290 if (log->first_page > memslot->npages || 1291 log->num_pages > memslot->npages - log->first_page || 1292 (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63))) 1293 return -EINVAL; 1294 1295 *flush = false; 1296 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); 1297 if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n)) 1298 return -EFAULT; 1299 1300 spin_lock(&kvm->mmu_lock); 1301 for (offset = log->first_page, i = offset / BITS_PER_LONG, 1302 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--; 1303 i++, offset += BITS_PER_LONG) { 1304 unsigned long mask = *dirty_bitmap_buffer++; 1305 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i]; 1306 if (!mask) 1307 continue; 1308 1309 mask &= atomic_long_fetch_andnot(mask, p); 1310 1311 /* 1312 * mask contains the bits that really have been cleared. This 1313 * never includes any bits beyond the length of the memslot (if 1314 * the length is not aligned to 64 pages), therefore it is not 1315 * a problem if userspace sets them in log->dirty_bitmap. 
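		 *
		 * atomic_long_fetch_andnot() returns the old word, so mask
		 * ends up holding only the bits that were actually set and
		 * have now been cleared.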
 */
		if (mask) {
			*flush = true;
			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
								offset, mask);
		}
	}
	spin_unlock(&kvm->mmu_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_clear_dirty_log_protect);
#endif

bool kvm_largepages_enabled(void)
{
	return largepages_enabled;
}

void kvm_disable_largepages(void)
{
	largepages_enabled = false;
}
EXPORT_SYMBOL_GPL(kvm_disable_largepages);

struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
}
EXPORT_SYMBOL_GPL(gfn_to_memslot);

struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
}

bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);

	if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS ||
	      memslot->flags & KVM_MEMSLOT_INVALID)
		return false;

	return true;
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);

unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
{
	struct vm_area_struct *vma;
	unsigned long addr, size;

	size = PAGE_SIZE;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return PAGE_SIZE;

	down_read(&current->mm->mmap_sem);
	vma = find_vma(current->mm, addr);
	if (!vma)
		goto out;

	size = vma_kernel_pagesize(vma);

out:
	up_read(&current->mm->mmap_sem);

	return size;
}

static bool memslot_is_readonly(struct kvm_memory_slot *slot)
{
	return slot->flags & KVM_MEM_READONLY;
}

static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
				       gfn_t *nr_pages, bool write)
{
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return KVM_HVA_ERR_BAD;

	if (memslot_is_readonly(slot) && write)
		return KVM_HVA_ERR_RO_BAD;

	if (nr_pages)
		*nr_pages = slot->npages - (gfn - slot->base_gfn);

	return __gfn_to_hva_memslot(slot, gfn);
}

static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
				     gfn_t *nr_pages)
{
	return __gfn_to_hva_many(slot, gfn, nr_pages, true);
}

unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
				 gfn_t gfn)
{
	return gfn_to_hva_many(slot, gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);

unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);

unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);

/*
 * Return the hva of a @gfn and the R/W attribute if possible.
 *
 * @slot: the kvm_memory_slot which contains @gfn
 * @gfn: the gfn to be translated
 * @writable: used to return the read/write attribute of the @slot if the hva
 * is valid and @writable is not NULL
 */
unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
				      gfn_t gfn, bool *writable)
{
	unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);

	if (!kvm_is_error_hva(hva) && writable)
		*writable = !memslot_is_readonly(slot);

	return hva;
}

unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
{
	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);

	return gfn_to_hva_memslot_prot(slot, gfn, writable);
}

unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
{
	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);

	return gfn_to_hva_memslot_prot(slot, gfn, writable);
}

static inline int check_user_page_hwpoison(unsigned long addr)
{
	int rc, flags = FOLL_HWPOISON | FOLL_WRITE;

	rc = get_user_pages(addr, 1, flags, NULL, NULL);
	return rc == -EHWPOISON;
}

/*
 * The fast path to get the writable pfn which will be stored in @pfn,
 * true indicates success, otherwise false is returned.  It's also the
 * only part that runs if we can be in atomic context.
 */
static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
			    bool *writable, kvm_pfn_t *pfn)
{
	struct page *page[1];
	int npages;

	/*
	 * Fast pin a writable pfn only if it is a write fault request
	 * or the caller allows to map a writable pfn for a read fault
	 * request.
	 */
	if (!(write_fault || writable))
		return false;

	npages = __get_user_pages_fast(addr, 1, 1, page);
	if (npages == 1) {
		*pfn = page_to_pfn(page[0]);

		if (writable)
			*writable = true;
		return true;
	}

	return false;
}

/*
 * The slow path to get the pfn of the specified host virtual address,
 * 1 indicates success, -errno is returned if error is detected.
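 *
 * This path may sleep: the page is pinned with get_user_pages_unlocked()
 * using FOLL_HWPOISON, plus FOLL_WRITE for write faults and FOLL_NOWAIT
 * when the caller passed @async.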
1508 */ 1509 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, 1510 bool *writable, kvm_pfn_t *pfn) 1511 { 1512 unsigned int flags = FOLL_HWPOISON; 1513 struct page *page; 1514 int npages = 0; 1515 1516 might_sleep(); 1517 1518 if (writable) 1519 *writable = write_fault; 1520 1521 if (write_fault) 1522 flags |= FOLL_WRITE; 1523 if (async) 1524 flags |= FOLL_NOWAIT; 1525 1526 npages = get_user_pages_unlocked(addr, 1, &page, flags); 1527 if (npages != 1) 1528 return npages; 1529 1530 /* map read fault as writable if possible */ 1531 if (unlikely(!write_fault) && writable) { 1532 struct page *wpage; 1533 1534 if (__get_user_pages_fast(addr, 1, 1, &wpage) == 1) { 1535 *writable = true; 1536 put_page(page); 1537 page = wpage; 1538 } 1539 } 1540 *pfn = page_to_pfn(page); 1541 return npages; 1542 } 1543 1544 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) 1545 { 1546 if (unlikely(!(vma->vm_flags & VM_READ))) 1547 return false; 1548 1549 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE)))) 1550 return false; 1551 1552 return true; 1553 } 1554 1555 static int hva_to_pfn_remapped(struct vm_area_struct *vma, 1556 unsigned long addr, bool *async, 1557 bool write_fault, bool *writable, 1558 kvm_pfn_t *p_pfn) 1559 { 1560 unsigned long pfn; 1561 int r; 1562 1563 r = follow_pfn(vma, addr, &pfn); 1564 if (r) { 1565 /* 1566 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does 1567 * not call the fault handler, so do it here. 1568 */ 1569 bool unlocked = false; 1570 r = fixup_user_fault(current, current->mm, addr, 1571 (write_fault ? FAULT_FLAG_WRITE : 0), 1572 &unlocked); 1573 if (unlocked) 1574 return -EAGAIN; 1575 if (r) 1576 return r; 1577 1578 r = follow_pfn(vma, addr, &pfn); 1579 if (r) 1580 return r; 1581 1582 } 1583 1584 if (writable) 1585 *writable = true; 1586 1587 /* 1588 * Get a reference here because callers of *hva_to_pfn* and 1589 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the 1590 * returned pfn. This is only needed if the VMA has VM_MIXEDMAP 1591 * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will 1592 * simply do nothing for reserved pfns. 1593 * 1594 * Whoever called remap_pfn_range is also going to call e.g. 1595 * unmap_mapping_range before the underlying pages are freed, 1596 * causing a call to our MMU notifier. 1597 */ 1598 kvm_get_pfn(pfn); 1599 1600 *p_pfn = pfn; 1601 return 0; 1602 } 1603 1604 /* 1605 * Pin guest page in memory and return its pfn. 1606 * @addr: host virtual address which maps memory to the guest 1607 * @atomic: whether this function can sleep 1608 * @async: whether this function need to wait IO complete if the 1609 * host page is not in the memory 1610 * @write_fault: whether we should get a writable host page 1611 * @writable: whether it allows to map a writable host page for !@write_fault 1612 * 1613 * The function will map a writable host page for these two cases: 1614 * 1): @write_fault = true 1615 * 2): @write_fault = false && @writable, @writable will tell the caller 1616 * whether the mapping is writable. 
 */
static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
			    bool write_fault, bool *writable)
{
	struct vm_area_struct *vma;
	kvm_pfn_t pfn = 0;
	int npages, r;

	/* we can do it either atomically or asynchronously, not both */
	BUG_ON(atomic && async);

	if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
		return pfn;

	if (atomic)
		return KVM_PFN_ERR_FAULT;

	npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
	if (npages == 1)
		return pfn;

	down_read(&current->mm->mmap_sem);
	if (npages == -EHWPOISON ||
	      (!async && check_user_page_hwpoison(addr))) {
		pfn = KVM_PFN_ERR_HWPOISON;
		goto exit;
	}

retry:
	vma = find_vma_intersection(current->mm, addr, addr + 1);

	if (vma == NULL)
		pfn = KVM_PFN_ERR_FAULT;
	else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
		r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
		if (r == -EAGAIN)
			goto retry;
		if (r < 0)
			pfn = KVM_PFN_ERR_FAULT;
	} else {
		if (async && vma_is_valid(vma, write_fault))
			*async = true;
		pfn = KVM_PFN_ERR_FAULT;
	}
exit:
	up_read(&current->mm->mmap_sem);
	return pfn;
}

kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
			       bool atomic, bool *async, bool write_fault,
			       bool *writable)
{
	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);

	if (addr == KVM_HVA_ERR_RO_BAD) {
		if (writable)
			*writable = false;
		return KVM_PFN_ERR_RO_FAULT;
	}

	if (kvm_is_error_hva(addr)) {
		if (writable)
			*writable = false;
		return KVM_PFN_NOSLOT;
	}

	/* Do not map writable pfn in the readonly memslot.
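	 * Clearing *writable and passing a NULL pointer down keeps
	 * hva_to_pfn() from trying to get a writable mapping for it.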
*/ 1685 if (writable && memslot_is_readonly(slot)) { 1686 *writable = false; 1687 writable = NULL; 1688 } 1689 1690 return hva_to_pfn(addr, atomic, async, write_fault, 1691 writable); 1692 } 1693 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); 1694 1695 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, 1696 bool *writable) 1697 { 1698 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, 1699 write_fault, writable); 1700 } 1701 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 1702 1703 kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) 1704 { 1705 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); 1706 } 1707 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); 1708 1709 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) 1710 { 1711 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); 1712 } 1713 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); 1714 1715 kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) 1716 { 1717 return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn); 1718 } 1719 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); 1720 1721 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn) 1722 { 1723 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 1724 } 1725 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic); 1726 1727 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 1728 { 1729 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); 1730 } 1731 EXPORT_SYMBOL_GPL(gfn_to_pfn); 1732 1733 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) 1734 { 1735 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 1736 } 1737 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn); 1738 1739 int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 1740 struct page **pages, int nr_pages) 1741 { 1742 unsigned long addr; 1743 gfn_t entry = 0; 1744 1745 addr = gfn_to_hva_many(slot, gfn, &entry); 1746 if (kvm_is_error_hva(addr)) 1747 return -1; 1748 1749 if (entry < nr_pages) 1750 return 0; 1751 1752 return __get_user_pages_fast(addr, nr_pages, 1, pages); 1753 } 1754 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 1755 1756 static struct page *kvm_pfn_to_page(kvm_pfn_t pfn) 1757 { 1758 if (is_error_noslot_pfn(pfn)) 1759 return KVM_ERR_PTR_BAD_PAGE; 1760 1761 if (kvm_is_reserved_pfn(pfn)) { 1762 WARN_ON(1); 1763 return KVM_ERR_PTR_BAD_PAGE; 1764 } 1765 1766 return pfn_to_page(pfn); 1767 } 1768 1769 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1770 { 1771 kvm_pfn_t pfn; 1772 1773 pfn = gfn_to_pfn(kvm, gfn); 1774 1775 return kvm_pfn_to_page(pfn); 1776 } 1777 EXPORT_SYMBOL_GPL(gfn_to_page); 1778 1779 static int __kvm_map_gfn(struct kvm_memory_slot *slot, gfn_t gfn, 1780 struct kvm_host_map *map) 1781 { 1782 kvm_pfn_t pfn; 1783 void *hva = NULL; 1784 struct page *page = KVM_UNMAPPED_PAGE; 1785 1786 if (!map) 1787 return -EINVAL; 1788 1789 pfn = gfn_to_pfn_memslot(slot, gfn); 1790 if (is_error_noslot_pfn(pfn)) 1791 return -EINVAL; 1792 1793 if (pfn_valid(pfn)) { 1794 page = pfn_to_page(pfn); 1795 hva = kmap(page); 1796 #ifdef CONFIG_HAS_IOMEM 1797 } else { 1798 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB); 1799 #endif 1800 } 1801 1802 if (!hva) 1803 return -EFAULT; 1804 1805 map->page = page; 1806 map->hva = hva; 1807 map->pfn = pfn; 1808 map->gfn = gfn; 1809 1810 return 0; 1811 } 1812 1813 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) 1814 { 1815 return __kvm_map_gfn(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, map); 1816 } 1817 
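/*
 * Illustrative usage sketch (not taken from this file; gpa, data and len
 * stand for caller-provided values): map a guest page, access it through
 * map.hva, then release it with kvm_vcpu_unmap(), passing dirty == true
 * only if the page was written:
 *
 *	struct kvm_host_map map;
 *
 *	if (!kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map)) {
 *		memcpy(map.hva + offset_in_page(gpa), data, len);
 *		kvm_vcpu_unmap(vcpu, &map, true);
 *	}
 */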
EXPORT_SYMBOL_GPL(kvm_vcpu_map); 1818 1819 void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, 1820 bool dirty) 1821 { 1822 if (!map) 1823 return; 1824 1825 if (!map->hva) 1826 return; 1827 1828 if (map->page != KVM_UNMAPPED_PAGE) 1829 kunmap(map->page); 1830 #ifdef CONFIG_HAS_IOMEM 1831 else 1832 memunmap(map->hva); 1833 #endif 1834 1835 if (dirty) { 1836 kvm_vcpu_mark_page_dirty(vcpu, map->gfn); 1837 kvm_release_pfn_dirty(map->pfn); 1838 } else { 1839 kvm_release_pfn_clean(map->pfn); 1840 } 1841 1842 map->hva = NULL; 1843 map->page = NULL; 1844 } 1845 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); 1846 1847 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn) 1848 { 1849 kvm_pfn_t pfn; 1850 1851 pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn); 1852 1853 return kvm_pfn_to_page(pfn); 1854 } 1855 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page); 1856 1857 void kvm_release_page_clean(struct page *page) 1858 { 1859 WARN_ON(is_error_page(page)); 1860 1861 kvm_release_pfn_clean(page_to_pfn(page)); 1862 } 1863 EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1864 1865 void kvm_release_pfn_clean(kvm_pfn_t pfn) 1866 { 1867 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) 1868 put_page(pfn_to_page(pfn)); 1869 } 1870 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 1871 1872 void kvm_release_page_dirty(struct page *page) 1873 { 1874 WARN_ON(is_error_page(page)); 1875 1876 kvm_release_pfn_dirty(page_to_pfn(page)); 1877 } 1878 EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1879 1880 void kvm_release_pfn_dirty(kvm_pfn_t pfn) 1881 { 1882 kvm_set_pfn_dirty(pfn); 1883 kvm_release_pfn_clean(pfn); 1884 } 1885 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); 1886 1887 void kvm_set_pfn_dirty(kvm_pfn_t pfn) 1888 { 1889 if (!kvm_is_reserved_pfn(pfn)) { 1890 struct page *page = pfn_to_page(pfn); 1891 1892 SetPageDirty(page); 1893 } 1894 } 1895 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 1896 1897 void kvm_set_pfn_accessed(kvm_pfn_t pfn) 1898 { 1899 if (!kvm_is_reserved_pfn(pfn)) 1900 mark_page_accessed(pfn_to_page(pfn)); 1901 } 1902 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 1903 1904 void kvm_get_pfn(kvm_pfn_t pfn) 1905 { 1906 if (!kvm_is_reserved_pfn(pfn)) 1907 get_page(pfn_to_page(pfn)); 1908 } 1909 EXPORT_SYMBOL_GPL(kvm_get_pfn); 1910 1911 static int next_segment(unsigned long len, int offset) 1912 { 1913 if (len > PAGE_SIZE - offset) 1914 return PAGE_SIZE - offset; 1915 else 1916 return len; 1917 } 1918 1919 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, 1920 void *data, int offset, int len) 1921 { 1922 int r; 1923 unsigned long addr; 1924 1925 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 1926 if (kvm_is_error_hva(addr)) 1927 return -EFAULT; 1928 r = __copy_from_user(data, (void __user *)addr + offset, len); 1929 if (r) 1930 return -EFAULT; 1931 return 0; 1932 } 1933 1934 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 1935 int len) 1936 { 1937 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 1938 1939 return __kvm_read_guest_page(slot, gfn, data, offset, len); 1940 } 1941 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 1942 1943 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, 1944 int offset, int len) 1945 { 1946 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1947 1948 return __kvm_read_guest_page(slot, gfn, data, offset, len); 1949 } 1950 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page); 1951 1952 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) 1953 { 1954 gfn_t gfn = gpa >> PAGE_SHIFT; 1955 int 
seg; 1956 int offset = offset_in_page(gpa); 1957 int ret; 1958 1959 while ((seg = next_segment(len, offset)) != 0) { 1960 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 1961 if (ret < 0) 1962 return ret; 1963 offset = 0; 1964 len -= seg; 1965 data += seg; 1966 ++gfn; 1967 } 1968 return 0; 1969 } 1970 EXPORT_SYMBOL_GPL(kvm_read_guest); 1971 1972 int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) 1973 { 1974 gfn_t gfn = gpa >> PAGE_SHIFT; 1975 int seg; 1976 int offset = offset_in_page(gpa); 1977 int ret; 1978 1979 while ((seg = next_segment(len, offset)) != 0) { 1980 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg); 1981 if (ret < 0) 1982 return ret; 1983 offset = 0; 1984 len -= seg; 1985 data += seg; 1986 ++gfn; 1987 } 1988 return 0; 1989 } 1990 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest); 1991 1992 static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 1993 void *data, int offset, unsigned long len) 1994 { 1995 int r; 1996 unsigned long addr; 1997 1998 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 1999 if (kvm_is_error_hva(addr)) 2000 return -EFAULT; 2001 pagefault_disable(); 2002 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 2003 pagefault_enable(); 2004 if (r) 2005 return -EFAULT; 2006 return 0; 2007 } 2008 2009 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 2010 unsigned long len) 2011 { 2012 gfn_t gfn = gpa >> PAGE_SHIFT; 2013 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 2014 int offset = offset_in_page(gpa); 2015 2016 return __kvm_read_guest_atomic(slot, gfn, data, offset, len); 2017 } 2018 EXPORT_SYMBOL_GPL(kvm_read_guest_atomic); 2019 2020 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, 2021 void *data, unsigned long len) 2022 { 2023 gfn_t gfn = gpa >> PAGE_SHIFT; 2024 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2025 int offset = offset_in_page(gpa); 2026 2027 return __kvm_read_guest_atomic(slot, gfn, data, offset, len); 2028 } 2029 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); 2030 2031 static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn, 2032 const void *data, int offset, int len) 2033 { 2034 int r; 2035 unsigned long addr; 2036 2037 addr = gfn_to_hva_memslot(memslot, gfn); 2038 if (kvm_is_error_hva(addr)) 2039 return -EFAULT; 2040 r = __copy_to_user((void __user *)addr + offset, data, len); 2041 if (r) 2042 return -EFAULT; 2043 mark_page_dirty_in_slot(memslot, gfn); 2044 return 0; 2045 } 2046 2047 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, 2048 const void *data, int offset, int len) 2049 { 2050 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 2051 2052 return __kvm_write_guest_page(slot, gfn, data, offset, len); 2053 } 2054 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 2055 2056 int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, 2057 const void *data, int offset, int len) 2058 { 2059 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2060 2061 return __kvm_write_guest_page(slot, gfn, data, offset, len); 2062 } 2063 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page); 2064 2065 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 2066 unsigned long len) 2067 { 2068 gfn_t gfn = gpa >> PAGE_SHIFT; 2069 int seg; 2070 int offset = offset_in_page(gpa); 2071 int ret; 2072 2073 while ((seg = next_segment(len, offset)) != 0) { 2074 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 2075 if (ret < 0) 2076 return ret; 2077 offset = 0; 2078 
len -= seg; 2079 data += seg; 2080 ++gfn; 2081 } 2082 return 0; 2083 } 2084 EXPORT_SYMBOL_GPL(kvm_write_guest); 2085 2086 int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, 2087 unsigned long len) 2088 { 2089 gfn_t gfn = gpa >> PAGE_SHIFT; 2090 int seg; 2091 int offset = offset_in_page(gpa); 2092 int ret; 2093 2094 while ((seg = next_segment(len, offset)) != 0) { 2095 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg); 2096 if (ret < 0) 2097 return ret; 2098 offset = 0; 2099 len -= seg; 2100 data += seg; 2101 ++gfn; 2102 } 2103 return 0; 2104 } 2105 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); 2106 2107 static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, 2108 struct gfn_to_hva_cache *ghc, 2109 gpa_t gpa, unsigned long len) 2110 { 2111 int offset = offset_in_page(gpa); 2112 gfn_t start_gfn = gpa >> PAGE_SHIFT; 2113 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; 2114 gfn_t nr_pages_needed = end_gfn - start_gfn + 1; 2115 gfn_t nr_pages_avail; 2116 int r = start_gfn <= end_gfn ? 0 : -EINVAL; 2117 2118 ghc->gpa = gpa; 2119 ghc->generation = slots->generation; 2120 ghc->len = len; 2121 ghc->hva = KVM_HVA_ERR_BAD; 2122 2123 /* 2124 * If the requested region crosses two memslots, we still 2125 * verify that the entire region is valid here. 2126 */ 2127 while (!r && start_gfn <= end_gfn) { 2128 ghc->memslot = __gfn_to_memslot(slots, start_gfn); 2129 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, 2130 &nr_pages_avail); 2131 if (kvm_is_error_hva(ghc->hva)) 2132 r = -EFAULT; 2133 start_gfn += nr_pages_avail; 2134 } 2135 2136 /* Use the slow path for cross page reads and writes. */ 2137 if (!r && nr_pages_needed == 1) 2138 ghc->hva += offset; 2139 else 2140 ghc->memslot = NULL; 2141 2142 return r; 2143 } 2144 2145 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2146 gpa_t gpa, unsigned long len) 2147 { 2148 struct kvm_memslots *slots = kvm_memslots(kvm); 2149 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); 2150 } 2151 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); 2152 2153 int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2154 void *data, unsigned int offset, 2155 unsigned long len) 2156 { 2157 struct kvm_memslots *slots = kvm_memslots(kvm); 2158 int r; 2159 gpa_t gpa = ghc->gpa + offset; 2160 2161 BUG_ON(len + offset > ghc->len); 2162 2163 if (slots->generation != ghc->generation) 2164 __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); 2165 2166 if (unlikely(!ghc->memslot)) 2167 return kvm_write_guest(kvm, gpa, data, len); 2168 2169 if (kvm_is_error_hva(ghc->hva)) 2170 return -EFAULT; 2171 2172 r = __copy_to_user((void __user *)ghc->hva + offset, data, len); 2173 if (r) 2174 return -EFAULT; 2175 mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT); 2176 2177 return 0; 2178 } 2179 EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); 2180 2181 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2182 void *data, unsigned long len) 2183 { 2184 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); 2185 } 2186 EXPORT_SYMBOL_GPL(kvm_write_guest_cached); 2187 2188 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2189 void *data, unsigned long len) 2190 { 2191 struct kvm_memslots *slots = kvm_memslots(kvm); 2192 int r; 2193 2194 BUG_ON(len > ghc->len); 2195 2196 if (slots->generation != ghc->generation) 2197 __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); 2198 2199 if (unlikely(!ghc->memslot)) 2200 return 
kvm_read_guest(kvm, ghc->gpa, data, len); 2201 2202 if (kvm_is_error_hva(ghc->hva)) 2203 return -EFAULT; 2204 2205 r = __copy_from_user(data, (void __user *)ghc->hva, len); 2206 if (r) 2207 return -EFAULT; 2208 2209 return 0; 2210 } 2211 EXPORT_SYMBOL_GPL(kvm_read_guest_cached); 2212 2213 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 2214 { 2215 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 2216 2217 return kvm_write_guest_page(kvm, gfn, zero_page, offset, len); 2218 } 2219 EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 2220 2221 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 2222 { 2223 gfn_t gfn = gpa >> PAGE_SHIFT; 2224 int seg; 2225 int offset = offset_in_page(gpa); 2226 int ret; 2227 2228 while ((seg = next_segment(len, offset)) != 0) { 2229 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 2230 if (ret < 0) 2231 return ret; 2232 offset = 0; 2233 len -= seg; 2234 ++gfn; 2235 } 2236 return 0; 2237 } 2238 EXPORT_SYMBOL_GPL(kvm_clear_guest); 2239 2240 static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, 2241 gfn_t gfn) 2242 { 2243 if (memslot && memslot->dirty_bitmap) { 2244 unsigned long rel_gfn = gfn - memslot->base_gfn; 2245 2246 set_bit_le(rel_gfn, memslot->dirty_bitmap); 2247 } 2248 } 2249 2250 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 2251 { 2252 struct kvm_memory_slot *memslot; 2253 2254 memslot = gfn_to_memslot(kvm, gfn); 2255 mark_page_dirty_in_slot(memslot, gfn); 2256 } 2257 EXPORT_SYMBOL_GPL(mark_page_dirty); 2258 2259 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) 2260 { 2261 struct kvm_memory_slot *memslot; 2262 2263 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2264 mark_page_dirty_in_slot(memslot, gfn); 2265 } 2266 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); 2267 2268 void kvm_sigset_activate(struct kvm_vcpu *vcpu) 2269 { 2270 if (!vcpu->sigset_active) 2271 return; 2272 2273 /* 2274 * This does a lockless modification of ->real_blocked, which is fine 2275 * because, only current can change ->real_blocked and all readers of 2276 * ->real_blocked don't care as long ->real_blocked is always a subset 2277 * of ->blocked. 
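 * kvm_sigset_deactivate() below restores the previous mask from ->real_blocked.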
2278 */ 2279 sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked); 2280 } 2281 2282 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu) 2283 { 2284 if (!vcpu->sigset_active) 2285 return; 2286 2287 sigprocmask(SIG_SETMASK, &current->real_blocked, NULL); 2288 sigemptyset(&current->real_blocked); 2289 } 2290 2291 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) 2292 { 2293 unsigned int old, val, grow, grow_start; 2294 2295 old = val = vcpu->halt_poll_ns; 2296 grow_start = READ_ONCE(halt_poll_ns_grow_start); 2297 grow = READ_ONCE(halt_poll_ns_grow); 2298 if (!grow) 2299 goto out; 2300 2301 val *= grow; 2302 if (val < grow_start) 2303 val = grow_start; 2304 2305 if (val > halt_poll_ns) 2306 val = halt_poll_ns; 2307 2308 vcpu->halt_poll_ns = val; 2309 out: 2310 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old); 2311 } 2312 2313 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu) 2314 { 2315 unsigned int old, val, shrink; 2316 2317 old = val = vcpu->halt_poll_ns; 2318 shrink = READ_ONCE(halt_poll_ns_shrink); 2319 if (shrink == 0) 2320 val = 0; 2321 else 2322 val /= shrink; 2323 2324 vcpu->halt_poll_ns = val; 2325 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old); 2326 } 2327 2328 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) 2329 { 2330 int ret = -EINTR; 2331 int idx = srcu_read_lock(&vcpu->kvm->srcu); 2332 2333 if (kvm_arch_vcpu_runnable(vcpu)) { 2334 kvm_make_request(KVM_REQ_UNHALT, vcpu); 2335 goto out; 2336 } 2337 if (kvm_cpu_has_pending_timer(vcpu)) 2338 goto out; 2339 if (signal_pending(current)) 2340 goto out; 2341 2342 ret = 0; 2343 out: 2344 srcu_read_unlock(&vcpu->kvm->srcu, idx); 2345 return ret; 2346 } 2347 2348 /* 2349 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 2350 */ 2351 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 2352 { 2353 ktime_t start, cur; 2354 DECLARE_SWAITQUEUE(wait); 2355 bool waited = false; 2356 u64 block_ns; 2357 2358 kvm_arch_vcpu_blocking(vcpu); 2359 2360 start = cur = ktime_get(); 2361 if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) { 2362 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns); 2363 2364 ++vcpu->stat.halt_attempted_poll; 2365 do { 2366 /* 2367 * This sets KVM_REQ_UNHALT if an interrupt 2368 * arrives.
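 * A negative return from kvm_vcpu_check_block() ends the poll early and is counted as a successful poll.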
2369 */ 2370 if (kvm_vcpu_check_block(vcpu) < 0) { 2371 ++vcpu->stat.halt_successful_poll; 2372 if (!vcpu_valid_wakeup(vcpu)) 2373 ++vcpu->stat.halt_poll_invalid; 2374 goto out; 2375 } 2376 cur = ktime_get(); 2377 } while (single_task_running() && ktime_before(cur, stop)); 2378 } 2379 2380 for (;;) { 2381 prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 2382 2383 if (kvm_vcpu_check_block(vcpu) < 0) 2384 break; 2385 2386 waited = true; 2387 schedule(); 2388 } 2389 2390 finish_swait(&vcpu->wq, &wait); 2391 cur = ktime_get(); 2392 out: 2393 kvm_arch_vcpu_unblocking(vcpu); 2394 block_ns = ktime_to_ns(cur) - ktime_to_ns(start); 2395 2396 if (!kvm_arch_no_poll(vcpu)) { 2397 if (!vcpu_valid_wakeup(vcpu)) { 2398 shrink_halt_poll_ns(vcpu); 2399 } else if (halt_poll_ns) { 2400 if (block_ns <= vcpu->halt_poll_ns) 2401 ; 2402 /* we had a long block, shrink polling */ 2403 else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns) 2404 shrink_halt_poll_ns(vcpu); 2405 /* we had a short halt and our poll time is too small */ 2406 else if (vcpu->halt_poll_ns < halt_poll_ns && 2407 block_ns < halt_poll_ns) 2408 grow_halt_poll_ns(vcpu); 2409 } else { 2410 vcpu->halt_poll_ns = 0; 2411 } 2412 } 2413 2414 trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu)); 2415 kvm_arch_vcpu_block_finish(vcpu); 2416 } 2417 EXPORT_SYMBOL_GPL(kvm_vcpu_block); 2418 2419 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) 2420 { 2421 struct swait_queue_head *wqp; 2422 2423 wqp = kvm_arch_vcpu_wq(vcpu); 2424 if (swq_has_sleeper(wqp)) { 2425 swake_up_one(wqp); 2426 WRITE_ONCE(vcpu->ready, true); 2427 ++vcpu->stat.halt_wakeup; 2428 return true; 2429 } 2430 2431 return false; 2432 } 2433 EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); 2434 2435 #ifndef CONFIG_S390 2436 /* 2437 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. 2438 */ 2439 void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 2440 { 2441 int me; 2442 int cpu = vcpu->cpu; 2443 2444 if (kvm_vcpu_wake_up(vcpu)) 2445 return; 2446 2447 me = get_cpu(); 2448 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 2449 if (kvm_arch_vcpu_should_kick(vcpu)) 2450 smp_send_reschedule(cpu); 2451 put_cpu(); 2452 } 2453 EXPORT_SYMBOL_GPL(kvm_vcpu_kick); 2454 #endif /* !CONFIG_S390 */ 2455 2456 int kvm_vcpu_yield_to(struct kvm_vcpu *target) 2457 { 2458 struct pid *pid; 2459 struct task_struct *task = NULL; 2460 int ret = 0; 2461 2462 rcu_read_lock(); 2463 pid = rcu_dereference(target->pid); 2464 if (pid) 2465 task = get_pid_task(pid, PIDTYPE_PID); 2466 rcu_read_unlock(); 2467 if (!task) 2468 return ret; 2469 ret = yield_to(task, 1); 2470 put_task_struct(task); 2471 2472 return ret; 2473 } 2474 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); 2475 2476 /* 2477 * Helper that checks whether a VCPU is eligible for directed yield. 2478 * Most eligible candidate to yield is decided by the following heuristics: 2479 * 2480 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently 2481 * (preempted lock holder), indicated by @in_spin_loop. 2482 * Set at the beginning and cleared at the end of interception/PLE handler. 2483 * 2484 * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get a 2485 * chance last time (mostly it has become eligible now since we have probably 2486 * yielded to lockholder in last iteration. This is done by toggling 2487 * @dy_eligible each time a VCPU is checked for eligibility.)
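 * For example, a VCPU found with in_spin_loop == true and dy_eligible == false is skipped once; its dy_eligible is toggled to true, so it becomes eligible on the next check.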
2488 * 2489 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding 2490 * to preempted lock-holder could result in wrong VCPU selection and CPU 2491 * burning. Giving priority for a potential lock-holder increases lock 2492 * progress. 2493 * 2494 * Since algorithm is based on heuristics, accessing another VCPU data without 2495 * locking does not harm. It may result in trying to yield to same VCPU, fail 2496 * and continue with next VCPU and so on. 2497 */ 2498 static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) 2499 { 2500 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT 2501 bool eligible; 2502 2503 eligible = !vcpu->spin_loop.in_spin_loop || 2504 vcpu->spin_loop.dy_eligible; 2505 2506 if (vcpu->spin_loop.in_spin_loop) 2507 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); 2508 2509 return eligible; 2510 #else 2511 return true; 2512 #endif 2513 } 2514 2515 /* 2516 * Unlike kvm_arch_vcpu_runnable, this function is called outside 2517 * a vcpu_load/vcpu_put pair. However, for most architectures 2518 * kvm_arch_vcpu_runnable does not require vcpu_load. 2519 */ 2520 bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) 2521 { 2522 return kvm_arch_vcpu_runnable(vcpu); 2523 } 2524 2525 static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu) 2526 { 2527 if (kvm_arch_dy_runnable(vcpu)) 2528 return true; 2529 2530 #ifdef CONFIG_KVM_ASYNC_PF 2531 if (!list_empty_careful(&vcpu->async_pf.done)) 2532 return true; 2533 #endif 2534 2535 return false; 2536 } 2537 2538 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) 2539 { 2540 struct kvm *kvm = me->kvm; 2541 struct kvm_vcpu *vcpu; 2542 int last_boosted_vcpu = me->kvm->last_boosted_vcpu; 2543 int yielded = 0; 2544 int try = 3; 2545 int pass; 2546 int i; 2547 2548 kvm_vcpu_set_in_spin_loop(me, true); 2549 /* 2550 * We boost the priority of a VCPU that is runnable but not 2551 * currently running, because it got preempted by something 2552 * else and called schedule in __vcpu_run. Hopefully that 2553 * VCPU is holding the lock that we need and will release it. 2554 * We approximate round-robin by starting at the last boosted VCPU. 
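 * The loop below makes two passes: the first covers the vcpus after last_boosted_vcpu, the second wraps around to the ones before it.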
2555 */ 2556 for (pass = 0; pass < 2 && !yielded && try; pass++) { 2557 kvm_for_each_vcpu(i, vcpu, kvm) { 2558 if (!pass && i <= last_boosted_vcpu) { 2559 i = last_boosted_vcpu; 2560 continue; 2561 } else if (pass && i > last_boosted_vcpu) 2562 break; 2563 if (!READ_ONCE(vcpu->ready)) 2564 continue; 2565 if (vcpu == me) 2566 continue; 2567 if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu)) 2568 continue; 2569 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && 2570 !kvm_arch_vcpu_in_kernel(vcpu)) 2571 continue; 2572 if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) 2573 continue; 2574 2575 yielded = kvm_vcpu_yield_to(vcpu); 2576 if (yielded > 0) { 2577 kvm->last_boosted_vcpu = i; 2578 break; 2579 } else if (yielded < 0) { 2580 try--; 2581 if (!try) 2582 break; 2583 } 2584 } 2585 } 2586 kvm_vcpu_set_in_spin_loop(me, false); 2587 2588 /* Ensure vcpu is not eligible during next spinloop */ 2589 kvm_vcpu_set_dy_eligible(me, false); 2590 } 2591 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); 2592 2593 static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf) 2594 { 2595 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data; 2596 struct page *page; 2597 2598 if (vmf->pgoff == 0) 2599 page = virt_to_page(vcpu->run); 2600 #ifdef CONFIG_X86 2601 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 2602 page = virt_to_page(vcpu->arch.pio_data); 2603 #endif 2604 #ifdef CONFIG_KVM_MMIO 2605 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 2606 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 2607 #endif 2608 else 2609 return kvm_arch_vcpu_fault(vcpu, vmf); 2610 get_page(page); 2611 vmf->page = page; 2612 return 0; 2613 } 2614 2615 static const struct vm_operations_struct kvm_vcpu_vm_ops = { 2616 .fault = kvm_vcpu_fault, 2617 }; 2618 2619 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 2620 { 2621 vma->vm_ops = &kvm_vcpu_vm_ops; 2622 return 0; 2623 } 2624 2625 static int kvm_vcpu_release(struct inode *inode, struct file *filp) 2626 { 2627 struct kvm_vcpu *vcpu = filp->private_data; 2628 2629 debugfs_remove_recursive(vcpu->debugfs_dentry); 2630 kvm_put_kvm(vcpu->kvm); 2631 return 0; 2632 } 2633 2634 static struct file_operations kvm_vcpu_fops = { 2635 .release = kvm_vcpu_release, 2636 .unlocked_ioctl = kvm_vcpu_ioctl, 2637 .mmap = kvm_vcpu_mmap, 2638 .llseek = noop_llseek, 2639 KVM_COMPAT(kvm_vcpu_compat_ioctl), 2640 }; 2641 2642 /* 2643 * Allocates an inode for the vcpu. 2644 */ 2645 static int create_vcpu_fd(struct kvm_vcpu *vcpu) 2646 { 2647 char name[8 + 1 + ITOA_MAX_LEN + 1]; 2648 2649 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id); 2650 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); 2651 } 2652 2653 static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) 2654 { 2655 #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS 2656 char dir_name[ITOA_MAX_LEN * 2]; 2657 2658 if (!debugfs_initialized()) 2659 return; 2660 2661 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id); 2662 vcpu->debugfs_dentry = debugfs_create_dir(dir_name, 2663 vcpu->kvm->debugfs_dentry); 2664 2665 kvm_arch_create_vcpu_debugfs(vcpu); 2666 #endif 2667 } 2668 2669 /* 2670 * Creates some virtual cpus. Good luck creating more than one. 
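 * The requested id must be below KVM_MAX_VCPU_ID, and at most KVM_MAX_VCPUS vcpus can be created per VM.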
2671 */ 2672 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) 2673 { 2674 int r; 2675 struct kvm_vcpu *vcpu; 2676 2677 if (id >= KVM_MAX_VCPU_ID) 2678 return -EINVAL; 2679 2680 mutex_lock(&kvm->lock); 2681 if (kvm->created_vcpus == KVM_MAX_VCPUS) { 2682 mutex_unlock(&kvm->lock); 2683 return -EINVAL; 2684 } 2685 2686 kvm->created_vcpus++; 2687 mutex_unlock(&kvm->lock); 2688 2689 vcpu = kvm_arch_vcpu_create(kvm, id); 2690 if (IS_ERR(vcpu)) { 2691 r = PTR_ERR(vcpu); 2692 goto vcpu_decrement; 2693 } 2694 2695 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 2696 2697 r = kvm_arch_vcpu_setup(vcpu); 2698 if (r) 2699 goto vcpu_destroy; 2700 2701 kvm_create_vcpu_debugfs(vcpu); 2702 2703 mutex_lock(&kvm->lock); 2704 if (kvm_get_vcpu_by_id(kvm, id)) { 2705 r = -EEXIST; 2706 goto unlock_vcpu_destroy; 2707 } 2708 2709 BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); 2710 2711 /* Now it's all set up, let userspace reach it */ 2712 kvm_get_kvm(kvm); 2713 r = create_vcpu_fd(vcpu); 2714 if (r < 0) { 2715 kvm_put_kvm(kvm); 2716 goto unlock_vcpu_destroy; 2717 } 2718 2719 kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; 2720 2721 /* 2722 * Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus 2723 * before kvm->online_vcpu's incremented value. 2724 */ 2725 smp_wmb(); 2726 atomic_inc(&kvm->online_vcpus); 2727 2728 mutex_unlock(&kvm->lock); 2729 kvm_arch_vcpu_postcreate(vcpu); 2730 return r; 2731 2732 unlock_vcpu_destroy: 2733 mutex_unlock(&kvm->lock); 2734 debugfs_remove_recursive(vcpu->debugfs_dentry); 2735 vcpu_destroy: 2736 kvm_arch_vcpu_destroy(vcpu); 2737 vcpu_decrement: 2738 mutex_lock(&kvm->lock); 2739 kvm->created_vcpus--; 2740 mutex_unlock(&kvm->lock); 2741 return r; 2742 } 2743 2744 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 2745 { 2746 if (sigset) { 2747 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 2748 vcpu->sigset_active = 1; 2749 vcpu->sigset = *sigset; 2750 } else 2751 vcpu->sigset_active = 0; 2752 return 0; 2753 } 2754 2755 static long kvm_vcpu_ioctl(struct file *filp, 2756 unsigned int ioctl, unsigned long arg) 2757 { 2758 struct kvm_vcpu *vcpu = filp->private_data; 2759 void __user *argp = (void __user *)arg; 2760 int r; 2761 struct kvm_fpu *fpu = NULL; 2762 struct kvm_sregs *kvm_sregs = NULL; 2763 2764 if (vcpu->kvm->mm != current->mm) 2765 return -EIO; 2766 2767 if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) 2768 return -EINVAL; 2769 2770 /* 2771 * Some architectures have vcpu ioctls that are asynchronous to vcpu 2772 * execution; mutex_lock() would break them. 2773 */ 2774 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg); 2775 if (r != -ENOIOCTLCMD) 2776 return r; 2777 2778 if (mutex_lock_killable(&vcpu->mutex)) 2779 return -EINTR; 2780 switch (ioctl) { 2781 case KVM_RUN: { 2782 struct pid *oldpid; 2783 r = -EINVAL; 2784 if (arg) 2785 goto out; 2786 oldpid = rcu_access_pointer(vcpu->pid); 2787 if (unlikely(oldpid != task_pid(current))) { 2788 /* The thread running this VCPU changed. 
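Record the new pid so that kvm_vcpu_yield_to() can find the task that now runs this VCPU.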
*/ 2789 struct pid *newpid; 2790 2791 r = kvm_arch_vcpu_run_pid_change(vcpu); 2792 if (r) 2793 break; 2794 2795 newpid = get_task_pid(current, PIDTYPE_PID); 2796 rcu_assign_pointer(vcpu->pid, newpid); 2797 if (oldpid) 2798 synchronize_rcu(); 2799 put_pid(oldpid); 2800 } 2801 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 2802 trace_kvm_userspace_exit(vcpu->run->exit_reason, r); 2803 break; 2804 } 2805 case KVM_GET_REGS: { 2806 struct kvm_regs *kvm_regs; 2807 2808 r = -ENOMEM; 2809 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT); 2810 if (!kvm_regs) 2811 goto out; 2812 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 2813 if (r) 2814 goto out_free1; 2815 r = -EFAULT; 2816 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 2817 goto out_free1; 2818 r = 0; 2819 out_free1: 2820 kfree(kvm_regs); 2821 break; 2822 } 2823 case KVM_SET_REGS: { 2824 struct kvm_regs *kvm_regs; 2825 2826 r = -ENOMEM; 2827 kvm_regs = memdup_user(argp, sizeof(*kvm_regs)); 2828 if (IS_ERR(kvm_regs)) { 2829 r = PTR_ERR(kvm_regs); 2830 goto out; 2831 } 2832 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 2833 kfree(kvm_regs); 2834 break; 2835 } 2836 case KVM_GET_SREGS: { 2837 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), 2838 GFP_KERNEL_ACCOUNT); 2839 r = -ENOMEM; 2840 if (!kvm_sregs) 2841 goto out; 2842 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 2843 if (r) 2844 goto out; 2845 r = -EFAULT; 2846 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 2847 goto out; 2848 r = 0; 2849 break; 2850 } 2851 case KVM_SET_SREGS: { 2852 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); 2853 if (IS_ERR(kvm_sregs)) { 2854 r = PTR_ERR(kvm_sregs); 2855 kvm_sregs = NULL; 2856 goto out; 2857 } 2858 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 2859 break; 2860 } 2861 case KVM_GET_MP_STATE: { 2862 struct kvm_mp_state mp_state; 2863 2864 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 2865 if (r) 2866 goto out; 2867 r = -EFAULT; 2868 if (copy_to_user(argp, &mp_state, sizeof(mp_state))) 2869 goto out; 2870 r = 0; 2871 break; 2872 } 2873 case KVM_SET_MP_STATE: { 2874 struct kvm_mp_state mp_state; 2875 2876 r = -EFAULT; 2877 if (copy_from_user(&mp_state, argp, sizeof(mp_state))) 2878 goto out; 2879 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 2880 break; 2881 } 2882 case KVM_TRANSLATE: { 2883 struct kvm_translation tr; 2884 2885 r = -EFAULT; 2886 if (copy_from_user(&tr, argp, sizeof(tr))) 2887 goto out; 2888 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 2889 if (r) 2890 goto out; 2891 r = -EFAULT; 2892 if (copy_to_user(argp, &tr, sizeof(tr))) 2893 goto out; 2894 r = 0; 2895 break; 2896 } 2897 case KVM_SET_GUEST_DEBUG: { 2898 struct kvm_guest_debug dbg; 2899 2900 r = -EFAULT; 2901 if (copy_from_user(&dbg, argp, sizeof(dbg))) 2902 goto out; 2903 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 2904 break; 2905 } 2906 case KVM_SET_SIGNAL_MASK: { 2907 struct kvm_signal_mask __user *sigmask_arg = argp; 2908 struct kvm_signal_mask kvm_sigmask; 2909 sigset_t sigset, *p; 2910 2911 p = NULL; 2912 if (argp) { 2913 r = -EFAULT; 2914 if (copy_from_user(&kvm_sigmask, argp, 2915 sizeof(kvm_sigmask))) 2916 goto out; 2917 r = -EINVAL; 2918 if (kvm_sigmask.len != sizeof(sigset)) 2919 goto out; 2920 r = -EFAULT; 2921 if (copy_from_user(&sigset, sigmask_arg->sigset, 2922 sizeof(sigset))) 2923 goto out; 2924 p = &sigset; 2925 } 2926 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); 2927 break; 2928 } 2929 case KVM_GET_FPU: { 2930 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT); 2931 r = -ENOMEM; 
2932 if (!fpu) 2933 goto out; 2934 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); 2935 if (r) 2936 goto out; 2937 r = -EFAULT; 2938 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) 2939 goto out; 2940 r = 0; 2941 break; 2942 } 2943 case KVM_SET_FPU: { 2944 fpu = memdup_user(argp, sizeof(*fpu)); 2945 if (IS_ERR(fpu)) { 2946 r = PTR_ERR(fpu); 2947 fpu = NULL; 2948 goto out; 2949 } 2950 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 2951 break; 2952 } 2953 default: 2954 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 2955 } 2956 out: 2957 mutex_unlock(&vcpu->mutex); 2958 kfree(fpu); 2959 kfree(kvm_sregs); 2960 return r; 2961 } 2962 2963 #ifdef CONFIG_KVM_COMPAT 2964 static long kvm_vcpu_compat_ioctl(struct file *filp, 2965 unsigned int ioctl, unsigned long arg) 2966 { 2967 struct kvm_vcpu *vcpu = filp->private_data; 2968 void __user *argp = compat_ptr(arg); 2969 int r; 2970 2971 if (vcpu->kvm->mm != current->mm) 2972 return -EIO; 2973 2974 switch (ioctl) { 2975 case KVM_SET_SIGNAL_MASK: { 2976 struct kvm_signal_mask __user *sigmask_arg = argp; 2977 struct kvm_signal_mask kvm_sigmask; 2978 sigset_t sigset; 2979 2980 if (argp) { 2981 r = -EFAULT; 2982 if (copy_from_user(&kvm_sigmask, argp, 2983 sizeof(kvm_sigmask))) 2984 goto out; 2985 r = -EINVAL; 2986 if (kvm_sigmask.len != sizeof(compat_sigset_t)) 2987 goto out; 2988 r = -EFAULT; 2989 if (get_compat_sigset(&sigset, (void *)sigmask_arg->sigset)) 2990 goto out; 2991 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 2992 } else 2993 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL); 2994 break; 2995 } 2996 default: 2997 r = kvm_vcpu_ioctl(filp, ioctl, arg); 2998 } 2999 3000 out: 3001 return r; 3002 } 3003 #endif 3004 3005 static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma) 3006 { 3007 struct kvm_device *dev = filp->private_data; 3008 3009 if (dev->ops->mmap) 3010 return dev->ops->mmap(dev, vma); 3011 3012 return -ENODEV; 3013 } 3014 3015 static int kvm_device_ioctl_attr(struct kvm_device *dev, 3016 int (*accessor)(struct kvm_device *dev, 3017 struct kvm_device_attr *attr), 3018 unsigned long arg) 3019 { 3020 struct kvm_device_attr attr; 3021 3022 if (!accessor) 3023 return -EPERM; 3024 3025 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) 3026 return -EFAULT; 3027 3028 return accessor(dev, &attr); 3029 } 3030 3031 static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, 3032 unsigned long arg) 3033 { 3034 struct kvm_device *dev = filp->private_data; 3035 3036 if (dev->kvm->mm != current->mm) 3037 return -EIO; 3038 3039 switch (ioctl) { 3040 case KVM_SET_DEVICE_ATTR: 3041 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg); 3042 case KVM_GET_DEVICE_ATTR: 3043 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg); 3044 case KVM_HAS_DEVICE_ATTR: 3045 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg); 3046 default: 3047 if (dev->ops->ioctl) 3048 return dev->ops->ioctl(dev, ioctl, arg); 3049 3050 return -ENOTTY; 3051 } 3052 } 3053 3054 static int kvm_device_release(struct inode *inode, struct file *filp) 3055 { 3056 struct kvm_device *dev = filp->private_data; 3057 struct kvm *kvm = dev->kvm; 3058 3059 if (dev->ops->release) { 3060 mutex_lock(&kvm->lock); 3061 list_del(&dev->vm_node); 3062 dev->ops->release(dev); 3063 mutex_unlock(&kvm->lock); 3064 } 3065 3066 kvm_put_kvm(kvm); 3067 return 0; 3068 } 3069 3070 static const struct file_operations kvm_device_fops = { 3071 .unlocked_ioctl = kvm_device_ioctl, 3072 .release = kvm_device_release, 3073 KVM_COMPAT(kvm_device_ioctl), 3074 .mmap = 
kvm_device_mmap, 3075 }; 3076 3077 struct kvm_device *kvm_device_from_filp(struct file *filp) 3078 { 3079 if (filp->f_op != &kvm_device_fops) 3080 return NULL; 3081 3082 return filp->private_data; 3083 } 3084 3085 static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { 3086 #ifdef CONFIG_KVM_MPIC 3087 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, 3088 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, 3089 #endif 3090 }; 3091 3092 int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) 3093 { 3094 if (type >= ARRAY_SIZE(kvm_device_ops_table)) 3095 return -ENOSPC; 3096 3097 if (kvm_device_ops_table[type] != NULL) 3098 return -EEXIST; 3099 3100 kvm_device_ops_table[type] = ops; 3101 return 0; 3102 } 3103 3104 void kvm_unregister_device_ops(u32 type) 3105 { 3106 if (kvm_device_ops_table[type] != NULL) 3107 kvm_device_ops_table[type] = NULL; 3108 } 3109 3110 static int kvm_ioctl_create_device(struct kvm *kvm, 3111 struct kvm_create_device *cd) 3112 { 3113 struct kvm_device_ops *ops = NULL; 3114 struct kvm_device *dev; 3115 bool test = cd->flags & KVM_CREATE_DEVICE_TEST; 3116 int type; 3117 int ret; 3118 3119 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) 3120 return -ENODEV; 3121 3122 type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table)); 3123 ops = kvm_device_ops_table[type]; 3124 if (ops == NULL) 3125 return -ENODEV; 3126 3127 if (test) 3128 return 0; 3129 3130 dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT); 3131 if (!dev) 3132 return -ENOMEM; 3133 3134 dev->ops = ops; 3135 dev->kvm = kvm; 3136 3137 mutex_lock(&kvm->lock); 3138 ret = ops->create(dev, type); 3139 if (ret < 0) { 3140 mutex_unlock(&kvm->lock); 3141 kfree(dev); 3142 return ret; 3143 } 3144 list_add(&dev->vm_node, &kvm->devices); 3145 mutex_unlock(&kvm->lock); 3146 3147 if (ops->init) 3148 ops->init(dev); 3149 3150 kvm_get_kvm(kvm); 3151 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); 3152 if (ret < 0) { 3153 kvm_put_kvm(kvm); 3154 mutex_lock(&kvm->lock); 3155 list_del(&dev->vm_node); 3156 mutex_unlock(&kvm->lock); 3157 ops->destroy(dev); 3158 return ret; 3159 } 3160 3161 cd->fd = ret; 3162 return 0; 3163 } 3164 3165 static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) 3166 { 3167 switch (arg) { 3168 case KVM_CAP_USER_MEMORY: 3169 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 3170 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 3171 case KVM_CAP_INTERNAL_ERROR_DATA: 3172 #ifdef CONFIG_HAVE_KVM_MSI 3173 case KVM_CAP_SIGNAL_MSI: 3174 #endif 3175 #ifdef CONFIG_HAVE_KVM_IRQFD 3176 case KVM_CAP_IRQFD: 3177 case KVM_CAP_IRQFD_RESAMPLE: 3178 #endif 3179 case KVM_CAP_IOEVENTFD_ANY_LENGTH: 3180 case KVM_CAP_CHECK_EXTENSION_VM: 3181 case KVM_CAP_ENABLE_CAP_VM: 3182 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 3183 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: 3184 #endif 3185 return 1; 3186 #ifdef CONFIG_KVM_MMIO 3187 case KVM_CAP_COALESCED_MMIO: 3188 return KVM_COALESCED_MMIO_PAGE_OFFSET; 3189 case KVM_CAP_COALESCED_PIO: 3190 return 1; 3191 #endif 3192 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 3193 case KVM_CAP_IRQ_ROUTING: 3194 return KVM_MAX_IRQ_ROUTES; 3195 #endif 3196 #if KVM_ADDRESS_SPACE_NUM > 1 3197 case KVM_CAP_MULTI_ADDRESS_SPACE: 3198 return KVM_ADDRESS_SPACE_NUM; 3199 #endif 3200 case KVM_CAP_NR_MEMSLOTS: 3201 return KVM_USER_MEM_SLOTS; 3202 default: 3203 break; 3204 } 3205 return kvm_vm_ioctl_check_extension(kvm, arg); 3206 } 3207 3208 int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm, 3209 struct kvm_enable_cap *cap) 3210 { 3211 return 
-EINVAL; 3212 } 3213 3214 static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, 3215 struct kvm_enable_cap *cap) 3216 { 3217 switch (cap->cap) { 3218 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 3219 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: 3220 if (cap->flags || (cap->args[0] & ~1)) 3221 return -EINVAL; 3222 kvm->manual_dirty_log_protect = cap->args[0]; 3223 return 0; 3224 #endif 3225 default: 3226 return kvm_vm_ioctl_enable_cap(kvm, cap); 3227 } 3228 } 3229 3230 static long kvm_vm_ioctl(struct file *filp, 3231 unsigned int ioctl, unsigned long arg) 3232 { 3233 struct kvm *kvm = filp->private_data; 3234 void __user *argp = (void __user *)arg; 3235 int r; 3236 3237 if (kvm->mm != current->mm) 3238 return -EIO; 3239 switch (ioctl) { 3240 case KVM_CREATE_VCPU: 3241 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 3242 break; 3243 case KVM_ENABLE_CAP: { 3244 struct kvm_enable_cap cap; 3245 3246 r = -EFAULT; 3247 if (copy_from_user(&cap, argp, sizeof(cap))) 3248 goto out; 3249 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap); 3250 break; 3251 } 3252 case KVM_SET_USER_MEMORY_REGION: { 3253 struct kvm_userspace_memory_region kvm_userspace_mem; 3254 3255 r = -EFAULT; 3256 if (copy_from_user(&kvm_userspace_mem, argp, 3257 sizeof(kvm_userspace_mem))) 3258 goto out; 3259 3260 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); 3261 break; 3262 } 3263 case KVM_GET_DIRTY_LOG: { 3264 struct kvm_dirty_log log; 3265 3266 r = -EFAULT; 3267 if (copy_from_user(&log, argp, sizeof(log))) 3268 goto out; 3269 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 3270 break; 3271 } 3272 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 3273 case KVM_CLEAR_DIRTY_LOG: { 3274 struct kvm_clear_dirty_log log; 3275 3276 r = -EFAULT; 3277 if (copy_from_user(&log, argp, sizeof(log))) 3278 goto out; 3279 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log); 3280 break; 3281 } 3282 #endif 3283 #ifdef CONFIG_KVM_MMIO 3284 case KVM_REGISTER_COALESCED_MMIO: { 3285 struct kvm_coalesced_mmio_zone zone; 3286 3287 r = -EFAULT; 3288 if (copy_from_user(&zone, argp, sizeof(zone))) 3289 goto out; 3290 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 3291 break; 3292 } 3293 case KVM_UNREGISTER_COALESCED_MMIO: { 3294 struct kvm_coalesced_mmio_zone zone; 3295 3296 r = -EFAULT; 3297 if (copy_from_user(&zone, argp, sizeof(zone))) 3298 goto out; 3299 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 3300 break; 3301 } 3302 #endif 3303 case KVM_IRQFD: { 3304 struct kvm_irqfd data; 3305 3306 r = -EFAULT; 3307 if (copy_from_user(&data, argp, sizeof(data))) 3308 goto out; 3309 r = kvm_irqfd(kvm, &data); 3310 break; 3311 } 3312 case KVM_IOEVENTFD: { 3313 struct kvm_ioeventfd data; 3314 3315 r = -EFAULT; 3316 if (copy_from_user(&data, argp, sizeof(data))) 3317 goto out; 3318 r = kvm_ioeventfd(kvm, &data); 3319 break; 3320 } 3321 #ifdef CONFIG_HAVE_KVM_MSI 3322 case KVM_SIGNAL_MSI: { 3323 struct kvm_msi msi; 3324 3325 r = -EFAULT; 3326 if (copy_from_user(&msi, argp, sizeof(msi))) 3327 goto out; 3328 r = kvm_send_userspace_msi(kvm, &msi); 3329 break; 3330 } 3331 #endif 3332 #ifdef __KVM_HAVE_IRQ_LINE 3333 case KVM_IRQ_LINE_STATUS: 3334 case KVM_IRQ_LINE: { 3335 struct kvm_irq_level irq_event; 3336 3337 r = -EFAULT; 3338 if (copy_from_user(&irq_event, argp, sizeof(irq_event))) 3339 goto out; 3340 3341 r = kvm_vm_ioctl_irq_line(kvm, &irq_event, 3342 ioctl == KVM_IRQ_LINE_STATUS); 3343 if (r) 3344 goto out; 3345 3346 r = -EFAULT; 3347 if (ioctl == KVM_IRQ_LINE_STATUS) { 3348 if (copy_to_user(argp, &irq_event, sizeof(irq_event))) 3349 goto out; 3350 } 
3351 3352 r = 0; 3353 break; 3354 } 3355 #endif 3356 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 3357 case KVM_SET_GSI_ROUTING: { 3358 struct kvm_irq_routing routing; 3359 struct kvm_irq_routing __user *urouting; 3360 struct kvm_irq_routing_entry *entries = NULL; 3361 3362 r = -EFAULT; 3363 if (copy_from_user(&routing, argp, sizeof(routing))) 3364 goto out; 3365 r = -EINVAL; 3366 if (!kvm_arch_can_set_irq_routing(kvm)) 3367 goto out; 3368 if (routing.nr > KVM_MAX_IRQ_ROUTES) 3369 goto out; 3370 if (routing.flags) 3371 goto out; 3372 if (routing.nr) { 3373 r = -ENOMEM; 3374 entries = vmalloc(array_size(sizeof(*entries), 3375 routing.nr)); 3376 if (!entries) 3377 goto out; 3378 r = -EFAULT; 3379 urouting = argp; 3380 if (copy_from_user(entries, urouting->entries, 3381 routing.nr * sizeof(*entries))) 3382 goto out_free_irq_routing; 3383 } 3384 r = kvm_set_irq_routing(kvm, entries, routing.nr, 3385 routing.flags); 3386 out_free_irq_routing: 3387 vfree(entries); 3388 break; 3389 } 3390 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ 3391 case KVM_CREATE_DEVICE: { 3392 struct kvm_create_device cd; 3393 3394 r = -EFAULT; 3395 if (copy_from_user(&cd, argp, sizeof(cd))) 3396 goto out; 3397 3398 r = kvm_ioctl_create_device(kvm, &cd); 3399 if (r) 3400 goto out; 3401 3402 r = -EFAULT; 3403 if (copy_to_user(argp, &cd, sizeof(cd))) 3404 goto out; 3405 3406 r = 0; 3407 break; 3408 } 3409 case KVM_CHECK_EXTENSION: 3410 r = kvm_vm_ioctl_check_extension_generic(kvm, arg); 3411 break; 3412 default: 3413 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 3414 } 3415 out: 3416 return r; 3417 } 3418 3419 #ifdef CONFIG_KVM_COMPAT 3420 struct compat_kvm_dirty_log { 3421 __u32 slot; 3422 __u32 padding1; 3423 union { 3424 compat_uptr_t dirty_bitmap; /* one bit per page */ 3425 __u64 padding2; 3426 }; 3427 }; 3428 3429 static long kvm_vm_compat_ioctl(struct file *filp, 3430 unsigned int ioctl, unsigned long arg) 3431 { 3432 struct kvm *kvm = filp->private_data; 3433 int r; 3434 3435 if (kvm->mm != current->mm) 3436 return -EIO; 3437 switch (ioctl) { 3438 case KVM_GET_DIRTY_LOG: { 3439 struct compat_kvm_dirty_log compat_log; 3440 struct kvm_dirty_log log; 3441 3442 if (copy_from_user(&compat_log, (void __user *)arg, 3443 sizeof(compat_log))) 3444 return -EFAULT; 3445 log.slot = compat_log.slot; 3446 log.padding1 = compat_log.padding1; 3447 log.padding2 = compat_log.padding2; 3448 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); 3449 3450 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 3451 break; 3452 } 3453 default: 3454 r = kvm_vm_ioctl(filp, ioctl, arg); 3455 } 3456 return r; 3457 } 3458 #endif 3459 3460 static struct file_operations kvm_vm_fops = { 3461 .release = kvm_vm_release, 3462 .unlocked_ioctl = kvm_vm_ioctl, 3463 .llseek = noop_llseek, 3464 KVM_COMPAT(kvm_vm_compat_ioctl), 3465 }; 3466 3467 static int kvm_dev_ioctl_create_vm(unsigned long type) 3468 { 3469 int r; 3470 struct kvm *kvm; 3471 struct file *file; 3472 3473 kvm = kvm_create_vm(type); 3474 if (IS_ERR(kvm)) 3475 return PTR_ERR(kvm); 3476 #ifdef CONFIG_KVM_MMIO 3477 r = kvm_coalesced_mmio_init(kvm); 3478 if (r < 0) 3479 goto put_kvm; 3480 #endif 3481 r = get_unused_fd_flags(O_CLOEXEC); 3482 if (r < 0) 3483 goto put_kvm; 3484 3485 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); 3486 if (IS_ERR(file)) { 3487 put_unused_fd(r); 3488 r = PTR_ERR(file); 3489 goto put_kvm; 3490 } 3491 3492 /* 3493 * Don't call kvm_put_kvm anymore at this point; file->f_op is 3494 * already set, with ->release() being kvm_vm_release(). 
In error 3495 * cases it will be called by the final fput(file) and will take 3496 * care of doing kvm_put_kvm(kvm). 3497 */ 3498 if (kvm_create_vm_debugfs(kvm, r) < 0) { 3499 put_unused_fd(r); 3500 fput(file); 3501 return -ENOMEM; 3502 } 3503 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm); 3504 3505 fd_install(r, file); 3506 return r; 3507 3508 put_kvm: 3509 kvm_put_kvm(kvm); 3510 return r; 3511 } 3512 3513 static long kvm_dev_ioctl(struct file *filp, 3514 unsigned int ioctl, unsigned long arg) 3515 { 3516 long r = -EINVAL; 3517 3518 switch (ioctl) { 3519 case KVM_GET_API_VERSION: 3520 if (arg) 3521 goto out; 3522 r = KVM_API_VERSION; 3523 break; 3524 case KVM_CREATE_VM: 3525 r = kvm_dev_ioctl_create_vm(arg); 3526 break; 3527 case KVM_CHECK_EXTENSION: 3528 r = kvm_vm_ioctl_check_extension_generic(NULL, arg); 3529 break; 3530 case KVM_GET_VCPU_MMAP_SIZE: 3531 if (arg) 3532 goto out; 3533 r = PAGE_SIZE; /* struct kvm_run */ 3534 #ifdef CONFIG_X86 3535 r += PAGE_SIZE; /* pio data page */ 3536 #endif 3537 #ifdef CONFIG_KVM_MMIO 3538 r += PAGE_SIZE; /* coalesced mmio ring page */ 3539 #endif 3540 break; 3541 case KVM_TRACE_ENABLE: 3542 case KVM_TRACE_PAUSE: 3543 case KVM_TRACE_DISABLE: 3544 r = -EOPNOTSUPP; 3545 break; 3546 default: 3547 return kvm_arch_dev_ioctl(filp, ioctl, arg); 3548 } 3549 out: 3550 return r; 3551 } 3552 3553 static struct file_operations kvm_chardev_ops = { 3554 .unlocked_ioctl = kvm_dev_ioctl, 3555 .llseek = noop_llseek, 3556 KVM_COMPAT(kvm_dev_ioctl), 3557 }; 3558 3559 static struct miscdevice kvm_dev = { 3560 KVM_MINOR, 3561 "kvm", 3562 &kvm_chardev_ops, 3563 }; 3564 3565 static void hardware_enable_nolock(void *junk) 3566 { 3567 int cpu = raw_smp_processor_id(); 3568 int r; 3569 3570 if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) 3571 return; 3572 3573 cpumask_set_cpu(cpu, cpus_hardware_enabled); 3574 3575 r = kvm_arch_hardware_enable(); 3576 3577 if (r) { 3578 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 3579 atomic_inc(&hardware_enable_failed); 3580 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu); 3581 } 3582 } 3583 3584 static int kvm_starting_cpu(unsigned int cpu) 3585 { 3586 raw_spin_lock(&kvm_count_lock); 3587 if (kvm_usage_count) 3588 hardware_enable_nolock(NULL); 3589 raw_spin_unlock(&kvm_count_lock); 3590 return 0; 3591 } 3592 3593 static void hardware_disable_nolock(void *junk) 3594 { 3595 int cpu = raw_smp_processor_id(); 3596 3597 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) 3598 return; 3599 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 3600 kvm_arch_hardware_disable(); 3601 } 3602 3603 static int kvm_dying_cpu(unsigned int cpu) 3604 { 3605 raw_spin_lock(&kvm_count_lock); 3606 if (kvm_usage_count) 3607 hardware_disable_nolock(NULL); 3608 raw_spin_unlock(&kvm_count_lock); 3609 return 0; 3610 } 3611 3612 static void hardware_disable_all_nolock(void) 3613 { 3614 BUG_ON(!kvm_usage_count); 3615 3616 kvm_usage_count--; 3617 if (!kvm_usage_count) 3618 on_each_cpu(hardware_disable_nolock, NULL, 1); 3619 } 3620 3621 static void hardware_disable_all(void) 3622 { 3623 raw_spin_lock(&kvm_count_lock); 3624 hardware_disable_all_nolock(); 3625 raw_spin_unlock(&kvm_count_lock); 3626 } 3627 3628 static int hardware_enable_all(void) 3629 { 3630 int r = 0; 3631 3632 raw_spin_lock(&kvm_count_lock); 3633 3634 kvm_usage_count++; 3635 if (kvm_usage_count == 1) { 3636 atomic_set(&hardware_enable_failed, 0); 3637 on_each_cpu(hardware_enable_nolock, NULL, 1); 3638 3639 if (atomic_read(&hardware_enable_failed)) { 3640 hardware_disable_all_nolock(); 
3641 r = -EBUSY; 3642 } 3643 } 3644 3645 raw_spin_unlock(&kvm_count_lock); 3646 3647 return r; 3648 } 3649 3650 static int kvm_reboot(struct notifier_block *notifier, unsigned long val, 3651 void *v) 3652 { 3653 /* 3654 * Some (well, at least mine) BIOSes hang on reboot if 3655 * in vmx root mode. 3656 * 3657 * And Intel TXT required VMX off for all cpu when system shutdown. 3658 */ 3659 pr_info("kvm: exiting hardware virtualization\n"); 3660 kvm_rebooting = true; 3661 on_each_cpu(hardware_disable_nolock, NULL, 1); 3662 return NOTIFY_OK; 3663 } 3664 3665 static struct notifier_block kvm_reboot_notifier = { 3666 .notifier_call = kvm_reboot, 3667 .priority = 0, 3668 }; 3669 3670 static void kvm_io_bus_destroy(struct kvm_io_bus *bus) 3671 { 3672 int i; 3673 3674 for (i = 0; i < bus->dev_count; i++) { 3675 struct kvm_io_device *pos = bus->range[i].dev; 3676 3677 kvm_iodevice_destructor(pos); 3678 } 3679 kfree(bus); 3680 } 3681 3682 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1, 3683 const struct kvm_io_range *r2) 3684 { 3685 gpa_t addr1 = r1->addr; 3686 gpa_t addr2 = r2->addr; 3687 3688 if (addr1 < addr2) 3689 return -1; 3690 3691 /* If r2->len == 0, match the exact address. If r2->len != 0, 3692 * accept any overlapping write. Any order is acceptable for 3693 * overlapping ranges, because kvm_io_bus_get_first_dev ensures 3694 * we process all of them. 3695 */ 3696 if (r2->len) { 3697 addr1 += r1->len; 3698 addr2 += r2->len; 3699 } 3700 3701 if (addr1 > addr2) 3702 return 1; 3703 3704 return 0; 3705 } 3706 3707 static int kvm_io_bus_sort_cmp(const void *p1, const void *p2) 3708 { 3709 return kvm_io_bus_cmp(p1, p2); 3710 } 3711 3712 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, 3713 gpa_t addr, int len) 3714 { 3715 struct kvm_io_range *range, key; 3716 int off; 3717 3718 key = (struct kvm_io_range) { 3719 .addr = addr, 3720 .len = len, 3721 }; 3722 3723 range = bsearch(&key, bus->range, bus->dev_count, 3724 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp); 3725 if (range == NULL) 3726 return -ENOENT; 3727 3728 off = range - bus->range; 3729 3730 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0) 3731 off--; 3732 3733 return off; 3734 } 3735 3736 static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 3737 struct kvm_io_range *range, const void *val) 3738 { 3739 int idx; 3740 3741 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 3742 if (idx < 0) 3743 return -EOPNOTSUPP; 3744 3745 while (idx < bus->dev_count && 3746 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 3747 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr, 3748 range->len, val)) 3749 return idx; 3750 idx++; 3751 } 3752 3753 return -EOPNOTSUPP; 3754 } 3755 3756 /* kvm_io_bus_write - called under kvm->slots_lock */ 3757 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 3758 int len, const void *val) 3759 { 3760 struct kvm_io_bus *bus; 3761 struct kvm_io_range range; 3762 int r; 3763 3764 range = (struct kvm_io_range) { 3765 .addr = addr, 3766 .len = len, 3767 }; 3768 3769 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 3770 if (!bus) 3771 return -ENOMEM; 3772 r = __kvm_io_bus_write(vcpu, bus, &range, val); 3773 return r < 0 ? 
r : 0; 3774 } 3775 EXPORT_SYMBOL_GPL(kvm_io_bus_write); 3776 3777 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */ 3778 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, 3779 gpa_t addr, int len, const void *val, long cookie) 3780 { 3781 struct kvm_io_bus *bus; 3782 struct kvm_io_range range; 3783 3784 range = (struct kvm_io_range) { 3785 .addr = addr, 3786 .len = len, 3787 }; 3788 3789 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 3790 if (!bus) 3791 return -ENOMEM; 3792 3793 /* First try the device referenced by cookie. */ 3794 if ((cookie >= 0) && (cookie < bus->dev_count) && 3795 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0)) 3796 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len, 3797 val)) 3798 return cookie; 3799 3800 /* 3801 * cookie contained garbage; fall back to search and return the 3802 * correct cookie value. 3803 */ 3804 return __kvm_io_bus_write(vcpu, bus, &range, val); 3805 } 3806 3807 static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 3808 struct kvm_io_range *range, void *val) 3809 { 3810 int idx; 3811 3812 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 3813 if (idx < 0) 3814 return -EOPNOTSUPP; 3815 3816 while (idx < bus->dev_count && 3817 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 3818 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr, 3819 range->len, val)) 3820 return idx; 3821 idx++; 3822 } 3823 3824 return -EOPNOTSUPP; 3825 } 3826 3827 /* kvm_io_bus_read - called under kvm->slots_lock */ 3828 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 3829 int len, void *val) 3830 { 3831 struct kvm_io_bus *bus; 3832 struct kvm_io_range range; 3833 int r; 3834 3835 range = (struct kvm_io_range) { 3836 .addr = addr, 3837 .len = len, 3838 }; 3839 3840 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 3841 if (!bus) 3842 return -ENOMEM; 3843 r = __kvm_io_bus_read(vcpu, bus, &range, val); 3844 return r < 0 ? r : 0; 3845 } 3846 3847 /* Caller must hold slots_lock. */ 3848 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 3849 int len, struct kvm_io_device *dev) 3850 { 3851 int i; 3852 struct kvm_io_bus *new_bus, *bus; 3853 struct kvm_io_range range; 3854 3855 bus = kvm_get_bus(kvm, bus_idx); 3856 if (!bus) 3857 return -ENOMEM; 3858 3859 /* exclude ioeventfd which is limited by maximum fd */ 3860 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) 3861 return -ENOSPC; 3862 3863 new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1), 3864 GFP_KERNEL_ACCOUNT); 3865 if (!new_bus) 3866 return -ENOMEM; 3867 3868 range = (struct kvm_io_range) { 3869 .addr = addr, 3870 .len = len, 3871 .dev = dev, 3872 }; 3873 3874 for (i = 0; i < bus->dev_count; i++) 3875 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0) 3876 break; 3877 3878 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); 3879 new_bus->dev_count++; 3880 new_bus->range[i] = range; 3881 memcpy(new_bus->range + i + 1, bus->range + i, 3882 (bus->dev_count - i) * sizeof(struct kvm_io_range)); 3883 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 3884 synchronize_srcu_expedited(&kvm->srcu); 3885 kfree(bus); 3886 3887 return 0; 3888 } 3889 3890 /* Caller must hold slots_lock. 
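Readers walk the bus array under SRCU, so the old copy is freed only after synchronize_srcu_expedited().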
*/ 3891 void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, 3892 struct kvm_io_device *dev) 3893 { 3894 int i; 3895 struct kvm_io_bus *new_bus, *bus; 3896 3897 bus = kvm_get_bus(kvm, bus_idx); 3898 if (!bus) 3899 return; 3900 3901 for (i = 0; i < bus->dev_count; i++) 3902 if (bus->range[i].dev == dev) { 3903 break; 3904 } 3905 3906 if (i == bus->dev_count) 3907 return; 3908 3909 new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1), 3910 GFP_KERNEL_ACCOUNT); 3911 if (!new_bus) { 3912 pr_err("kvm: failed to shrink bus, removing it completely\n"); 3913 goto broken; 3914 } 3915 3916 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); 3917 new_bus->dev_count--; 3918 memcpy(new_bus->range + i, bus->range + i + 1, 3919 (new_bus->dev_count - i) * sizeof(struct kvm_io_range)); 3920 3921 broken: 3922 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 3923 synchronize_srcu_expedited(&kvm->srcu); 3924 kfree(bus); 3925 return; 3926 } 3927 3928 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, 3929 gpa_t addr) 3930 { 3931 struct kvm_io_bus *bus; 3932 int dev_idx, srcu_idx; 3933 struct kvm_io_device *iodev = NULL; 3934 3935 srcu_idx = srcu_read_lock(&kvm->srcu); 3936 3937 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); 3938 if (!bus) 3939 goto out_unlock; 3940 3941 dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1); 3942 if (dev_idx < 0) 3943 goto out_unlock; 3944 3945 iodev = bus->range[dev_idx].dev; 3946 3947 out_unlock: 3948 srcu_read_unlock(&kvm->srcu, srcu_idx); 3949 3950 return iodev; 3951 } 3952 EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev); 3953 3954 static int kvm_debugfs_open(struct inode *inode, struct file *file, 3955 int (*get)(void *, u64 *), int (*set)(void *, u64), 3956 const char *fmt) 3957 { 3958 struct kvm_stat_data *stat_data = (struct kvm_stat_data *) 3959 inode->i_private; 3960 3961 /* The debugfs files are a reference to the kvm struct which 3962 * is still valid when kvm_destroy_vm is called. 3963 * To avoid the race between open and the removal of the debugfs 3964 * directory we test against the users count. 3965 */ 3966 if (!refcount_inc_not_zero(&stat_data->kvm->users_count)) 3967 return -ENOENT; 3968 3969 if (simple_attr_open(inode, file, get, 3970 stat_data->mode & S_IWUGO ? 
set : NULL, 3971 fmt)) { 3972 kvm_put_kvm(stat_data->kvm); 3973 return -ENOMEM; 3974 } 3975 3976 return 0; 3977 } 3978 3979 static int kvm_debugfs_release(struct inode *inode, struct file *file) 3980 { 3981 struct kvm_stat_data *stat_data = (struct kvm_stat_data *) 3982 inode->i_private; 3983 3984 simple_attr_release(inode, file); 3985 kvm_put_kvm(stat_data->kvm); 3986 3987 return 0; 3988 } 3989 3990 static int vm_stat_get_per_vm(void *data, u64 *val) 3991 { 3992 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 3993 3994 *val = *(ulong *)((void *)stat_data->kvm + stat_data->offset); 3995 3996 return 0; 3997 } 3998 3999 static int vm_stat_clear_per_vm(void *data, u64 val) 4000 { 4001 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 4002 4003 if (val) 4004 return -EINVAL; 4005 4006 *(ulong *)((void *)stat_data->kvm + stat_data->offset) = 0; 4007 4008 return 0; 4009 } 4010 4011 static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file) 4012 { 4013 __simple_attr_check_format("%llu\n", 0ull); 4014 return kvm_debugfs_open(inode, file, vm_stat_get_per_vm, 4015 vm_stat_clear_per_vm, "%llu\n"); 4016 } 4017 4018 static const struct file_operations vm_stat_get_per_vm_fops = { 4019 .owner = THIS_MODULE, 4020 .open = vm_stat_get_per_vm_open, 4021 .release = kvm_debugfs_release, 4022 .read = simple_attr_read, 4023 .write = simple_attr_write, 4024 .llseek = no_llseek, 4025 }; 4026 4027 static int vcpu_stat_get_per_vm(void *data, u64 *val) 4028 { 4029 int i; 4030 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 4031 struct kvm_vcpu *vcpu; 4032 4033 *val = 0; 4034 4035 kvm_for_each_vcpu(i, vcpu, stat_data->kvm) 4036 *val += *(u64 *)((void *)vcpu + stat_data->offset); 4037 4038 return 0; 4039 } 4040 4041 static int vcpu_stat_clear_per_vm(void *data, u64 val) 4042 { 4043 int i; 4044 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 4045 struct kvm_vcpu *vcpu; 4046 4047 if (val) 4048 return -EINVAL; 4049 4050 kvm_for_each_vcpu(i, vcpu, stat_data->kvm) 4051 *(u64 *)((void *)vcpu + stat_data->offset) = 0; 4052 4053 return 0; 4054 } 4055 4056 static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file) 4057 { 4058 __simple_attr_check_format("%llu\n", 0ull); 4059 return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm, 4060 vcpu_stat_clear_per_vm, "%llu\n"); 4061 } 4062 4063 static const struct file_operations vcpu_stat_get_per_vm_fops = { 4064 .owner = THIS_MODULE, 4065 .open = vcpu_stat_get_per_vm_open, 4066 .release = kvm_debugfs_release, 4067 .read = simple_attr_read, 4068 .write = simple_attr_write, 4069 .llseek = no_llseek, 4070 }; 4071 4072 static const struct file_operations *stat_fops_per_vm[] = { 4073 [KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops, 4074 [KVM_STAT_VM] = &vm_stat_get_per_vm_fops, 4075 }; 4076 4077 static int vm_stat_get(void *_offset, u64 *val) 4078 { 4079 unsigned offset = (long)_offset; 4080 struct kvm *kvm; 4081 struct kvm_stat_data stat_tmp = {.offset = offset}; 4082 u64 tmp_val; 4083 4084 *val = 0; 4085 mutex_lock(&kvm_lock); 4086 list_for_each_entry(kvm, &vm_list, vm_list) { 4087 stat_tmp.kvm = kvm; 4088 vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val); 4089 *val += tmp_val; 4090 } 4091 mutex_unlock(&kvm_lock); 4092 return 0; 4093 } 4094 4095 static int vm_stat_clear(void *_offset, u64 val) 4096 { 4097 unsigned offset = (long)_offset; 4098 struct kvm *kvm; 4099 struct kvm_stat_data stat_tmp = {.offset = offset}; 4100 4101 if (val) 4102 return -EINVAL; 4103 4104 mutex_lock(&kvm_lock); 4105 
list_for_each_entry(kvm, &vm_list, vm_list) { 4106 stat_tmp.kvm = kvm; 4107 vm_stat_clear_per_vm((void *)&stat_tmp, 0); 4108 } 4109 mutex_unlock(&kvm_lock); 4110 4111 return 0; 4112 } 4113 4114 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n"); 4115 4116 static int vcpu_stat_get(void *_offset, u64 *val) 4117 { 4118 unsigned offset = (long)_offset; 4119 struct kvm *kvm; 4120 struct kvm_stat_data stat_tmp = {.offset = offset}; 4121 u64 tmp_val; 4122 4123 *val = 0; 4124 mutex_lock(&kvm_lock); 4125 list_for_each_entry(kvm, &vm_list, vm_list) { 4126 stat_tmp.kvm = kvm; 4127 vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val); 4128 *val += tmp_val; 4129 } 4130 mutex_unlock(&kvm_lock); 4131 return 0; 4132 } 4133 4134 static int vcpu_stat_clear(void *_offset, u64 val) 4135 { 4136 unsigned offset = (long)_offset; 4137 struct kvm *kvm; 4138 struct kvm_stat_data stat_tmp = {.offset = offset}; 4139 4140 if (val) 4141 return -EINVAL; 4142 4143 mutex_lock(&kvm_lock); 4144 list_for_each_entry(kvm, &vm_list, vm_list) { 4145 stat_tmp.kvm = kvm; 4146 vcpu_stat_clear_per_vm((void *)&stat_tmp, 0); 4147 } 4148 mutex_unlock(&kvm_lock); 4149 4150 return 0; 4151 } 4152 4153 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear, 4154 "%llu\n"); 4155 4156 static const struct file_operations *stat_fops[] = { 4157 [KVM_STAT_VCPU] = &vcpu_stat_fops, 4158 [KVM_STAT_VM] = &vm_stat_fops, 4159 }; 4160 4161 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) 4162 { 4163 struct kobj_uevent_env *env; 4164 unsigned long long created, active; 4165 4166 if (!kvm_dev.this_device || !kvm) 4167 return; 4168 4169 mutex_lock(&kvm_lock); 4170 if (type == KVM_EVENT_CREATE_VM) { 4171 kvm_createvm_count++; 4172 kvm_active_vms++; 4173 } else if (type == KVM_EVENT_DESTROY_VM) { 4174 kvm_active_vms--; 4175 } 4176 created = kvm_createvm_count; 4177 active = kvm_active_vms; 4178 mutex_unlock(&kvm_lock); 4179 4180 env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT); 4181 if (!env) 4182 return; 4183 4184 add_uevent_var(env, "CREATED=%llu", created); 4185 add_uevent_var(env, "COUNT=%llu", active); 4186 4187 if (type == KVM_EVENT_CREATE_VM) { 4188 add_uevent_var(env, "EVENT=create"); 4189 kvm->userspace_pid = task_pid_nr(current); 4190 } else if (type == KVM_EVENT_DESTROY_VM) { 4191 add_uevent_var(env, "EVENT=destroy"); 4192 } 4193 add_uevent_var(env, "PID=%d", kvm->userspace_pid); 4194 4195 if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) { 4196 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT); 4197 4198 if (p) { 4199 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX); 4200 if (!IS_ERR(tmp)) 4201 add_uevent_var(env, "STATS_PATH=%s", tmp); 4202 kfree(p); 4203 } 4204 } 4205 /* no need for checks, since we are adding at most only 5 keys */ 4206 env->envp[env->envp_idx++] = NULL; 4207 kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp); 4208 kfree(env); 4209 } 4210 4211 static void kvm_init_debug(void) 4212 { 4213 struct kvm_stats_debugfs_item *p; 4214 4215 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); 4216 4217 kvm_debugfs_num_entries = 0; 4218 for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) { 4219 int mode = p->mode ? 
p->mode : 0644; 4220 debugfs_create_file(p->name, mode, kvm_debugfs_dir, 4221 (void *)(long)p->offset, 4222 stat_fops[p->kind]); 4223 } 4224 } 4225 4226 static int kvm_suspend(void) 4227 { 4228 if (kvm_usage_count) 4229 hardware_disable_nolock(NULL); 4230 return 0; 4231 } 4232 4233 static void kvm_resume(void) 4234 { 4235 if (kvm_usage_count) { 4236 #ifdef CONFIG_LOCKDEP 4237 WARN_ON(lockdep_is_held(&kvm_count_lock)); 4238 #endif 4239 hardware_enable_nolock(NULL); 4240 } 4241 } 4242 4243 static struct syscore_ops kvm_syscore_ops = { 4244 .suspend = kvm_suspend, 4245 .resume = kvm_resume, 4246 }; 4247 4248 static inline 4249 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) 4250 { 4251 return container_of(pn, struct kvm_vcpu, preempt_notifier); 4252 } 4253 4254 static void kvm_sched_in(struct preempt_notifier *pn, int cpu) 4255 { 4256 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 4257 4258 WRITE_ONCE(vcpu->preempted, false); 4259 WRITE_ONCE(vcpu->ready, false); 4260 4261 kvm_arch_sched_in(vcpu, cpu); 4262 4263 kvm_arch_vcpu_load(vcpu, cpu); 4264 } 4265 4266 static void kvm_sched_out(struct preempt_notifier *pn, 4267 struct task_struct *next) 4268 { 4269 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 4270 4271 if (current->state == TASK_RUNNING) { 4272 WRITE_ONCE(vcpu->preempted, true); 4273 WRITE_ONCE(vcpu->ready, true); 4274 } 4275 kvm_arch_vcpu_put(vcpu); 4276 } 4277 4278 static void check_processor_compat(void *rtn) 4279 { 4280 *(int *)rtn = kvm_arch_check_processor_compat(); 4281 } 4282 4283 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, 4284 struct module *module) 4285 { 4286 int r; 4287 int cpu; 4288 4289 r = kvm_arch_init(opaque); 4290 if (r) 4291 goto out_fail; 4292 4293 /* 4294 * kvm_arch_init makes sure there's at most one caller 4295 * for architectures that support multiple implementations, 4296 * like intel and amd on x86. 4297 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating 4298 * conflicts in case kvm is already setup for another implementation. 4299 */ 4300 r = kvm_irqfd_init(); 4301 if (r) 4302 goto out_irqfd; 4303 4304 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { 4305 r = -ENOMEM; 4306 goto out_free_0; 4307 } 4308 4309 r = kvm_arch_hardware_setup(); 4310 if (r < 0) 4311 goto out_free_0a; 4312 4313 for_each_online_cpu(cpu) { 4314 smp_call_function_single(cpu, check_processor_compat, &r, 1); 4315 if (r < 0) 4316 goto out_free_1; 4317 } 4318 4319 r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting", 4320 kvm_starting_cpu, kvm_dying_cpu); 4321 if (r) 4322 goto out_free_2; 4323 register_reboot_notifier(&kvm_reboot_notifier); 4324 4325 /* A kmem cache lets us meet the alignment requirements of fx_save. 
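If the architecture does not pass an alignment, the natural alignment of struct kvm_vcpu is used instead.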
static void check_processor_compat(void *rtn)
{
	*(int *)rtn = kvm_arch_check_processor_compat();
}

int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
	     struct module *module)
{
	int r;
	int cpu;

	r = kvm_arch_init(opaque);
	if (r)
		goto out_fail;

	/*
	 * kvm_arch_init makes sure there's at most one caller
	 * for architectures that support multiple implementations,
	 * like Intel and AMD on x86.
	 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
	 * conflicts in case kvm is already set up for another implementation.
	 */
	r = kvm_irqfd_init();
	if (r)
		goto out_irqfd;

	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
		r = -ENOMEM;
		goto out_free_0;
	}

	r = kvm_arch_hardware_setup();
	if (r < 0)
		goto out_free_0a;

	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu, check_processor_compat, &r, 1);
		if (r < 0)
			goto out_free_1;
	}

	r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
				      kvm_starting_cpu, kvm_dying_cpu);
	if (r)
		goto out_free_2;
	register_reboot_notifier(&kvm_reboot_notifier);

	/* A kmem cache lets us meet the alignment requirements of fx_save. */
	if (!vcpu_align)
		vcpu_align = __alignof__(struct kvm_vcpu);
	kvm_vcpu_cache =
		kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
					   SLAB_ACCOUNT,
					   offsetof(struct kvm_vcpu, arch),
					   sizeof_field(struct kvm_vcpu, arch),
					   NULL);
	if (!kvm_vcpu_cache) {
		r = -ENOMEM;
		goto out_free_3;
	}

	r = kvm_async_pf_init();
	if (r)
		goto out_free;

	kvm_chardev_ops.owner = module;
	kvm_vm_fops.owner = module;
	kvm_vcpu_fops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
		pr_err("kvm: misc device register failed\n");
		goto out_unreg;
	}

	register_syscore_ops(&kvm_syscore_ops);

	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

	kvm_init_debug();

	r = kvm_vfio_ops_init();
	WARN_ON(r);

	return 0;

out_unreg:
	kvm_async_pf_deinit();
out_free:
	kmem_cache_destroy(kvm_vcpu_cache);
out_free_3:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
out_free_2:
out_free_1:
	kvm_arch_hardware_unsetup();
out_free_0a:
	free_cpumask_var(cpus_hardware_enabled);
out_free_0:
	kvm_irqfd_exit();
out_irqfd:
	kvm_arch_exit();
out_fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

void kvm_exit(void)
{
	debugfs_remove_recursive(kvm_debugfs_dir);
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	kvm_async_pf_deinit();
	unregister_syscore_ops(&kvm_syscore_ops);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
	on_each_cpu(hardware_disable_nolock, NULL, 1);
	kvm_arch_hardware_unsetup();
	kvm_arch_exit();
	kvm_irqfd_exit();
	free_cpumask_var(cpus_hardware_enabled);
	kvm_vfio_ops_exit();
}
EXPORT_SYMBOL_GPL(kvm_exit);
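/*
 * Illustrative sketch (an assumption, not taken from this file): an
 * architecture module typically calls kvm_init()/kvm_exit() from its own
 * module hooks, passing an opaque ops pointer, its vcpu size/alignment and
 * THIS_MODULE.  The "demo_*" names are hypothetical placeholders.
 *
 *	static int __init demo_kvm_module_init(void)
 *	{
 *		return kvm_init(&demo_arch_ops, sizeof(struct demo_vcpu),
 *				__alignof__(struct demo_vcpu), THIS_MODULE);
 *	}
 *
 *	static void __exit demo_kvm_module_exit(void)
 *	{
 *		kvm_exit();
 *	}
 *
 *	module_init(demo_kvm_module_init);
 *	module_exit(demo_kvm_module_exit);
 */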
struct kvm_vm_worker_thread_context {
	struct kvm *kvm;
	struct task_struct *parent;
	struct completion init_done;
	kvm_vm_thread_fn_t thread_fn;
	uintptr_t data;
	int err;
};

static int kvm_vm_worker_thread(void *context)
{
	/*
	 * The init_context is allocated on the stack of the parent thread, so
	 * we have to locally copy anything that is needed beyond initialization.
	 */
	struct kvm_vm_worker_thread_context *init_context = context;
	struct kvm *kvm = init_context->kvm;
	kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
	uintptr_t data = init_context->data;
	int err;

	err = kthread_park(current);
	/* kthread_park(current) is never supposed to return an error */
	WARN_ON(err != 0);
	if (err)
		goto init_complete;

	err = cgroup_attach_task_all(init_context->parent, current);
	if (err) {
		kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
			__func__, err);
		goto init_complete;
	}

	set_user_nice(current, task_nice(init_context->parent));

init_complete:
	init_context->err = err;
	complete(&init_context->init_done);
	init_context = NULL;

	if (err)
		return err;

	/* Wait to be woken up by the spawner before proceeding. */
	kthread_parkme();

	if (!kthread_should_stop())
		err = thread_fn(kvm, data);

	return err;
}

int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
				uintptr_t data, const char *name,
				struct task_struct **thread_ptr)
{
	struct kvm_vm_worker_thread_context init_context = {};
	struct task_struct *thread;

	*thread_ptr = NULL;
	init_context.kvm = kvm;
	init_context.parent = current;
	init_context.thread_fn = thread_fn;
	init_context.data = data;
	init_completion(&init_context.init_done);

	thread = kthread_run(kvm_vm_worker_thread, &init_context,
			     "%s-%d", name, task_pid_nr(current));
	if (IS_ERR(thread))
		return PTR_ERR(thread);

	/* kthread_run is never supposed to return NULL */
	WARN_ON(thread == NULL);

	wait_for_completion(&init_context.init_done);

	if (!init_context.err)
		*thread_ptr = thread;

	return init_context.err;
}
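/*
 * Illustrative sketch (an assumption, not taken from this file): a caller
 * supplies a kvm_vm_thread_fn_t, unparks the worker to start it (the worker
 * parks itself before running thread_fn), and later stops it with
 * kthread_stop().  The "demo_*" names are hypothetical placeholders.
 *
 *	static int demo_worker(struct kvm *kvm, uintptr_t data)
 *	{
 *		while (!kthread_should_stop())
 *			schedule_timeout_interruptible(HZ);
 *		return 0;
 *	}
 *
 *	struct task_struct *t;
 *	int err = kvm_vm_create_worker_thread(kvm, demo_worker, 0,
 *					      "demo-worker", &t);
 *	if (!err)
 *		kthread_unpark(t);
 *	...
 *	if (t)
 *		kthread_stop(t);
 */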