1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Kernel-based Virtual Machine driver for Linux 4 * 5 * This module enables machines with Intel VT-x extensions to run virtual 6 * machines without emulation or binary translation. 7 * 8 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 10 * 11 * Authors: 12 * Avi Kivity <avi@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com> 14 */ 15 16 #include <kvm/iodev.h> 17 18 #include <linux/kvm_host.h> 19 #include <linux/kvm.h> 20 #include <linux/module.h> 21 #include <linux/errno.h> 22 #include <linux/percpu.h> 23 #include <linux/mm.h> 24 #include <linux/miscdevice.h> 25 #include <linux/vmalloc.h> 26 #include <linux/reboot.h> 27 #include <linux/debugfs.h> 28 #include <linux/highmem.h> 29 #include <linux/file.h> 30 #include <linux/syscore_ops.h> 31 #include <linux/cpu.h> 32 #include <linux/sched/signal.h> 33 #include <linux/sched/mm.h> 34 #include <linux/sched/stat.h> 35 #include <linux/cpumask.h> 36 #include <linux/smp.h> 37 #include <linux/anon_inodes.h> 38 #include <linux/profile.h> 39 #include <linux/kvm_para.h> 40 #include <linux/pagemap.h> 41 #include <linux/mman.h> 42 #include <linux/swap.h> 43 #include <linux/bitops.h> 44 #include <linux/spinlock.h> 45 #include <linux/compat.h> 46 #include <linux/srcu.h> 47 #include <linux/hugetlb.h> 48 #include <linux/slab.h> 49 #include <linux/sort.h> 50 #include <linux/bsearch.h> 51 #include <linux/io.h> 52 #include <linux/lockdep.h> 53 #include <linux/kthread.h> 54 55 #include <asm/processor.h> 56 #include <asm/ioctl.h> 57 #include <linux/uaccess.h> 58 #include <asm/pgtable.h> 59 60 #include "coalesced_mmio.h" 61 #include "async_pf.h" 62 #include "vfio.h" 63 64 #define CREATE_TRACE_POINTS 65 #include <trace/events/kvm.h> 66 67 /* Worst case buffer size needed for holding an integer. */ 68 #define ITOA_MAX_LEN 12 69 70 MODULE_AUTHOR("Qumranet"); 71 MODULE_LICENSE("GPL"); 72 73 /* Architectures should define their poll value according to the halt latency */ 74 unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT; 75 module_param(halt_poll_ns, uint, 0644); 76 EXPORT_SYMBOL_GPL(halt_poll_ns); 77 78 /* Default doubles per-vcpu halt_poll_ns. */ 79 unsigned int halt_poll_ns_grow = 2; 80 module_param(halt_poll_ns_grow, uint, 0644); 81 EXPORT_SYMBOL_GPL(halt_poll_ns_grow); 82 83 /* The start value to grow halt_poll_ns from */ 84 unsigned int halt_poll_ns_grow_start = 10000; /* 10us */ 85 module_param(halt_poll_ns_grow_start, uint, 0644); 86 EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start); 87 88 /* Default resets per-vcpu halt_poll_ns . 
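 *
 * Illustrative behaviour of the three knobs above (a rough sketch of the
 * grow/shrink helpers later in this file, not a spec): with
 * halt_poll_ns_grow = 2 and halt_poll_ns_grow_start = 10000, a vcpu's
 * poll window grows 0 -> 10us -> 20us -> 40us -> ... while it is still
 * below the global halt_poll_ns; with halt_poll_ns_shrink = 0 it is
 * reset straight back to 0 once polling stops paying off, while e.g. 2
 * would halve it instead.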
*/ 89 unsigned int halt_poll_ns_shrink; 90 module_param(halt_poll_ns_shrink, uint, 0644); 91 EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); 92 93 /* 94 * Ordering of locks: 95 * 96 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock 97 */ 98 99 DEFINE_MUTEX(kvm_lock); 100 static DEFINE_RAW_SPINLOCK(kvm_count_lock); 101 LIST_HEAD(vm_list); 102 103 static cpumask_var_t cpus_hardware_enabled; 104 static int kvm_usage_count; 105 static atomic_t hardware_enable_failed; 106 107 static struct kmem_cache *kvm_vcpu_cache; 108 109 static __read_mostly struct preempt_ops kvm_preempt_ops; 110 static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu); 111 112 struct dentry *kvm_debugfs_dir; 113 EXPORT_SYMBOL_GPL(kvm_debugfs_dir); 114 115 static int kvm_debugfs_num_entries; 116 static const struct file_operations stat_fops_per_vm; 117 118 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 119 unsigned long arg); 120 #ifdef CONFIG_KVM_COMPAT 121 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, 122 unsigned long arg); 123 #define KVM_COMPAT(c) .compat_ioctl = (c) 124 #else 125 /* 126 * For architectures that don't implement a compat infrastructure, 127 * adopt a double line of defense: 128 * - Prevent a compat task from opening /dev/kvm 129 * - If the open has been done by a 64bit task, and the KVM fd 130 * passed to a compat task, let the ioctls fail. 131 */ 132 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl, 133 unsigned long arg) { return -EINVAL; } 134 135 static int kvm_no_compat_open(struct inode *inode, struct file *file) 136 { 137 return is_compat_task() ? -ENODEV : 0; 138 } 139 #define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \ 140 .open = kvm_no_compat_open 141 #endif 142 static int hardware_enable_all(void); 143 static void hardware_disable_all(void); 144 145 static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 146 147 static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn); 148 149 __visible bool kvm_rebooting; 150 EXPORT_SYMBOL_GPL(kvm_rebooting); 151 152 static bool largepages_enabled = true; 153 154 #define KVM_EVENT_CREATE_VM 0 155 #define KVM_EVENT_DESTROY_VM 1 156 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); 157 static unsigned long long kvm_createvm_count; 158 static unsigned long long kvm_active_vms; 159 160 __weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, 161 unsigned long start, unsigned long end, bool blockable) 162 { 163 return 0; 164 } 165 166 bool kvm_is_zone_device_pfn(kvm_pfn_t pfn) 167 { 168 /* 169 * The metadata used by is_zone_device_page() to determine whether or 170 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if 171 * the device has been pinned, e.g. by get_user_pages(). WARN if the 172 * page_count() is zero to help detect bad usage of this helper. 173 */ 174 if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn)))) 175 return false; 176 177 return is_zone_device_page(pfn_to_page(pfn)); 178 } 179 180 bool kvm_is_reserved_pfn(kvm_pfn_t pfn) 181 { 182 /* 183 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting 184 * perspective they are "normal" pages, albeit with slightly different 185 * usage rules. 
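 *
 * For example, elsewhere in this file kvm_release_pfn_clean() only drops
 * a page reference for pfns that are not reserved, and kvm_set_pfn_dirty()
 * additionally skips ZONE_DEVICE pfns:
 *
 *	if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn))
 *		SetPageDirty(pfn_to_page(pfn));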
186 */ 187 if (pfn_valid(pfn)) 188 return PageReserved(pfn_to_page(pfn)) && 189 !is_zero_pfn(pfn) && 190 !kvm_is_zone_device_pfn(pfn); 191 192 return true; 193 } 194 195 bool kvm_is_transparent_hugepage(kvm_pfn_t pfn) 196 { 197 struct page *page = pfn_to_page(pfn); 198 199 if (!PageTransCompoundMap(page)) 200 return false; 201 202 return is_transparent_hugepage(compound_head(page)); 203 } 204 205 /* 206 * Switches to specified vcpu, until a matching vcpu_put() 207 */ 208 void vcpu_load(struct kvm_vcpu *vcpu) 209 { 210 int cpu = get_cpu(); 211 212 __this_cpu_write(kvm_running_vcpu, vcpu); 213 preempt_notifier_register(&vcpu->preempt_notifier); 214 kvm_arch_vcpu_load(vcpu, cpu); 215 put_cpu(); 216 } 217 EXPORT_SYMBOL_GPL(vcpu_load); 218 219 void vcpu_put(struct kvm_vcpu *vcpu) 220 { 221 preempt_disable(); 222 kvm_arch_vcpu_put(vcpu); 223 preempt_notifier_unregister(&vcpu->preempt_notifier); 224 __this_cpu_write(kvm_running_vcpu, NULL); 225 preempt_enable(); 226 } 227 EXPORT_SYMBOL_GPL(vcpu_put); 228 229 /* TODO: merge with kvm_arch_vcpu_should_kick */ 230 static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req) 231 { 232 int mode = kvm_vcpu_exiting_guest_mode(vcpu); 233 234 /* 235 * We need to wait for the VCPU to reenable interrupts and get out of 236 * READING_SHADOW_PAGE_TABLES mode. 237 */ 238 if (req & KVM_REQUEST_WAIT) 239 return mode != OUTSIDE_GUEST_MODE; 240 241 /* 242 * Need to kick a running VCPU, but otherwise there is nothing to do. 243 */ 244 return mode == IN_GUEST_MODE; 245 } 246 247 static void ack_flush(void *_completed) 248 { 249 } 250 251 static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait) 252 { 253 if (unlikely(!cpus)) 254 cpus = cpu_online_mask; 255 256 if (cpumask_empty(cpus)) 257 return false; 258 259 smp_call_function_many(cpus, ack_flush, NULL, wait); 260 return true; 261 } 262 263 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, 264 unsigned long *vcpu_bitmap, cpumask_var_t tmp) 265 { 266 int i, cpu, me; 267 struct kvm_vcpu *vcpu; 268 bool called; 269 270 me = get_cpu(); 271 272 kvm_for_each_vcpu(i, vcpu, kvm) { 273 if (vcpu_bitmap && !test_bit(i, vcpu_bitmap)) 274 continue; 275 276 kvm_make_request(req, vcpu); 277 cpu = vcpu->cpu; 278 279 if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu)) 280 continue; 281 282 if (tmp != NULL && cpu != -1 && cpu != me && 283 kvm_request_needs_ipi(vcpu, req)) 284 __cpumask_set_cpu(cpu, tmp); 285 } 286 287 called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT)); 288 put_cpu(); 289 290 return called; 291 } 292 293 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) 294 { 295 cpumask_var_t cpus; 296 bool called; 297 298 zalloc_cpumask_var(&cpus, GFP_ATOMIC); 299 300 called = kvm_make_vcpus_request_mask(kvm, req, NULL, cpus); 301 302 free_cpumask_var(cpus); 303 return called; 304 } 305 306 #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL 307 void kvm_flush_remote_tlbs(struct kvm *kvm) 308 { 309 /* 310 * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in 311 * kvm_make_all_cpus_request. 312 */ 313 long dirty_count = smp_load_acquire(&kvm->tlbs_dirty); 314 315 /* 316 * We want to publish modifications to the page tables before reading 317 * mode. Pairs with a memory barrier in arch-specific code. 318 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest 319 * and smp_mb in walk_shadow_page_lockless_begin/end. 320 * - powerpc: smp_mb in kvmppc_prepare_to_enter. 
321 * 322 * There is already an smp_mb__after_atomic() before 323 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that 324 * barrier here. 325 */ 326 if (!kvm_arch_flush_remote_tlb(kvm) 327 || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 328 ++kvm->stat.remote_tlb_flush; 329 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); 330 } 331 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); 332 #endif 333 334 void kvm_reload_remote_mmus(struct kvm *kvm) 335 { 336 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 337 } 338 339 static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 340 { 341 mutex_init(&vcpu->mutex); 342 vcpu->cpu = -1; 343 vcpu->kvm = kvm; 344 vcpu->vcpu_id = id; 345 vcpu->pid = NULL; 346 init_swait_queue_head(&vcpu->wq); 347 kvm_async_pf_vcpu_init(vcpu); 348 349 vcpu->pre_pcpu = -1; 350 INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); 351 352 kvm_vcpu_set_in_spin_loop(vcpu, false); 353 kvm_vcpu_set_dy_eligible(vcpu, false); 354 vcpu->preempted = false; 355 vcpu->ready = false; 356 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 357 } 358 359 void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) 360 { 361 kvm_arch_vcpu_destroy(vcpu); 362 363 /* 364 * No need for rcu_read_lock as VCPU_RUN is the only place that changes 365 * the vcpu->pid pointer, and at destruction time all file descriptors 366 * are already gone. 367 */ 368 put_pid(rcu_dereference_protected(vcpu->pid, 1)); 369 370 free_page((unsigned long)vcpu->run); 371 kmem_cache_free(kvm_vcpu_cache, vcpu); 372 } 373 EXPORT_SYMBOL_GPL(kvm_vcpu_destroy); 374 375 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 376 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) 377 { 378 return container_of(mn, struct kvm, mmu_notifier); 379 } 380 381 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, 382 struct mm_struct *mm, 383 unsigned long address, 384 pte_t pte) 385 { 386 struct kvm *kvm = mmu_notifier_to_kvm(mn); 387 int idx; 388 389 idx = srcu_read_lock(&kvm->srcu); 390 spin_lock(&kvm->mmu_lock); 391 kvm->mmu_notifier_seq++; 392 393 if (kvm_set_spte_hva(kvm, address, pte)) 394 kvm_flush_remote_tlbs(kvm); 395 396 spin_unlock(&kvm->mmu_lock); 397 srcu_read_unlock(&kvm->srcu, idx); 398 } 399 400 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, 401 const struct mmu_notifier_range *range) 402 { 403 struct kvm *kvm = mmu_notifier_to_kvm(mn); 404 int need_tlb_flush = 0, idx; 405 int ret; 406 407 idx = srcu_read_lock(&kvm->srcu); 408 spin_lock(&kvm->mmu_lock); 409 /* 410 * The count increase must become visible at unlock time as no 411 * spte can be established without taking the mmu_lock and 412 * count is also read inside the mmu_lock critical section. 
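 *
 * A rough sketch of how the page fault side pairs with this (see
 * mmu_notifier_retry() in include/linux/kvm_host.h; details vary per
 * architecture):
 *
 *	mmu_seq = kvm->mmu_notifier_seq;
 *	smp_rmb();
 *	pfn = gfn_to_pfn(...);             (may sleep, no mmu_lock held)
 *	spin_lock(&kvm->mmu_lock);
 *	if (mmu_notifier_retry(kvm, mmu_seq))
 *		goto retry;                (an invalidation ran concurrently)
 *	... install the spte ...
 *	spin_unlock(&kvm->mmu_lock);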
413 */ 414 kvm->mmu_notifier_count++; 415 need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end); 416 need_tlb_flush |= kvm->tlbs_dirty; 417 /* we've to flush the tlb before the pages can be freed */ 418 if (need_tlb_flush) 419 kvm_flush_remote_tlbs(kvm); 420 421 spin_unlock(&kvm->mmu_lock); 422 423 ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start, 424 range->end, 425 mmu_notifier_range_blockable(range)); 426 427 srcu_read_unlock(&kvm->srcu, idx); 428 429 return ret; 430 } 431 432 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 433 const struct mmu_notifier_range *range) 434 { 435 struct kvm *kvm = mmu_notifier_to_kvm(mn); 436 437 spin_lock(&kvm->mmu_lock); 438 /* 439 * This sequence increase will notify the kvm page fault that 440 * the page that is going to be mapped in the spte could have 441 * been freed. 442 */ 443 kvm->mmu_notifier_seq++; 444 smp_wmb(); 445 /* 446 * The above sequence increase must be visible before the 447 * below count decrease, which is ensured by the smp_wmb above 448 * in conjunction with the smp_rmb in mmu_notifier_retry(). 449 */ 450 kvm->mmu_notifier_count--; 451 spin_unlock(&kvm->mmu_lock); 452 453 BUG_ON(kvm->mmu_notifier_count < 0); 454 } 455 456 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, 457 struct mm_struct *mm, 458 unsigned long start, 459 unsigned long end) 460 { 461 struct kvm *kvm = mmu_notifier_to_kvm(mn); 462 int young, idx; 463 464 idx = srcu_read_lock(&kvm->srcu); 465 spin_lock(&kvm->mmu_lock); 466 467 young = kvm_age_hva(kvm, start, end); 468 if (young) 469 kvm_flush_remote_tlbs(kvm); 470 471 spin_unlock(&kvm->mmu_lock); 472 srcu_read_unlock(&kvm->srcu, idx); 473 474 return young; 475 } 476 477 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, 478 struct mm_struct *mm, 479 unsigned long start, 480 unsigned long end) 481 { 482 struct kvm *kvm = mmu_notifier_to_kvm(mn); 483 int young, idx; 484 485 idx = srcu_read_lock(&kvm->srcu); 486 spin_lock(&kvm->mmu_lock); 487 /* 488 * Even though we do not flush TLB, this will still adversely 489 * affect performance on pre-Haswell Intel EPT, where there is 490 * no EPT Access Bit to clear so that we have to tear down EPT 491 * tables instead. If we find this unacceptable, we can always 492 * add a parameter to kvm_age_hva so that it effectively doesn't 493 * do anything on clear_young. 494 * 495 * Also note that currently we never issue secondary TLB flushes 496 * from clear_young, leaving this job up to the regular system 497 * cadence. If we find this inaccurate, we might come up with a 498 * more sophisticated heuristic later. 
499 */ 500 young = kvm_age_hva(kvm, start, end); 501 spin_unlock(&kvm->mmu_lock); 502 srcu_read_unlock(&kvm->srcu, idx); 503 504 return young; 505 } 506 507 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, 508 struct mm_struct *mm, 509 unsigned long address) 510 { 511 struct kvm *kvm = mmu_notifier_to_kvm(mn); 512 int young, idx; 513 514 idx = srcu_read_lock(&kvm->srcu); 515 spin_lock(&kvm->mmu_lock); 516 young = kvm_test_age_hva(kvm, address); 517 spin_unlock(&kvm->mmu_lock); 518 srcu_read_unlock(&kvm->srcu, idx); 519 520 return young; 521 } 522 523 static void kvm_mmu_notifier_release(struct mmu_notifier *mn, 524 struct mm_struct *mm) 525 { 526 struct kvm *kvm = mmu_notifier_to_kvm(mn); 527 int idx; 528 529 idx = srcu_read_lock(&kvm->srcu); 530 kvm_arch_flush_shadow_all(kvm); 531 srcu_read_unlock(&kvm->srcu, idx); 532 } 533 534 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { 535 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 536 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 537 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 538 .clear_young = kvm_mmu_notifier_clear_young, 539 .test_young = kvm_mmu_notifier_test_young, 540 .change_pte = kvm_mmu_notifier_change_pte, 541 .release = kvm_mmu_notifier_release, 542 }; 543 544 static int kvm_init_mmu_notifier(struct kvm *kvm) 545 { 546 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; 547 return mmu_notifier_register(&kvm->mmu_notifier, current->mm); 548 } 549 550 #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ 551 552 static int kvm_init_mmu_notifier(struct kvm *kvm) 553 { 554 return 0; 555 } 556 557 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ 558 559 static struct kvm_memslots *kvm_alloc_memslots(void) 560 { 561 int i; 562 struct kvm_memslots *slots; 563 564 slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT); 565 if (!slots) 566 return NULL; 567 568 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) 569 slots->id_to_index[i] = slots->memslots[i].id = i; 570 571 return slots; 572 } 573 574 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) 575 { 576 if (!memslot->dirty_bitmap) 577 return; 578 579 kvfree(memslot->dirty_bitmap); 580 memslot->dirty_bitmap = NULL; 581 } 582 583 /* 584 * Free any memory in @free but not in @dont. 
585 */ 586 static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, 587 struct kvm_memory_slot *dont) 588 { 589 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 590 kvm_destroy_dirty_bitmap(free); 591 592 kvm_arch_free_memslot(kvm, free, dont); 593 594 free->npages = 0; 595 } 596 597 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots) 598 { 599 struct kvm_memory_slot *memslot; 600 601 if (!slots) 602 return; 603 604 kvm_for_each_memslot(memslot, slots) 605 kvm_free_memslot(kvm, memslot, NULL); 606 607 kvfree(slots); 608 } 609 610 static void kvm_destroy_vm_debugfs(struct kvm *kvm) 611 { 612 int i; 613 614 if (!kvm->debugfs_dentry) 615 return; 616 617 debugfs_remove_recursive(kvm->debugfs_dentry); 618 619 if (kvm->debugfs_stat_data) { 620 for (i = 0; i < kvm_debugfs_num_entries; i++) 621 kfree(kvm->debugfs_stat_data[i]); 622 kfree(kvm->debugfs_stat_data); 623 } 624 } 625 626 static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) 627 { 628 char dir_name[ITOA_MAX_LEN * 2]; 629 struct kvm_stat_data *stat_data; 630 struct kvm_stats_debugfs_item *p; 631 632 if (!debugfs_initialized()) 633 return 0; 634 635 snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd); 636 kvm->debugfs_dentry = debugfs_create_dir(dir_name, kvm_debugfs_dir); 637 638 kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries, 639 sizeof(*kvm->debugfs_stat_data), 640 GFP_KERNEL_ACCOUNT); 641 if (!kvm->debugfs_stat_data) 642 return -ENOMEM; 643 644 for (p = debugfs_entries; p->name; p++) { 645 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT); 646 if (!stat_data) 647 return -ENOMEM; 648 649 stat_data->kvm = kvm; 650 stat_data->dbgfs_item = p; 651 kvm->debugfs_stat_data[p - debugfs_entries] = stat_data; 652 debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p), 653 kvm->debugfs_dentry, stat_data, 654 &stat_fops_per_vm); 655 } 656 return 0; 657 } 658 659 /* 660 * Called after the VM is otherwise initialized, but just before adding it to 661 * the vm_list. 662 */ 663 int __weak kvm_arch_post_init_vm(struct kvm *kvm) 664 { 665 return 0; 666 } 667 668 /* 669 * Called just after removing the VM from the vm_list, but before doing any 670 * other destruction. 671 */ 672 void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm) 673 { 674 } 675 676 static struct kvm *kvm_create_vm(unsigned long type) 677 { 678 struct kvm *kvm = kvm_arch_alloc_vm(); 679 int r = -ENOMEM; 680 int i; 681 682 if (!kvm) 683 return ERR_PTR(-ENOMEM); 684 685 spin_lock_init(&kvm->mmu_lock); 686 mmgrab(current->mm); 687 kvm->mm = current->mm; 688 kvm_eventfd_init(kvm); 689 mutex_init(&kvm->lock); 690 mutex_init(&kvm->irq_lock); 691 mutex_init(&kvm->slots_lock); 692 INIT_LIST_HEAD(&kvm->devices); 693 694 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); 695 696 if (init_srcu_struct(&kvm->srcu)) 697 goto out_err_no_srcu; 698 if (init_srcu_struct(&kvm->irq_srcu)) 699 goto out_err_no_irq_srcu; 700 701 refcount_set(&kvm->users_count, 1); 702 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 703 struct kvm_memslots *slots = kvm_alloc_memslots(); 704 705 if (!slots) 706 goto out_err_no_arch_destroy_vm; 707 /* Generations must be different for each address space. 
*/ 708 slots->generation = i; 709 rcu_assign_pointer(kvm->memslots[i], slots); 710 } 711 712 for (i = 0; i < KVM_NR_BUSES; i++) { 713 rcu_assign_pointer(kvm->buses[i], 714 kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT)); 715 if (!kvm->buses[i]) 716 goto out_err_no_arch_destroy_vm; 717 } 718 719 r = kvm_arch_init_vm(kvm, type); 720 if (r) 721 goto out_err_no_arch_destroy_vm; 722 723 r = hardware_enable_all(); 724 if (r) 725 goto out_err_no_disable; 726 727 #ifdef CONFIG_HAVE_KVM_IRQFD 728 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); 729 #endif 730 731 r = kvm_init_mmu_notifier(kvm); 732 if (r) 733 goto out_err_no_mmu_notifier; 734 735 r = kvm_arch_post_init_vm(kvm); 736 if (r) 737 goto out_err; 738 739 mutex_lock(&kvm_lock); 740 list_add(&kvm->vm_list, &vm_list); 741 mutex_unlock(&kvm_lock); 742 743 preempt_notifier_inc(); 744 745 return kvm; 746 747 out_err: 748 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 749 if (kvm->mmu_notifier.ops) 750 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); 751 #endif 752 out_err_no_mmu_notifier: 753 hardware_disable_all(); 754 out_err_no_disable: 755 kvm_arch_destroy_vm(kvm); 756 out_err_no_arch_destroy_vm: 757 WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count)); 758 for (i = 0; i < KVM_NR_BUSES; i++) 759 kfree(kvm_get_bus(kvm, i)); 760 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 761 kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); 762 cleanup_srcu_struct(&kvm->irq_srcu); 763 out_err_no_irq_srcu: 764 cleanup_srcu_struct(&kvm->srcu); 765 out_err_no_srcu: 766 kvm_arch_free_vm(kvm); 767 mmdrop(current->mm); 768 return ERR_PTR(r); 769 } 770 771 static void kvm_destroy_devices(struct kvm *kvm) 772 { 773 struct kvm_device *dev, *tmp; 774 775 /* 776 * We do not need to take the kvm->lock here, because nobody else 777 * has a reference to the struct kvm at this point and therefore 778 * cannot access the devices list anyhow. 
779 */ 780 list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) { 781 list_del(&dev->vm_node); 782 dev->ops->destroy(dev); 783 } 784 } 785 786 static void kvm_destroy_vm(struct kvm *kvm) 787 { 788 int i; 789 struct mm_struct *mm = kvm->mm; 790 791 kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); 792 kvm_destroy_vm_debugfs(kvm); 793 kvm_arch_sync_events(kvm); 794 mutex_lock(&kvm_lock); 795 list_del(&kvm->vm_list); 796 mutex_unlock(&kvm_lock); 797 kvm_arch_pre_destroy_vm(kvm); 798 799 kvm_free_irq_routing(kvm); 800 for (i = 0; i < KVM_NR_BUSES; i++) { 801 struct kvm_io_bus *bus = kvm_get_bus(kvm, i); 802 803 if (bus) 804 kvm_io_bus_destroy(bus); 805 kvm->buses[i] = NULL; 806 } 807 kvm_coalesced_mmio_free(kvm); 808 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 809 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 810 #else 811 kvm_arch_flush_shadow_all(kvm); 812 #endif 813 kvm_arch_destroy_vm(kvm); 814 kvm_destroy_devices(kvm); 815 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 816 kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); 817 cleanup_srcu_struct(&kvm->irq_srcu); 818 cleanup_srcu_struct(&kvm->srcu); 819 kvm_arch_free_vm(kvm); 820 preempt_notifier_dec(); 821 hardware_disable_all(); 822 mmdrop(mm); 823 } 824 825 void kvm_get_kvm(struct kvm *kvm) 826 { 827 refcount_inc(&kvm->users_count); 828 } 829 EXPORT_SYMBOL_GPL(kvm_get_kvm); 830 831 void kvm_put_kvm(struct kvm *kvm) 832 { 833 if (refcount_dec_and_test(&kvm->users_count)) 834 kvm_destroy_vm(kvm); 835 } 836 EXPORT_SYMBOL_GPL(kvm_put_kvm); 837 838 /* 839 * Used to put a reference that was taken on behalf of an object associated 840 * with a user-visible file descriptor, e.g. a vcpu or device, if installation 841 * of the new file descriptor fails and the reference cannot be transferred to 842 * its final owner. In such cases, the caller is still actively using @kvm and 843 * will fail miserably if the refcount unexpectedly hits zero. 844 */ 845 void kvm_put_kvm_no_destroy(struct kvm *kvm) 846 { 847 WARN_ON(refcount_dec_and_test(&kvm->users_count)); 848 } 849 EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy); 850 851 static int kvm_vm_release(struct inode *inode, struct file *filp) 852 { 853 struct kvm *kvm = filp->private_data; 854 855 kvm_irqfd_release(kvm); 856 857 kvm_put_kvm(kvm); 858 return 0; 859 } 860 861 /* 862 * Allocation size is twice as large as the actual dirty bitmap size. 863 * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed. 864 */ 865 static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) 866 { 867 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); 868 869 memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT); 870 if (!memslot->dirty_bitmap) 871 return -ENOMEM; 872 873 return 0; 874 } 875 876 /* 877 * Insert memslot and re-sort memslots based on their GFN, 878 * so binary search could be used to lookup GFN. 879 * Sorting algorithm takes advantage of having initially 880 * sorted array and known changed memslot position. 
881 */ 882 static void update_memslots(struct kvm_memslots *slots, 883 struct kvm_memory_slot *new, 884 enum kvm_mr_change change) 885 { 886 int id = new->id; 887 int i = slots->id_to_index[id]; 888 struct kvm_memory_slot *mslots = slots->memslots; 889 890 WARN_ON(mslots[i].id != id); 891 switch (change) { 892 case KVM_MR_CREATE: 893 slots->used_slots++; 894 WARN_ON(mslots[i].npages || !new->npages); 895 break; 896 case KVM_MR_DELETE: 897 slots->used_slots--; 898 WARN_ON(new->npages || !mslots[i].npages); 899 break; 900 default: 901 break; 902 } 903 904 while (i < KVM_MEM_SLOTS_NUM - 1 && 905 new->base_gfn <= mslots[i + 1].base_gfn) { 906 if (!mslots[i + 1].npages) 907 break; 908 mslots[i] = mslots[i + 1]; 909 slots->id_to_index[mslots[i].id] = i; 910 i++; 911 } 912 913 /* 914 * The ">=" is needed when creating a slot with base_gfn == 0, 915 * so that it moves before all those with base_gfn == npages == 0. 916 * 917 * On the other hand, if new->npages is zero, the above loop has 918 * already left i pointing to the beginning of the empty part of 919 * mslots, and the ">=" would move the hole backwards in this 920 * case---which is wrong. So skip the loop when deleting a slot. 921 */ 922 if (new->npages) { 923 while (i > 0 && 924 new->base_gfn >= mslots[i - 1].base_gfn) { 925 mslots[i] = mslots[i - 1]; 926 slots->id_to_index[mslots[i].id] = i; 927 i--; 928 } 929 } else 930 WARN_ON_ONCE(i != slots->used_slots); 931 932 mslots[i] = *new; 933 slots->id_to_index[mslots[i].id] = i; 934 } 935 936 static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem) 937 { 938 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; 939 940 #ifdef __KVM_HAVE_READONLY_MEM 941 valid_flags |= KVM_MEM_READONLY; 942 #endif 943 944 if (mem->flags & ~valid_flags) 945 return -EINVAL; 946 947 return 0; 948 } 949 950 static struct kvm_memslots *install_new_memslots(struct kvm *kvm, 951 int as_id, struct kvm_memslots *slots) 952 { 953 struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id); 954 u64 gen = old_memslots->generation; 955 956 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); 957 slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; 958 959 rcu_assign_pointer(kvm->memslots[as_id], slots); 960 synchronize_srcu_expedited(&kvm->srcu); 961 962 /* 963 * Increment the new memslot generation a second time, dropping the 964 * update in-progress flag and incrementing the generation based on 965 * the number of address spaces. This provides a unique and easily 966 * identifiable generation number while the memslots are in flux. 967 */ 968 gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; 969 970 /* 971 * Generations must be unique even across address spaces. We do not need 972 * a global counter for that, instead the generation space is evenly split 973 * across address spaces. For example, with two address spaces, address 974 * space 0 will use generations 0, 2, 4, ... while address space 1 will 975 * use generations 1, 3, 5, ... 976 */ 977 gen += KVM_ADDRESS_SPACE_NUM; 978 979 kvm_arch_memslots_updated(kvm, gen); 980 981 slots->generation = gen; 982 983 return old_memslots; 984 } 985 986 /* 987 * Allocate some memory and give it an address in the guest physical address 988 * space. 989 * 990 * Discontiguous memory is allowed, mostly for framebuffers. 991 * 992 * Must be called holding kvm->slots_lock for write. 
993 */ 994 int __kvm_set_memory_region(struct kvm *kvm, 995 const struct kvm_userspace_memory_region *mem) 996 { 997 int r; 998 gfn_t base_gfn; 999 unsigned long npages; 1000 struct kvm_memory_slot *slot; 1001 struct kvm_memory_slot old, new; 1002 struct kvm_memslots *slots = NULL, *old_memslots; 1003 int as_id, id; 1004 enum kvm_mr_change change; 1005 1006 r = check_memory_region_flags(mem); 1007 if (r) 1008 goto out; 1009 1010 r = -EINVAL; 1011 as_id = mem->slot >> 16; 1012 id = (u16)mem->slot; 1013 1014 /* General sanity checks */ 1015 if (mem->memory_size & (PAGE_SIZE - 1)) 1016 goto out; 1017 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 1018 goto out; 1019 /* We can read the guest memory with __xxx_user() later on. */ 1020 if ((id < KVM_USER_MEM_SLOTS) && 1021 ((mem->userspace_addr & (PAGE_SIZE - 1)) || 1022 !access_ok((void __user *)(unsigned long)mem->userspace_addr, 1023 mem->memory_size))) 1024 goto out; 1025 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM) 1026 goto out; 1027 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 1028 goto out; 1029 1030 slot = id_to_memslot(__kvm_memslots(kvm, as_id), id); 1031 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 1032 npages = mem->memory_size >> PAGE_SHIFT; 1033 1034 if (npages > KVM_MEM_MAX_NR_PAGES) 1035 goto out; 1036 1037 new = old = *slot; 1038 1039 new.id = id; 1040 new.base_gfn = base_gfn; 1041 new.npages = npages; 1042 new.flags = mem->flags; 1043 1044 if (npages) { 1045 if (!old.npages) 1046 change = KVM_MR_CREATE; 1047 else { /* Modify an existing slot. */ 1048 if ((mem->userspace_addr != old.userspace_addr) || 1049 (npages != old.npages) || 1050 ((new.flags ^ old.flags) & KVM_MEM_READONLY)) 1051 goto out; 1052 1053 if (base_gfn != old.base_gfn) 1054 change = KVM_MR_MOVE; 1055 else if (new.flags != old.flags) 1056 change = KVM_MR_FLAGS_ONLY; 1057 else { /* Nothing to change. */ 1058 r = 0; 1059 goto out; 1060 } 1061 } 1062 } else { 1063 if (!old.npages) 1064 goto out; 1065 1066 change = KVM_MR_DELETE; 1067 new.base_gfn = 0; 1068 new.flags = 0; 1069 } 1070 1071 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { 1072 /* Check for overlaps */ 1073 r = -EEXIST; 1074 kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) { 1075 if (slot->id == id) 1076 continue; 1077 if (!((base_gfn + npages <= slot->base_gfn) || 1078 (base_gfn >= slot->base_gfn + slot->npages))) 1079 goto out; 1080 } 1081 } 1082 1083 /* Free page dirty bitmap if unneeded */ 1084 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 1085 new.dirty_bitmap = NULL; 1086 1087 r = -ENOMEM; 1088 if (change == KVM_MR_CREATE) { 1089 new.userspace_addr = mem->userspace_addr; 1090 1091 if (kvm_arch_create_memslot(kvm, &new, npages)) 1092 goto out_free; 1093 } 1094 1095 /* Allocate page dirty bitmap if needed */ 1096 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 1097 if (kvm_create_dirty_bitmap(&new) < 0) 1098 goto out_free; 1099 } 1100 1101 slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT); 1102 if (!slots) 1103 goto out_free; 1104 memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots)); 1105 1106 if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) { 1107 slot = id_to_memslot(slots, id); 1108 slot->flags |= KVM_MEMSLOT_INVALID; 1109 1110 old_memslots = install_new_memslots(kvm, as_id, slots); 1111 1112 /* From this point no new shadow pages pointing to a deleted, 1113 * or moved, memslot will be created. 
1114 * 1115 * validation of sp->gfn happens in: 1116 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) 1117 * - kvm_is_visible_gfn (mmu_check_root) 1118 */ 1119 kvm_arch_flush_shadow_memslot(kvm, slot); 1120 1121 /* 1122 * We can re-use the old_memslots from above, the only difference 1123 * from the currently installed memslots is the invalid flag. This 1124 * will get overwritten by update_memslots anyway. 1125 */ 1126 slots = old_memslots; 1127 } 1128 1129 r = kvm_arch_prepare_memory_region(kvm, &new, mem, change); 1130 if (r) 1131 goto out_slots; 1132 1133 /* actual memory is freed via old in kvm_free_memslot below */ 1134 if (change == KVM_MR_DELETE) { 1135 new.dirty_bitmap = NULL; 1136 memset(&new.arch, 0, sizeof(new.arch)); 1137 } 1138 1139 update_memslots(slots, &new, change); 1140 old_memslots = install_new_memslots(kvm, as_id, slots); 1141 1142 kvm_arch_commit_memory_region(kvm, mem, &old, &new, change); 1143 1144 kvm_free_memslot(kvm, &old, &new); 1145 kvfree(old_memslots); 1146 return 0; 1147 1148 out_slots: 1149 kvfree(slots); 1150 out_free: 1151 kvm_free_memslot(kvm, &new, &old); 1152 out: 1153 return r; 1154 } 1155 EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 1156 1157 int kvm_set_memory_region(struct kvm *kvm, 1158 const struct kvm_userspace_memory_region *mem) 1159 { 1160 int r; 1161 1162 mutex_lock(&kvm->slots_lock); 1163 r = __kvm_set_memory_region(kvm, mem); 1164 mutex_unlock(&kvm->slots_lock); 1165 return r; 1166 } 1167 EXPORT_SYMBOL_GPL(kvm_set_memory_region); 1168 1169 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 1170 struct kvm_userspace_memory_region *mem) 1171 { 1172 if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) 1173 return -EINVAL; 1174 1175 return kvm_set_memory_region(kvm, mem); 1176 } 1177 1178 int kvm_get_dirty_log(struct kvm *kvm, 1179 struct kvm_dirty_log *log, int *is_dirty) 1180 { 1181 struct kvm_memslots *slots; 1182 struct kvm_memory_slot *memslot; 1183 int i, as_id, id; 1184 unsigned long n; 1185 unsigned long any = 0; 1186 1187 as_id = log->slot >> 16; 1188 id = (u16)log->slot; 1189 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1190 return -EINVAL; 1191 1192 slots = __kvm_memslots(kvm, as_id); 1193 memslot = id_to_memslot(slots, id); 1194 if (!memslot->dirty_bitmap) 1195 return -ENOENT; 1196 1197 n = kvm_dirty_bitmap_bytes(memslot); 1198 1199 for (i = 0; !any && i < n/sizeof(long); ++i) 1200 any = memslot->dirty_bitmap[i]; 1201 1202 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 1203 return -EFAULT; 1204 1205 if (any) 1206 *is_dirty = 1; 1207 return 0; 1208 } 1209 EXPORT_SYMBOL_GPL(kvm_get_dirty_log); 1210 1211 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 1212 /** 1213 * kvm_get_dirty_log_protect - get a snapshot of dirty pages 1214 * and reenable dirty page tracking for the corresponding pages. 1215 * @kvm: pointer to kvm instance 1216 * @log: slot id and address to which we copy the log 1217 * @flush: true if TLB flush is needed by caller 1218 * 1219 * We need to keep it in mind that VCPU threads can write to the bitmap 1220 * concurrently. So, to avoid losing track of dirty pages we keep the 1221 * following order: 1222 * 1223 * 1. Take a snapshot of the bit and clear it if needed. 1224 * 2. Write protect the corresponding page. 1225 * 3. Copy the snapshot to the userspace. 1226 * 4. Upon return caller flushes TLB's if needed. 1227 * 1228 * Between 2 and 4, the guest may write to the page using the remaining TLB 1229 * entry. 
This is not a problem because the page is reported dirty using 1230 * the snapshot taken before and step 4 ensures that writes done after 1231 * exiting to userspace will be logged for the next call. 1232 * 1233 */ 1234 int kvm_get_dirty_log_protect(struct kvm *kvm, 1235 struct kvm_dirty_log *log, bool *flush) 1236 { 1237 struct kvm_memslots *slots; 1238 struct kvm_memory_slot *memslot; 1239 int i, as_id, id; 1240 unsigned long n; 1241 unsigned long *dirty_bitmap; 1242 unsigned long *dirty_bitmap_buffer; 1243 1244 as_id = log->slot >> 16; 1245 id = (u16)log->slot; 1246 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1247 return -EINVAL; 1248 1249 slots = __kvm_memslots(kvm, as_id); 1250 memslot = id_to_memslot(slots, id); 1251 1252 dirty_bitmap = memslot->dirty_bitmap; 1253 if (!dirty_bitmap) 1254 return -ENOENT; 1255 1256 n = kvm_dirty_bitmap_bytes(memslot); 1257 *flush = false; 1258 if (kvm->manual_dirty_log_protect) { 1259 /* 1260 * Unlike kvm_get_dirty_log, we always return false in *flush, 1261 * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There 1262 * is some code duplication between this function and 1263 * kvm_get_dirty_log, but hopefully all architecture 1264 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log 1265 * can be eliminated. 1266 */ 1267 dirty_bitmap_buffer = dirty_bitmap; 1268 } else { 1269 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); 1270 memset(dirty_bitmap_buffer, 0, n); 1271 1272 spin_lock(&kvm->mmu_lock); 1273 for (i = 0; i < n / sizeof(long); i++) { 1274 unsigned long mask; 1275 gfn_t offset; 1276 1277 if (!dirty_bitmap[i]) 1278 continue; 1279 1280 *flush = true; 1281 mask = xchg(&dirty_bitmap[i], 0); 1282 dirty_bitmap_buffer[i] = mask; 1283 1284 offset = i * BITS_PER_LONG; 1285 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, 1286 offset, mask); 1287 } 1288 spin_unlock(&kvm->mmu_lock); 1289 } 1290 1291 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) 1292 return -EFAULT; 1293 return 0; 1294 } 1295 EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); 1296 1297 /** 1298 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap 1299 * and reenable dirty page tracking for the corresponding pages. 
1300 * @kvm: pointer to kvm instance 1301 * @log: slot id and address from which to fetch the bitmap of dirty pages 1302 * @flush: true if TLB flush is needed by caller 1303 */ 1304 int kvm_clear_dirty_log_protect(struct kvm *kvm, 1305 struct kvm_clear_dirty_log *log, bool *flush) 1306 { 1307 struct kvm_memslots *slots; 1308 struct kvm_memory_slot *memslot; 1309 int as_id, id; 1310 gfn_t offset; 1311 unsigned long i, n; 1312 unsigned long *dirty_bitmap; 1313 unsigned long *dirty_bitmap_buffer; 1314 1315 as_id = log->slot >> 16; 1316 id = (u16)log->slot; 1317 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1318 return -EINVAL; 1319 1320 if (log->first_page & 63) 1321 return -EINVAL; 1322 1323 slots = __kvm_memslots(kvm, as_id); 1324 memslot = id_to_memslot(slots, id); 1325 1326 dirty_bitmap = memslot->dirty_bitmap; 1327 if (!dirty_bitmap) 1328 return -ENOENT; 1329 1330 n = ALIGN(log->num_pages, BITS_PER_LONG) / 8; 1331 1332 if (log->first_page > memslot->npages || 1333 log->num_pages > memslot->npages - log->first_page || 1334 (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63))) 1335 return -EINVAL; 1336 1337 *flush = false; 1338 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); 1339 if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n)) 1340 return -EFAULT; 1341 1342 spin_lock(&kvm->mmu_lock); 1343 for (offset = log->first_page, i = offset / BITS_PER_LONG, 1344 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--; 1345 i++, offset += BITS_PER_LONG) { 1346 unsigned long mask = *dirty_bitmap_buffer++; 1347 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i]; 1348 if (!mask) 1349 continue; 1350 1351 mask &= atomic_long_fetch_andnot(mask, p); 1352 1353 /* 1354 * mask contains the bits that really have been cleared. This 1355 * never includes any bits beyond the length of the memslot (if 1356 * the length is not aligned to 64 pages), therefore it is not 1357 * a problem if userspace sets them in log->dirty_bitmap. 
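                 *
                 * Worked example: if the in-memory word holds 0b0101 and
                 * userspace asks to clear 0b0111, atomic_long_fetch_andnot()
                 * zeroes the word and returns the old value 0b0101;
                 * "mask &= old" then leaves 0b0101, so write protection below
                 * is re-enabled only for pages that really were dirty.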
                 */
                if (mask) {
                        *flush = true;
                        kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
                                                                offset, mask);
                }
        }
        spin_unlock(&kvm->mmu_lock);

        return 0;
}
EXPORT_SYMBOL_GPL(kvm_clear_dirty_log_protect);
#endif

bool kvm_largepages_enabled(void)
{
        return largepages_enabled;
}

void kvm_disable_largepages(void)
{
        largepages_enabled = false;
}
EXPORT_SYMBOL_GPL(kvm_disable_largepages);

struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
        return __gfn_to_memslot(kvm_memslots(kvm), gfn);
}
EXPORT_SYMBOL_GPL(gfn_to_memslot);

struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
{
        return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
}

bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
        struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);

        if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS ||
            memslot->flags & KVM_MEMSLOT_INVALID)
                return false;

        return true;
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);

unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
{
        struct vm_area_struct *vma;
        unsigned long addr, size;

        size = PAGE_SIZE;

        addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
        if (kvm_is_error_hva(addr))
                return PAGE_SIZE;

        down_read(&current->mm->mmap_sem);
        vma = find_vma(current->mm, addr);
        if (!vma)
                goto out;

        size = vma_kernel_pagesize(vma);

out:
        up_read(&current->mm->mmap_sem);

        return size;
}

static bool memslot_is_readonly(struct kvm_memory_slot *slot)
{
        return slot->flags & KVM_MEM_READONLY;
}

static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
                                       gfn_t *nr_pages, bool write)
{
        if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
                return KVM_HVA_ERR_BAD;

        if (memslot_is_readonly(slot) && write)
                return KVM_HVA_ERR_RO_BAD;

        if (nr_pages)
                *nr_pages = slot->npages - (gfn - slot->base_gfn);

        return __gfn_to_hva_memslot(slot, gfn);
}

static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
                                     gfn_t *nr_pages)
{
        return __gfn_to_hva_many(slot, gfn, nr_pages, true);
}

unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
                                 gfn_t gfn)
{
        return gfn_to_hva_many(slot, gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);

unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
        return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);

unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
{
        return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);

/*
 * Return the hva of a @gfn and the R/W attribute if possible.
1477 * 1478 * @slot: the kvm_memory_slot which contains @gfn 1479 * @gfn: the gfn to be translated 1480 * @writable: used to return the read/write attribute of the @slot if the hva 1481 * is valid and @writable is not NULL 1482 */ 1483 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, 1484 gfn_t gfn, bool *writable) 1485 { 1486 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false); 1487 1488 if (!kvm_is_error_hva(hva) && writable) 1489 *writable = !memslot_is_readonly(slot); 1490 1491 return hva; 1492 } 1493 1494 unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) 1495 { 1496 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 1497 1498 return gfn_to_hva_memslot_prot(slot, gfn, writable); 1499 } 1500 1501 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable) 1502 { 1503 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1504 1505 return gfn_to_hva_memslot_prot(slot, gfn, writable); 1506 } 1507 1508 static inline int check_user_page_hwpoison(unsigned long addr) 1509 { 1510 int rc, flags = FOLL_HWPOISON | FOLL_WRITE; 1511 1512 rc = get_user_pages(addr, 1, flags, NULL, NULL); 1513 return rc == -EHWPOISON; 1514 } 1515 1516 /* 1517 * The fast path to get the writable pfn which will be stored in @pfn, 1518 * true indicates success, otherwise false is returned. It's also the 1519 * only part that runs if we can in atomic context. 1520 */ 1521 static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, 1522 bool *writable, kvm_pfn_t *pfn) 1523 { 1524 struct page *page[1]; 1525 int npages; 1526 1527 /* 1528 * Fast pin a writable pfn only if it is a write fault request 1529 * or the caller allows to map a writable pfn for a read fault 1530 * request. 1531 */ 1532 if (!(write_fault || writable)) 1533 return false; 1534 1535 npages = __get_user_pages_fast(addr, 1, 1, page); 1536 if (npages == 1) { 1537 *pfn = page_to_pfn(page[0]); 1538 1539 if (writable) 1540 *writable = true; 1541 return true; 1542 } 1543 1544 return false; 1545 } 1546 1547 /* 1548 * The slow path to get the pfn of the specified host virtual address, 1549 * 1 indicates success, -errno is returned if error is detected. 
1550 */ 1551 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, 1552 bool *writable, kvm_pfn_t *pfn) 1553 { 1554 unsigned int flags = FOLL_HWPOISON; 1555 struct page *page; 1556 int npages = 0; 1557 1558 might_sleep(); 1559 1560 if (writable) 1561 *writable = write_fault; 1562 1563 if (write_fault) 1564 flags |= FOLL_WRITE; 1565 if (async) 1566 flags |= FOLL_NOWAIT; 1567 1568 npages = get_user_pages_unlocked(addr, 1, &page, flags); 1569 if (npages != 1) 1570 return npages; 1571 1572 /* map read fault as writable if possible */ 1573 if (unlikely(!write_fault) && writable) { 1574 struct page *wpage; 1575 1576 if (__get_user_pages_fast(addr, 1, 1, &wpage) == 1) { 1577 *writable = true; 1578 put_page(page); 1579 page = wpage; 1580 } 1581 } 1582 *pfn = page_to_pfn(page); 1583 return npages; 1584 } 1585 1586 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) 1587 { 1588 if (unlikely(!(vma->vm_flags & VM_READ))) 1589 return false; 1590 1591 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE)))) 1592 return false; 1593 1594 return true; 1595 } 1596 1597 static int hva_to_pfn_remapped(struct vm_area_struct *vma, 1598 unsigned long addr, bool *async, 1599 bool write_fault, bool *writable, 1600 kvm_pfn_t *p_pfn) 1601 { 1602 unsigned long pfn; 1603 int r; 1604 1605 r = follow_pfn(vma, addr, &pfn); 1606 if (r) { 1607 /* 1608 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does 1609 * not call the fault handler, so do it here. 1610 */ 1611 bool unlocked = false; 1612 r = fixup_user_fault(current, current->mm, addr, 1613 (write_fault ? FAULT_FLAG_WRITE : 0), 1614 &unlocked); 1615 if (unlocked) 1616 return -EAGAIN; 1617 if (r) 1618 return r; 1619 1620 r = follow_pfn(vma, addr, &pfn); 1621 if (r) 1622 return r; 1623 1624 } 1625 1626 if (writable) 1627 *writable = true; 1628 1629 /* 1630 * Get a reference here because callers of *hva_to_pfn* and 1631 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the 1632 * returned pfn. This is only needed if the VMA has VM_MIXEDMAP 1633 * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will 1634 * simply do nothing for reserved pfns. 1635 * 1636 * Whoever called remap_pfn_range is also going to call e.g. 1637 * unmap_mapping_range before the underlying pages are freed, 1638 * causing a call to our MMU notifier. 1639 */ 1640 kvm_get_pfn(pfn); 1641 1642 *p_pfn = pfn; 1643 return 0; 1644 } 1645 1646 /* 1647 * Pin guest page in memory and return its pfn. 1648 * @addr: host virtual address which maps memory to the guest 1649 * @atomic: whether this function can sleep 1650 * @async: whether this function need to wait IO complete if the 1651 * host page is not in the memory 1652 * @write_fault: whether we should get a writable host page 1653 * @writable: whether it allows to map a writable host page for !@write_fault 1654 * 1655 * The function will map a writable host page for these two cases: 1656 * 1): @write_fault = true 1657 * 2): @write_fault = false && @writable, @writable will tell the caller 1658 * whether the mapping is writable. 
 */
static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
                            bool write_fault, bool *writable)
{
        struct vm_area_struct *vma;
        kvm_pfn_t pfn = 0;
        int npages, r;

        /* we can do it either atomically or asynchronously, not both */
        BUG_ON(atomic && async);

        if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
                return pfn;

        if (atomic)
                return KVM_PFN_ERR_FAULT;

        npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
        if (npages == 1)
                return pfn;

        down_read(&current->mm->mmap_sem);
        if (npages == -EHWPOISON ||
            (!async && check_user_page_hwpoison(addr))) {
                pfn = KVM_PFN_ERR_HWPOISON;
                goto exit;
        }

retry:
        vma = find_vma_intersection(current->mm, addr, addr + 1);

        if (vma == NULL)
                pfn = KVM_PFN_ERR_FAULT;
        else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
                r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
                if (r == -EAGAIN)
                        goto retry;
                if (r < 0)
                        pfn = KVM_PFN_ERR_FAULT;
        } else {
                if (async && vma_is_valid(vma, write_fault))
                        *async = true;
                pfn = KVM_PFN_ERR_FAULT;
        }
exit:
        up_read(&current->mm->mmap_sem);
        return pfn;
}

kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
                               bool atomic, bool *async, bool write_fault,
                               bool *writable)
{
        unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);

        if (addr == KVM_HVA_ERR_RO_BAD) {
                if (writable)
                        *writable = false;
                return KVM_PFN_ERR_RO_FAULT;
        }

        if (kvm_is_error_hva(addr)) {
                if (writable)
                        *writable = false;
                return KVM_PFN_NOSLOT;
        }

        /* Do not map writable pfn in the readonly memslot.
*/ 1727 if (writable && memslot_is_readonly(slot)) { 1728 *writable = false; 1729 writable = NULL; 1730 } 1731 1732 return hva_to_pfn(addr, atomic, async, write_fault, 1733 writable); 1734 } 1735 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); 1736 1737 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, 1738 bool *writable) 1739 { 1740 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, 1741 write_fault, writable); 1742 } 1743 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 1744 1745 kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) 1746 { 1747 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); 1748 } 1749 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); 1750 1751 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) 1752 { 1753 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); 1754 } 1755 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); 1756 1757 kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) 1758 { 1759 return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn); 1760 } 1761 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); 1762 1763 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn) 1764 { 1765 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 1766 } 1767 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic); 1768 1769 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 1770 { 1771 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); 1772 } 1773 EXPORT_SYMBOL_GPL(gfn_to_pfn); 1774 1775 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) 1776 { 1777 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 1778 } 1779 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn); 1780 1781 int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 1782 struct page **pages, int nr_pages) 1783 { 1784 unsigned long addr; 1785 gfn_t entry = 0; 1786 1787 addr = gfn_to_hva_many(slot, gfn, &entry); 1788 if (kvm_is_error_hva(addr)) 1789 return -1; 1790 1791 if (entry < nr_pages) 1792 return 0; 1793 1794 return __get_user_pages_fast(addr, nr_pages, 1, pages); 1795 } 1796 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 1797 1798 static struct page *kvm_pfn_to_page(kvm_pfn_t pfn) 1799 { 1800 if (is_error_noslot_pfn(pfn)) 1801 return KVM_ERR_PTR_BAD_PAGE; 1802 1803 if (kvm_is_reserved_pfn(pfn)) { 1804 WARN_ON(1); 1805 return KVM_ERR_PTR_BAD_PAGE; 1806 } 1807 1808 return pfn_to_page(pfn); 1809 } 1810 1811 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1812 { 1813 kvm_pfn_t pfn; 1814 1815 pfn = gfn_to_pfn(kvm, gfn); 1816 1817 return kvm_pfn_to_page(pfn); 1818 } 1819 EXPORT_SYMBOL_GPL(gfn_to_page); 1820 1821 void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache) 1822 { 1823 if (pfn == 0) 1824 return; 1825 1826 if (cache) 1827 cache->pfn = cache->gfn = 0; 1828 1829 if (dirty) 1830 kvm_release_pfn_dirty(pfn); 1831 else 1832 kvm_release_pfn_clean(pfn); 1833 } 1834 1835 static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn, 1836 struct gfn_to_pfn_cache *cache, u64 gen) 1837 { 1838 kvm_release_pfn(cache->pfn, cache->dirty, cache); 1839 1840 cache->pfn = gfn_to_pfn_memslot(slot, gfn); 1841 cache->gfn = gfn; 1842 cache->dirty = false; 1843 cache->generation = gen; 1844 } 1845 1846 static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn, 1847 struct kvm_host_map *map, 1848 struct gfn_to_pfn_cache *cache, 1849 bool atomic) 1850 { 1851 kvm_pfn_t pfn; 1852 void *hva = NULL; 1853 struct page *page = KVM_UNMAPPED_PAGE; 1854 struct 
kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn); 1855 u64 gen = slots->generation; 1856 1857 if (!map) 1858 return -EINVAL; 1859 1860 if (cache) { 1861 if (!cache->pfn || cache->gfn != gfn || 1862 cache->generation != gen) { 1863 if (atomic) 1864 return -EAGAIN; 1865 kvm_cache_gfn_to_pfn(slot, gfn, cache, gen); 1866 } 1867 pfn = cache->pfn; 1868 } else { 1869 if (atomic) 1870 return -EAGAIN; 1871 pfn = gfn_to_pfn_memslot(slot, gfn); 1872 } 1873 if (is_error_noslot_pfn(pfn)) 1874 return -EINVAL; 1875 1876 if (pfn_valid(pfn)) { 1877 page = pfn_to_page(pfn); 1878 if (atomic) 1879 hva = kmap_atomic(page); 1880 else 1881 hva = kmap(page); 1882 #ifdef CONFIG_HAS_IOMEM 1883 } else if (!atomic) { 1884 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB); 1885 } else { 1886 return -EINVAL; 1887 #endif 1888 } 1889 1890 if (!hva) 1891 return -EFAULT; 1892 1893 map->page = page; 1894 map->hva = hva; 1895 map->pfn = pfn; 1896 map->gfn = gfn; 1897 1898 return 0; 1899 } 1900 1901 int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, 1902 struct gfn_to_pfn_cache *cache, bool atomic) 1903 { 1904 return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map, 1905 cache, atomic); 1906 } 1907 EXPORT_SYMBOL_GPL(kvm_map_gfn); 1908 1909 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) 1910 { 1911 return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map, 1912 NULL, false); 1913 } 1914 EXPORT_SYMBOL_GPL(kvm_vcpu_map); 1915 1916 static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot, 1917 struct kvm_host_map *map, 1918 struct gfn_to_pfn_cache *cache, 1919 bool dirty, bool atomic) 1920 { 1921 if (!map) 1922 return; 1923 1924 if (!map->hva) 1925 return; 1926 1927 if (map->page != KVM_UNMAPPED_PAGE) { 1928 if (atomic) 1929 kunmap_atomic(map->hva); 1930 else 1931 kunmap(map->page); 1932 } 1933 #ifdef CONFIG_HAS_IOMEM 1934 else if (!atomic) 1935 memunmap(map->hva); 1936 else 1937 WARN_ONCE(1, "Unexpected unmapping in atomic context"); 1938 #endif 1939 1940 if (dirty) 1941 mark_page_dirty_in_slot(memslot, map->gfn); 1942 1943 if (cache) 1944 cache->dirty |= dirty; 1945 else 1946 kvm_release_pfn(map->pfn, dirty, NULL); 1947 1948 map->hva = NULL; 1949 map->page = NULL; 1950 } 1951 1952 int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, 1953 struct gfn_to_pfn_cache *cache, bool dirty, bool atomic) 1954 { 1955 __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map, 1956 cache, dirty, atomic); 1957 return 0; 1958 } 1959 EXPORT_SYMBOL_GPL(kvm_unmap_gfn); 1960 1961 void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) 1962 { 1963 __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, NULL, 1964 dirty, false); 1965 } 1966 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); 1967 1968 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn) 1969 { 1970 kvm_pfn_t pfn; 1971 1972 pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn); 1973 1974 return kvm_pfn_to_page(pfn); 1975 } 1976 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page); 1977 1978 void kvm_release_page_clean(struct page *page) 1979 { 1980 WARN_ON(is_error_page(page)); 1981 1982 kvm_release_pfn_clean(page_to_pfn(page)); 1983 } 1984 EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1985 1986 void kvm_release_pfn_clean(kvm_pfn_t pfn) 1987 { 1988 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) 1989 put_page(pfn_to_page(pfn)); 1990 } 1991 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 1992 1993 void kvm_release_page_dirty(struct page *page) 1994 { 1995 WARN_ON(is_error_page(page)); 1996 1997 
kvm_release_pfn_dirty(page_to_pfn(page)); 1998 } 1999 EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 2000 2001 void kvm_release_pfn_dirty(kvm_pfn_t pfn) 2002 { 2003 kvm_set_pfn_dirty(pfn); 2004 kvm_release_pfn_clean(pfn); 2005 } 2006 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); 2007 2008 void kvm_set_pfn_dirty(kvm_pfn_t pfn) 2009 { 2010 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) 2011 SetPageDirty(pfn_to_page(pfn)); 2012 } 2013 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 2014 2015 void kvm_set_pfn_accessed(kvm_pfn_t pfn) 2016 { 2017 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) 2018 mark_page_accessed(pfn_to_page(pfn)); 2019 } 2020 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 2021 2022 void kvm_get_pfn(kvm_pfn_t pfn) 2023 { 2024 if (!kvm_is_reserved_pfn(pfn)) 2025 get_page(pfn_to_page(pfn)); 2026 } 2027 EXPORT_SYMBOL_GPL(kvm_get_pfn); 2028 2029 static int next_segment(unsigned long len, int offset) 2030 { 2031 if (len > PAGE_SIZE - offset) 2032 return PAGE_SIZE - offset; 2033 else 2034 return len; 2035 } 2036 2037 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, 2038 void *data, int offset, int len) 2039 { 2040 int r; 2041 unsigned long addr; 2042 2043 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 2044 if (kvm_is_error_hva(addr)) 2045 return -EFAULT; 2046 r = __copy_from_user(data, (void __user *)addr + offset, len); 2047 if (r) 2048 return -EFAULT; 2049 return 0; 2050 } 2051 2052 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 2053 int len) 2054 { 2055 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 2056 2057 return __kvm_read_guest_page(slot, gfn, data, offset, len); 2058 } 2059 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 2060 2061 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, 2062 int offset, int len) 2063 { 2064 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2065 2066 return __kvm_read_guest_page(slot, gfn, data, offset, len); 2067 } 2068 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page); 2069 2070 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) 2071 { 2072 gfn_t gfn = gpa >> PAGE_SHIFT; 2073 int seg; 2074 int offset = offset_in_page(gpa); 2075 int ret; 2076 2077 while ((seg = next_segment(len, offset)) != 0) { 2078 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 2079 if (ret < 0) 2080 return ret; 2081 offset = 0; 2082 len -= seg; 2083 data += seg; 2084 ++gfn; 2085 } 2086 return 0; 2087 } 2088 EXPORT_SYMBOL_GPL(kvm_read_guest); 2089 2090 int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) 2091 { 2092 gfn_t gfn = gpa >> PAGE_SHIFT; 2093 int seg; 2094 int offset = offset_in_page(gpa); 2095 int ret; 2096 2097 while ((seg = next_segment(len, offset)) != 0) { 2098 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg); 2099 if (ret < 0) 2100 return ret; 2101 offset = 0; 2102 len -= seg; 2103 data += seg; 2104 ++gfn; 2105 } 2106 return 0; 2107 } 2108 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest); 2109 2110 static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 2111 void *data, int offset, unsigned long len) 2112 { 2113 int r; 2114 unsigned long addr; 2115 2116 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 2117 if (kvm_is_error_hva(addr)) 2118 return -EFAULT; 2119 pagefault_disable(); 2120 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 2121 pagefault_enable(); 2122 if (r) 2123 return -EFAULT; 2124 return 0; 2125 } 2126 2127 int 
kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, 2128 void *data, unsigned long len) 2129 { 2130 gfn_t gfn = gpa >> PAGE_SHIFT; 2131 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2132 int offset = offset_in_page(gpa); 2133 2134 return __kvm_read_guest_atomic(slot, gfn, data, offset, len); 2135 } 2136 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); 2137 2138 static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn, 2139 const void *data, int offset, int len) 2140 { 2141 int r; 2142 unsigned long addr; 2143 2144 addr = gfn_to_hva_memslot(memslot, gfn); 2145 if (kvm_is_error_hva(addr)) 2146 return -EFAULT; 2147 r = __copy_to_user((void __user *)addr + offset, data, len); 2148 if (r) 2149 return -EFAULT; 2150 mark_page_dirty_in_slot(memslot, gfn); 2151 return 0; 2152 } 2153 2154 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, 2155 const void *data, int offset, int len) 2156 { 2157 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 2158 2159 return __kvm_write_guest_page(slot, gfn, data, offset, len); 2160 } 2161 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 2162 2163 int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, 2164 const void *data, int offset, int len) 2165 { 2166 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2167 2168 return __kvm_write_guest_page(slot, gfn, data, offset, len); 2169 } 2170 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page); 2171 2172 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 2173 unsigned long len) 2174 { 2175 gfn_t gfn = gpa >> PAGE_SHIFT; 2176 int seg; 2177 int offset = offset_in_page(gpa); 2178 int ret; 2179 2180 while ((seg = next_segment(len, offset)) != 0) { 2181 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 2182 if (ret < 0) 2183 return ret; 2184 offset = 0; 2185 len -= seg; 2186 data += seg; 2187 ++gfn; 2188 } 2189 return 0; 2190 } 2191 EXPORT_SYMBOL_GPL(kvm_write_guest); 2192 2193 int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, 2194 unsigned long len) 2195 { 2196 gfn_t gfn = gpa >> PAGE_SHIFT; 2197 int seg; 2198 int offset = offset_in_page(gpa); 2199 int ret; 2200 2201 while ((seg = next_segment(len, offset)) != 0) { 2202 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg); 2203 if (ret < 0) 2204 return ret; 2205 offset = 0; 2206 len -= seg; 2207 data += seg; 2208 ++gfn; 2209 } 2210 return 0; 2211 } 2212 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); 2213 2214 static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, 2215 struct gfn_to_hva_cache *ghc, 2216 gpa_t gpa, unsigned long len) 2217 { 2218 int offset = offset_in_page(gpa); 2219 gfn_t start_gfn = gpa >> PAGE_SHIFT; 2220 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; 2221 gfn_t nr_pages_needed = end_gfn - start_gfn + 1; 2222 gfn_t nr_pages_avail; 2223 2224 /* Update ghc->generation before performing any error checks. */ 2225 ghc->generation = slots->generation; 2226 2227 if (start_gfn > end_gfn) { 2228 ghc->hva = KVM_HVA_ERR_BAD; 2229 return -EINVAL; 2230 } 2231 2232 /* 2233 * If the requested region crosses two memslots, we still 2234 * verify that the entire region is valid here. 2235 */ 2236 for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) { 2237 ghc->memslot = __gfn_to_memslot(slots, start_gfn); 2238 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, 2239 &nr_pages_avail); 2240 if (kvm_is_error_hva(ghc->hva)) 2241 return -EFAULT; 2242 } 2243 2244 /* Use the slow path for cross page reads and writes. 
*/ 2245 if (nr_pages_needed == 1) 2246 ghc->hva += offset; 2247 else 2248 ghc->memslot = NULL; 2249 2250 ghc->gpa = gpa; 2251 ghc->len = len; 2252 return 0; 2253 } 2254 2255 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2256 gpa_t gpa, unsigned long len) 2257 { 2258 struct kvm_memslots *slots = kvm_memslots(kvm); 2259 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); 2260 } 2261 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); 2262 2263 int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2264 void *data, unsigned int offset, 2265 unsigned long len) 2266 { 2267 struct kvm_memslots *slots = kvm_memslots(kvm); 2268 int r; 2269 gpa_t gpa = ghc->gpa + offset; 2270 2271 BUG_ON(len + offset > ghc->len); 2272 2273 if (slots->generation != ghc->generation) { 2274 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) 2275 return -EFAULT; 2276 } 2277 2278 if (kvm_is_error_hva(ghc->hva)) 2279 return -EFAULT; 2280 2281 if (unlikely(!ghc->memslot)) 2282 return kvm_write_guest(kvm, gpa, data, len); 2283 2284 r = __copy_to_user((void __user *)ghc->hva + offset, data, len); 2285 if (r) 2286 return -EFAULT; 2287 mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT); 2288 2289 return 0; 2290 } 2291 EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); 2292 2293 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2294 void *data, unsigned long len) 2295 { 2296 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); 2297 } 2298 EXPORT_SYMBOL_GPL(kvm_write_guest_cached); 2299 2300 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2301 void *data, unsigned long len) 2302 { 2303 struct kvm_memslots *slots = kvm_memslots(kvm); 2304 int r; 2305 2306 BUG_ON(len > ghc->len); 2307 2308 if (slots->generation != ghc->generation) { 2309 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) 2310 return -EFAULT; 2311 } 2312 2313 if (kvm_is_error_hva(ghc->hva)) 2314 return -EFAULT; 2315 2316 if (unlikely(!ghc->memslot)) 2317 return kvm_read_guest(kvm, ghc->gpa, data, len); 2318 2319 r = __copy_from_user(data, (void __user *)ghc->hva, len); 2320 if (r) 2321 return -EFAULT; 2322 2323 return 0; 2324 } 2325 EXPORT_SYMBOL_GPL(kvm_read_guest_cached); 2326 2327 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 2328 { 2329 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 2330 2331 return kvm_write_guest_page(kvm, gfn, zero_page, offset, len); 2332 } 2333 EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 2334 2335 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 2336 { 2337 gfn_t gfn = gpa >> PAGE_SHIFT; 2338 int seg; 2339 int offset = offset_in_page(gpa); 2340 int ret; 2341 2342 while ((seg = next_segment(len, offset)) != 0) { 2343 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 2344 if (ret < 0) 2345 return ret; 2346 offset = 0; 2347 len -= seg; 2348 ++gfn; 2349 } 2350 return 0; 2351 } 2352 EXPORT_SYMBOL_GPL(kvm_clear_guest); 2353 2354 static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, 2355 gfn_t gfn) 2356 { 2357 if (memslot && memslot->dirty_bitmap) { 2358 unsigned long rel_gfn = gfn - memslot->base_gfn; 2359 2360 set_bit_le(rel_gfn, memslot->dirty_bitmap); 2361 } 2362 } 2363 2364 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 2365 { 2366 struct kvm_memory_slot *memslot; 2367 2368 memslot = gfn_to_memslot(kvm, gfn); 2369 mark_page_dirty_in_slot(memslot, gfn); 2370 } 2371 EXPORT_SYMBOL_GPL(mark_page_dirty); 2372 2373 
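/*
 * Usage note (illustrative sketch, not part of this file): callers that
 * repeatedly access the same guest structure normally pair
 * kvm_gfn_to_hva_cache_init() with the *_cached() helpers above, instead
 * of doing a gfn_to_memslot()/gfn_to_hva() lookup on every access.  The
 * generation check in kvm_write_guest_offset_cached() transparently redoes
 * the lookup after a memslot update, and the !ghc->memslot case falls back
 * to the slow, possibly cross-page kvm_write_guest() path.  The guest ABI
 * structure, gpa and helper names below are hypothetical.
 *
 *	struct demo_time {
 *		u64 sequence;
 *		u64 nsec;
 *	};
 *
 *	static struct gfn_to_hva_cache demo_ghc;
 *
 *	static int demo_time_setup(struct kvm *kvm, gpa_t gpa)
 *	{
 *		// one-time: resolve gpa -> hva and record slots->generation
 *		return kvm_gfn_to_hva_cache_init(kvm, &demo_ghc, gpa,
 *						 sizeof(struct demo_time));
 *	}
 *
 *	static void demo_time_update(struct kvm *kvm, u64 now)
 *	{
 *		// hot path: a plain __copy_to_user() plus dirty tracking
 *		kvm_write_guest_offset_cached(kvm, &demo_ghc, &now,
 *					      offsetof(struct demo_time, nsec),
 *					      sizeof(now));
 *	}
 */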
void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	struct kvm_memory_slot *memslot;

	memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
	mark_page_dirty_in_slot(memslot, gfn);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);

void kvm_sigset_activate(struct kvm_vcpu *vcpu)
{
	if (!vcpu->sigset_active)
		return;

	/*
	 * This does a lockless modification of ->real_blocked, which is fine
	 * because only current can change ->real_blocked and all readers of
	 * ->real_blocked don't care as long as ->real_blocked is always a
	 * subset of ->blocked.
	 */
	sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
}

void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
{
	if (!vcpu->sigset_active)
		return;

	sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
	sigemptyset(&current->real_blocked);
}

static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
{
	unsigned int old, val, grow, grow_start;

	old = val = vcpu->halt_poll_ns;
	grow_start = READ_ONCE(halt_poll_ns_grow_start);
	grow = READ_ONCE(halt_poll_ns_grow);
	if (!grow)
		goto out;

	val *= grow;
	if (val < grow_start)
		val = grow_start;

	if (val > halt_poll_ns)
		val = halt_poll_ns;

	vcpu->halt_poll_ns = val;
out:
	trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
}

static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
{
	unsigned int old, val, shrink;

	old = val = vcpu->halt_poll_ns;
	shrink = READ_ONCE(halt_poll_ns_shrink);
	if (shrink == 0)
		val = 0;
	else
		val /= shrink;

	vcpu->halt_poll_ns = val;
	trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
}

static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
{
	int ret = -EINTR;
	int idx = srcu_read_lock(&vcpu->kvm->srcu);

	if (kvm_arch_vcpu_runnable(vcpu)) {
		kvm_make_request(KVM_REQ_UNHALT, vcpu);
		goto out;
	}
	if (kvm_cpu_has_pending_timer(vcpu))
		goto out;
	if (signal_pending(current))
		goto out;

	ret = 0;
out:
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	return ret;
}
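/*
 * Illustrative numbers for the two helpers above, assuming
 * halt_poll_ns_grow = 2, halt_poll_ns_grow_start = 10000,
 * halt_poll_ns_shrink = 0 and a module-wide halt_poll_ns cap of 200000
 * (all values assumed for the example):
 *
 *	grow:   0 -> 10000 -> 20000 -> 40000 -> 80000 -> 160000 -> 200000 (capped)
 *	shrink: any value -> 0 (a shrink factor of 0 disables polling outright;
 *	        with halt_poll_ns_shrink = 2 it would halve instead,
 *	        e.g. 160000 -> 80000)
 *
 * kvm_vcpu_block() below picks between the two based on how long the vCPU
 * actually stayed blocked relative to its current polling window.
 */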
/*
 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
 */
void kvm_vcpu_block(struct kvm_vcpu *vcpu)
{
	ktime_t start, cur;
	DECLARE_SWAITQUEUE(wait);
	bool waited = false;
	u64 block_ns;

	kvm_arch_vcpu_blocking(vcpu);

	start = cur = ktime_get();
	if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) {
		ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);

		++vcpu->stat.halt_attempted_poll;
		do {
			/*
			 * This sets KVM_REQ_UNHALT if an interrupt
			 * arrives.
			 */
			if (kvm_vcpu_check_block(vcpu) < 0) {
				++vcpu->stat.halt_successful_poll;
				if (!vcpu_valid_wakeup(vcpu))
					++vcpu->stat.halt_poll_invalid;
				goto out;
			}
			cur = ktime_get();
		} while (single_task_running() && ktime_before(cur, stop));
	}

	for (;;) {
		prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);

		if (kvm_vcpu_check_block(vcpu) < 0)
			break;

		waited = true;
		schedule();
	}

	finish_swait(&vcpu->wq, &wait);
	cur = ktime_get();
out:
	kvm_arch_vcpu_unblocking(vcpu);
	block_ns = ktime_to_ns(cur) - ktime_to_ns(start);

	if (!kvm_arch_no_poll(vcpu)) {
		if (!vcpu_valid_wakeup(vcpu)) {
			shrink_halt_poll_ns(vcpu);
		} else if (halt_poll_ns) {
			if (block_ns <= vcpu->halt_poll_ns)
				;
			/* we had a long block, shrink polling */
			else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
				shrink_halt_poll_ns(vcpu);
			/* we had a short halt and our poll time is too small */
			else if (vcpu->halt_poll_ns < halt_poll_ns &&
				block_ns < halt_poll_ns)
				grow_halt_poll_ns(vcpu);
		} else {
			vcpu->halt_poll_ns = 0;
		}
	}

	trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu));
	kvm_arch_vcpu_block_finish(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_block);

bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
{
	struct swait_queue_head *wqp;

	wqp = kvm_arch_vcpu_wq(vcpu);
	if (swq_has_sleeper(wqp)) {
		swake_up_one(wqp);
		WRITE_ONCE(vcpu->ready, true);
		++vcpu->stat.halt_wakeup;
		return true;
	}

	return false;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);

#ifndef CONFIG_S390
/*
 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
 */
void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{
	int me;
	int cpu = vcpu->cpu;

	if (kvm_vcpu_wake_up(vcpu))
		return;

	me = get_cpu();
	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
		if (kvm_arch_vcpu_should_kick(vcpu))
			smp_send_reschedule(cpu);
	put_cpu();
}
EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
#endif /* !CONFIG_S390 */

int kvm_vcpu_yield_to(struct kvm_vcpu *target)
{
	struct pid *pid;
	struct task_struct *task = NULL;
	int ret = 0;

	rcu_read_lock();
	pid = rcu_dereference(target->pid);
	if (pid)
		task = get_pid_task(pid, PIDTYPE_PID);
	rcu_read_unlock();
	if (!task)
		return ret;
	ret = yield_to(task, 1);
	put_task_struct(task);

	return ret;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);

/*
 * Helper that checks whether a VCPU is eligible for directed yield.
 * The most eligible candidate to yield to is decided by the following
 * heuristics:
 *
 * (a) A VCPU which has not done a pl-exit or had cpu relax intercepted
 * recently (a preempted lock holder), indicated by @in_spin_loop.
 * Set at the beginning and cleared at the end of the interception/PLE handler.
 *
 * (b) A VCPU which has done a pl-exit/had cpu relax intercepted but did not
 * get a chance last time (it has mostly become eligible now, since we have
 * probably yielded to the lock holder in the last iteration.  This is done
 * by toggling @dy_eligible each time a VCPU is checked for eligibility.)
2602 * 2603 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding 2604 * to preempted lock-holder could result in wrong VCPU selection and CPU 2605 * burning. Giving priority for a potential lock-holder increases lock 2606 * progress. 2607 * 2608 * Since algorithm is based on heuristics, accessing another VCPU data without 2609 * locking does not harm. It may result in trying to yield to same VCPU, fail 2610 * and continue with next VCPU and so on. 2611 */ 2612 static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) 2613 { 2614 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT 2615 bool eligible; 2616 2617 eligible = !vcpu->spin_loop.in_spin_loop || 2618 vcpu->spin_loop.dy_eligible; 2619 2620 if (vcpu->spin_loop.in_spin_loop) 2621 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); 2622 2623 return eligible; 2624 #else 2625 return true; 2626 #endif 2627 } 2628 2629 /* 2630 * Unlike kvm_arch_vcpu_runnable, this function is called outside 2631 * a vcpu_load/vcpu_put pair. However, for most architectures 2632 * kvm_arch_vcpu_runnable does not require vcpu_load. 2633 */ 2634 bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) 2635 { 2636 return kvm_arch_vcpu_runnable(vcpu); 2637 } 2638 2639 static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu) 2640 { 2641 if (kvm_arch_dy_runnable(vcpu)) 2642 return true; 2643 2644 #ifdef CONFIG_KVM_ASYNC_PF 2645 if (!list_empty_careful(&vcpu->async_pf.done)) 2646 return true; 2647 #endif 2648 2649 return false; 2650 } 2651 2652 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) 2653 { 2654 struct kvm *kvm = me->kvm; 2655 struct kvm_vcpu *vcpu; 2656 int last_boosted_vcpu = me->kvm->last_boosted_vcpu; 2657 int yielded = 0; 2658 int try = 3; 2659 int pass; 2660 int i; 2661 2662 kvm_vcpu_set_in_spin_loop(me, true); 2663 /* 2664 * We boost the priority of a VCPU that is runnable but not 2665 * currently running, because it got preempted by something 2666 * else and called schedule in __vcpu_run. Hopefully that 2667 * VCPU is holding the lock that we need and will release it. 2668 * We approximate round-robin by starting at the last boosted VCPU. 
2669 */ 2670 for (pass = 0; pass < 2 && !yielded && try; pass++) { 2671 kvm_for_each_vcpu(i, vcpu, kvm) { 2672 if (!pass && i <= last_boosted_vcpu) { 2673 i = last_boosted_vcpu; 2674 continue; 2675 } else if (pass && i > last_boosted_vcpu) 2676 break; 2677 if (!READ_ONCE(vcpu->ready)) 2678 continue; 2679 if (vcpu == me) 2680 continue; 2681 if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu)) 2682 continue; 2683 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && 2684 !kvm_arch_vcpu_in_kernel(vcpu)) 2685 continue; 2686 if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) 2687 continue; 2688 2689 yielded = kvm_vcpu_yield_to(vcpu); 2690 if (yielded > 0) { 2691 kvm->last_boosted_vcpu = i; 2692 break; 2693 } else if (yielded < 0) { 2694 try--; 2695 if (!try) 2696 break; 2697 } 2698 } 2699 } 2700 kvm_vcpu_set_in_spin_loop(me, false); 2701 2702 /* Ensure vcpu is not eligible during next spinloop */ 2703 kvm_vcpu_set_dy_eligible(me, false); 2704 } 2705 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); 2706 2707 static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf) 2708 { 2709 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data; 2710 struct page *page; 2711 2712 if (vmf->pgoff == 0) 2713 page = virt_to_page(vcpu->run); 2714 #ifdef CONFIG_X86 2715 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 2716 page = virt_to_page(vcpu->arch.pio_data); 2717 #endif 2718 #ifdef CONFIG_KVM_MMIO 2719 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 2720 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 2721 #endif 2722 else 2723 return kvm_arch_vcpu_fault(vcpu, vmf); 2724 get_page(page); 2725 vmf->page = page; 2726 return 0; 2727 } 2728 2729 static const struct vm_operations_struct kvm_vcpu_vm_ops = { 2730 .fault = kvm_vcpu_fault, 2731 }; 2732 2733 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 2734 { 2735 vma->vm_ops = &kvm_vcpu_vm_ops; 2736 return 0; 2737 } 2738 2739 static int kvm_vcpu_release(struct inode *inode, struct file *filp) 2740 { 2741 struct kvm_vcpu *vcpu = filp->private_data; 2742 2743 debugfs_remove_recursive(vcpu->debugfs_dentry); 2744 kvm_put_kvm(vcpu->kvm); 2745 return 0; 2746 } 2747 2748 static struct file_operations kvm_vcpu_fops = { 2749 .release = kvm_vcpu_release, 2750 .unlocked_ioctl = kvm_vcpu_ioctl, 2751 .mmap = kvm_vcpu_mmap, 2752 .llseek = noop_llseek, 2753 KVM_COMPAT(kvm_vcpu_compat_ioctl), 2754 }; 2755 2756 /* 2757 * Allocates an inode for the vcpu. 2758 */ 2759 static int create_vcpu_fd(struct kvm_vcpu *vcpu) 2760 { 2761 char name[8 + 1 + ITOA_MAX_LEN + 1]; 2762 2763 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id); 2764 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); 2765 } 2766 2767 static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) 2768 { 2769 #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS 2770 char dir_name[ITOA_MAX_LEN * 2]; 2771 2772 if (!debugfs_initialized()) 2773 return; 2774 2775 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id); 2776 vcpu->debugfs_dentry = debugfs_create_dir(dir_name, 2777 vcpu->kvm->debugfs_dentry); 2778 2779 kvm_arch_create_vcpu_debugfs(vcpu); 2780 #endif 2781 } 2782 2783 /* 2784 * Creates some virtual cpus. Good luck creating more than one. 
2785 */ 2786 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) 2787 { 2788 int r; 2789 struct kvm_vcpu *vcpu; 2790 struct page *page; 2791 2792 if (id >= KVM_MAX_VCPU_ID) 2793 return -EINVAL; 2794 2795 mutex_lock(&kvm->lock); 2796 if (kvm->created_vcpus == KVM_MAX_VCPUS) { 2797 mutex_unlock(&kvm->lock); 2798 return -EINVAL; 2799 } 2800 2801 kvm->created_vcpus++; 2802 mutex_unlock(&kvm->lock); 2803 2804 r = kvm_arch_vcpu_precreate(kvm, id); 2805 if (r) 2806 goto vcpu_decrement; 2807 2808 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 2809 if (!vcpu) { 2810 r = -ENOMEM; 2811 goto vcpu_decrement; 2812 } 2813 2814 BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE); 2815 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 2816 if (!page) { 2817 r = -ENOMEM; 2818 goto vcpu_free; 2819 } 2820 vcpu->run = page_address(page); 2821 2822 kvm_vcpu_init(vcpu, kvm, id); 2823 2824 r = kvm_arch_vcpu_create(vcpu); 2825 if (r) 2826 goto vcpu_free_run_page; 2827 2828 kvm_create_vcpu_debugfs(vcpu); 2829 2830 mutex_lock(&kvm->lock); 2831 if (kvm_get_vcpu_by_id(kvm, id)) { 2832 r = -EEXIST; 2833 goto unlock_vcpu_destroy; 2834 } 2835 2836 vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus); 2837 BUG_ON(kvm->vcpus[vcpu->vcpu_idx]); 2838 2839 /* Now it's all set up, let userspace reach it */ 2840 kvm_get_kvm(kvm); 2841 r = create_vcpu_fd(vcpu); 2842 if (r < 0) { 2843 kvm_put_kvm_no_destroy(kvm); 2844 goto unlock_vcpu_destroy; 2845 } 2846 2847 kvm->vcpus[vcpu->vcpu_idx] = vcpu; 2848 2849 /* 2850 * Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus 2851 * before kvm->online_vcpu's incremented value. 2852 */ 2853 smp_wmb(); 2854 atomic_inc(&kvm->online_vcpus); 2855 2856 mutex_unlock(&kvm->lock); 2857 kvm_arch_vcpu_postcreate(vcpu); 2858 return r; 2859 2860 unlock_vcpu_destroy: 2861 mutex_unlock(&kvm->lock); 2862 debugfs_remove_recursive(vcpu->debugfs_dentry); 2863 kvm_arch_vcpu_destroy(vcpu); 2864 vcpu_free_run_page: 2865 free_page((unsigned long)vcpu->run); 2866 vcpu_free: 2867 kmem_cache_free(kvm_vcpu_cache, vcpu); 2868 vcpu_decrement: 2869 mutex_lock(&kvm->lock); 2870 kvm->created_vcpus--; 2871 mutex_unlock(&kvm->lock); 2872 return r; 2873 } 2874 2875 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 2876 { 2877 if (sigset) { 2878 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 2879 vcpu->sigset_active = 1; 2880 vcpu->sigset = *sigset; 2881 } else 2882 vcpu->sigset_active = 0; 2883 return 0; 2884 } 2885 2886 static long kvm_vcpu_ioctl(struct file *filp, 2887 unsigned int ioctl, unsigned long arg) 2888 { 2889 struct kvm_vcpu *vcpu = filp->private_data; 2890 void __user *argp = (void __user *)arg; 2891 int r; 2892 struct kvm_fpu *fpu = NULL; 2893 struct kvm_sregs *kvm_sregs = NULL; 2894 2895 if (vcpu->kvm->mm != current->mm) 2896 return -EIO; 2897 2898 if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) 2899 return -EINVAL; 2900 2901 /* 2902 * Some architectures have vcpu ioctls that are asynchronous to vcpu 2903 * execution; mutex_lock() would break them. 2904 */ 2905 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg); 2906 if (r != -ENOIOCTLCMD) 2907 return r; 2908 2909 if (mutex_lock_killable(&vcpu->mutex)) 2910 return -EINTR; 2911 switch (ioctl) { 2912 case KVM_RUN: { 2913 struct pid *oldpid; 2914 r = -EINVAL; 2915 if (arg) 2916 goto out; 2917 oldpid = rcu_access_pointer(vcpu->pid); 2918 if (unlikely(oldpid != task_pid(current))) { 2919 /* The thread running this VCPU changed. 
*/ 2920 struct pid *newpid; 2921 2922 r = kvm_arch_vcpu_run_pid_change(vcpu); 2923 if (r) 2924 break; 2925 2926 newpid = get_task_pid(current, PIDTYPE_PID); 2927 rcu_assign_pointer(vcpu->pid, newpid); 2928 if (oldpid) 2929 synchronize_rcu(); 2930 put_pid(oldpid); 2931 } 2932 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 2933 trace_kvm_userspace_exit(vcpu->run->exit_reason, r); 2934 break; 2935 } 2936 case KVM_GET_REGS: { 2937 struct kvm_regs *kvm_regs; 2938 2939 r = -ENOMEM; 2940 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT); 2941 if (!kvm_regs) 2942 goto out; 2943 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 2944 if (r) 2945 goto out_free1; 2946 r = -EFAULT; 2947 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 2948 goto out_free1; 2949 r = 0; 2950 out_free1: 2951 kfree(kvm_regs); 2952 break; 2953 } 2954 case KVM_SET_REGS: { 2955 struct kvm_regs *kvm_regs; 2956 2957 r = -ENOMEM; 2958 kvm_regs = memdup_user(argp, sizeof(*kvm_regs)); 2959 if (IS_ERR(kvm_regs)) { 2960 r = PTR_ERR(kvm_regs); 2961 goto out; 2962 } 2963 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 2964 kfree(kvm_regs); 2965 break; 2966 } 2967 case KVM_GET_SREGS: { 2968 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), 2969 GFP_KERNEL_ACCOUNT); 2970 r = -ENOMEM; 2971 if (!kvm_sregs) 2972 goto out; 2973 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 2974 if (r) 2975 goto out; 2976 r = -EFAULT; 2977 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 2978 goto out; 2979 r = 0; 2980 break; 2981 } 2982 case KVM_SET_SREGS: { 2983 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); 2984 if (IS_ERR(kvm_sregs)) { 2985 r = PTR_ERR(kvm_sregs); 2986 kvm_sregs = NULL; 2987 goto out; 2988 } 2989 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 2990 break; 2991 } 2992 case KVM_GET_MP_STATE: { 2993 struct kvm_mp_state mp_state; 2994 2995 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 2996 if (r) 2997 goto out; 2998 r = -EFAULT; 2999 if (copy_to_user(argp, &mp_state, sizeof(mp_state))) 3000 goto out; 3001 r = 0; 3002 break; 3003 } 3004 case KVM_SET_MP_STATE: { 3005 struct kvm_mp_state mp_state; 3006 3007 r = -EFAULT; 3008 if (copy_from_user(&mp_state, argp, sizeof(mp_state))) 3009 goto out; 3010 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 3011 break; 3012 } 3013 case KVM_TRANSLATE: { 3014 struct kvm_translation tr; 3015 3016 r = -EFAULT; 3017 if (copy_from_user(&tr, argp, sizeof(tr))) 3018 goto out; 3019 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 3020 if (r) 3021 goto out; 3022 r = -EFAULT; 3023 if (copy_to_user(argp, &tr, sizeof(tr))) 3024 goto out; 3025 r = 0; 3026 break; 3027 } 3028 case KVM_SET_GUEST_DEBUG: { 3029 struct kvm_guest_debug dbg; 3030 3031 r = -EFAULT; 3032 if (copy_from_user(&dbg, argp, sizeof(dbg))) 3033 goto out; 3034 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 3035 break; 3036 } 3037 case KVM_SET_SIGNAL_MASK: { 3038 struct kvm_signal_mask __user *sigmask_arg = argp; 3039 struct kvm_signal_mask kvm_sigmask; 3040 sigset_t sigset, *p; 3041 3042 p = NULL; 3043 if (argp) { 3044 r = -EFAULT; 3045 if (copy_from_user(&kvm_sigmask, argp, 3046 sizeof(kvm_sigmask))) 3047 goto out; 3048 r = -EINVAL; 3049 if (kvm_sigmask.len != sizeof(sigset)) 3050 goto out; 3051 r = -EFAULT; 3052 if (copy_from_user(&sigset, sigmask_arg->sigset, 3053 sizeof(sigset))) 3054 goto out; 3055 p = &sigset; 3056 } 3057 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); 3058 break; 3059 } 3060 case KVM_GET_FPU: { 3061 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT); 3062 r = -ENOMEM; 
3063 if (!fpu) 3064 goto out; 3065 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); 3066 if (r) 3067 goto out; 3068 r = -EFAULT; 3069 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) 3070 goto out; 3071 r = 0; 3072 break; 3073 } 3074 case KVM_SET_FPU: { 3075 fpu = memdup_user(argp, sizeof(*fpu)); 3076 if (IS_ERR(fpu)) { 3077 r = PTR_ERR(fpu); 3078 fpu = NULL; 3079 goto out; 3080 } 3081 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 3082 break; 3083 } 3084 default: 3085 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 3086 } 3087 out: 3088 mutex_unlock(&vcpu->mutex); 3089 kfree(fpu); 3090 kfree(kvm_sregs); 3091 return r; 3092 } 3093 3094 #ifdef CONFIG_KVM_COMPAT 3095 static long kvm_vcpu_compat_ioctl(struct file *filp, 3096 unsigned int ioctl, unsigned long arg) 3097 { 3098 struct kvm_vcpu *vcpu = filp->private_data; 3099 void __user *argp = compat_ptr(arg); 3100 int r; 3101 3102 if (vcpu->kvm->mm != current->mm) 3103 return -EIO; 3104 3105 switch (ioctl) { 3106 case KVM_SET_SIGNAL_MASK: { 3107 struct kvm_signal_mask __user *sigmask_arg = argp; 3108 struct kvm_signal_mask kvm_sigmask; 3109 sigset_t sigset; 3110 3111 if (argp) { 3112 r = -EFAULT; 3113 if (copy_from_user(&kvm_sigmask, argp, 3114 sizeof(kvm_sigmask))) 3115 goto out; 3116 r = -EINVAL; 3117 if (kvm_sigmask.len != sizeof(compat_sigset_t)) 3118 goto out; 3119 r = -EFAULT; 3120 if (get_compat_sigset(&sigset, (void *)sigmask_arg->sigset)) 3121 goto out; 3122 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 3123 } else 3124 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL); 3125 break; 3126 } 3127 default: 3128 r = kvm_vcpu_ioctl(filp, ioctl, arg); 3129 } 3130 3131 out: 3132 return r; 3133 } 3134 #endif 3135 3136 static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma) 3137 { 3138 struct kvm_device *dev = filp->private_data; 3139 3140 if (dev->ops->mmap) 3141 return dev->ops->mmap(dev, vma); 3142 3143 return -ENODEV; 3144 } 3145 3146 static int kvm_device_ioctl_attr(struct kvm_device *dev, 3147 int (*accessor)(struct kvm_device *dev, 3148 struct kvm_device_attr *attr), 3149 unsigned long arg) 3150 { 3151 struct kvm_device_attr attr; 3152 3153 if (!accessor) 3154 return -EPERM; 3155 3156 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) 3157 return -EFAULT; 3158 3159 return accessor(dev, &attr); 3160 } 3161 3162 static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, 3163 unsigned long arg) 3164 { 3165 struct kvm_device *dev = filp->private_data; 3166 3167 if (dev->kvm->mm != current->mm) 3168 return -EIO; 3169 3170 switch (ioctl) { 3171 case KVM_SET_DEVICE_ATTR: 3172 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg); 3173 case KVM_GET_DEVICE_ATTR: 3174 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg); 3175 case KVM_HAS_DEVICE_ATTR: 3176 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg); 3177 default: 3178 if (dev->ops->ioctl) 3179 return dev->ops->ioctl(dev, ioctl, arg); 3180 3181 return -ENOTTY; 3182 } 3183 } 3184 3185 static int kvm_device_release(struct inode *inode, struct file *filp) 3186 { 3187 struct kvm_device *dev = filp->private_data; 3188 struct kvm *kvm = dev->kvm; 3189 3190 if (dev->ops->release) { 3191 mutex_lock(&kvm->lock); 3192 list_del(&dev->vm_node); 3193 dev->ops->release(dev); 3194 mutex_unlock(&kvm->lock); 3195 } 3196 3197 kvm_put_kvm(kvm); 3198 return 0; 3199 } 3200 3201 static const struct file_operations kvm_device_fops = { 3202 .unlocked_ioctl = kvm_device_ioctl, 3203 .release = kvm_device_release, 3204 KVM_COMPAT(kvm_device_ioctl), 3205 .mmap = 
kvm_device_mmap, 3206 }; 3207 3208 struct kvm_device *kvm_device_from_filp(struct file *filp) 3209 { 3210 if (filp->f_op != &kvm_device_fops) 3211 return NULL; 3212 3213 return filp->private_data; 3214 } 3215 3216 static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { 3217 #ifdef CONFIG_KVM_MPIC 3218 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, 3219 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, 3220 #endif 3221 }; 3222 3223 int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type) 3224 { 3225 if (type >= ARRAY_SIZE(kvm_device_ops_table)) 3226 return -ENOSPC; 3227 3228 if (kvm_device_ops_table[type] != NULL) 3229 return -EEXIST; 3230 3231 kvm_device_ops_table[type] = ops; 3232 return 0; 3233 } 3234 3235 void kvm_unregister_device_ops(u32 type) 3236 { 3237 if (kvm_device_ops_table[type] != NULL) 3238 kvm_device_ops_table[type] = NULL; 3239 } 3240 3241 static int kvm_ioctl_create_device(struct kvm *kvm, 3242 struct kvm_create_device *cd) 3243 { 3244 const struct kvm_device_ops *ops = NULL; 3245 struct kvm_device *dev; 3246 bool test = cd->flags & KVM_CREATE_DEVICE_TEST; 3247 int type; 3248 int ret; 3249 3250 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) 3251 return -ENODEV; 3252 3253 type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table)); 3254 ops = kvm_device_ops_table[type]; 3255 if (ops == NULL) 3256 return -ENODEV; 3257 3258 if (test) 3259 return 0; 3260 3261 dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT); 3262 if (!dev) 3263 return -ENOMEM; 3264 3265 dev->ops = ops; 3266 dev->kvm = kvm; 3267 3268 mutex_lock(&kvm->lock); 3269 ret = ops->create(dev, type); 3270 if (ret < 0) { 3271 mutex_unlock(&kvm->lock); 3272 kfree(dev); 3273 return ret; 3274 } 3275 list_add(&dev->vm_node, &kvm->devices); 3276 mutex_unlock(&kvm->lock); 3277 3278 if (ops->init) 3279 ops->init(dev); 3280 3281 kvm_get_kvm(kvm); 3282 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); 3283 if (ret < 0) { 3284 kvm_put_kvm_no_destroy(kvm); 3285 mutex_lock(&kvm->lock); 3286 list_del(&dev->vm_node); 3287 mutex_unlock(&kvm->lock); 3288 ops->destroy(dev); 3289 return ret; 3290 } 3291 3292 cd->fd = ret; 3293 return 0; 3294 } 3295 3296 static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) 3297 { 3298 switch (arg) { 3299 case KVM_CAP_USER_MEMORY: 3300 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 3301 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 3302 case KVM_CAP_INTERNAL_ERROR_DATA: 3303 #ifdef CONFIG_HAVE_KVM_MSI 3304 case KVM_CAP_SIGNAL_MSI: 3305 #endif 3306 #ifdef CONFIG_HAVE_KVM_IRQFD 3307 case KVM_CAP_IRQFD: 3308 case KVM_CAP_IRQFD_RESAMPLE: 3309 #endif 3310 case KVM_CAP_IOEVENTFD_ANY_LENGTH: 3311 case KVM_CAP_CHECK_EXTENSION_VM: 3312 case KVM_CAP_ENABLE_CAP_VM: 3313 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 3314 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: 3315 #endif 3316 return 1; 3317 #ifdef CONFIG_KVM_MMIO 3318 case KVM_CAP_COALESCED_MMIO: 3319 return KVM_COALESCED_MMIO_PAGE_OFFSET; 3320 case KVM_CAP_COALESCED_PIO: 3321 return 1; 3322 #endif 3323 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 3324 case KVM_CAP_IRQ_ROUTING: 3325 return KVM_MAX_IRQ_ROUTES; 3326 #endif 3327 #if KVM_ADDRESS_SPACE_NUM > 1 3328 case KVM_CAP_MULTI_ADDRESS_SPACE: 3329 return KVM_ADDRESS_SPACE_NUM; 3330 #endif 3331 case KVM_CAP_NR_MEMSLOTS: 3332 return KVM_USER_MEM_SLOTS; 3333 default: 3334 break; 3335 } 3336 return kvm_vm_ioctl_check_extension(kvm, arg); 3337 } 3338 3339 int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm, 3340 struct kvm_enable_cap 
*cap) 3341 { 3342 return -EINVAL; 3343 } 3344 3345 static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, 3346 struct kvm_enable_cap *cap) 3347 { 3348 switch (cap->cap) { 3349 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 3350 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: 3351 if (cap->flags || (cap->args[0] & ~1)) 3352 return -EINVAL; 3353 kvm->manual_dirty_log_protect = cap->args[0]; 3354 return 0; 3355 #endif 3356 default: 3357 return kvm_vm_ioctl_enable_cap(kvm, cap); 3358 } 3359 } 3360 3361 static long kvm_vm_ioctl(struct file *filp, 3362 unsigned int ioctl, unsigned long arg) 3363 { 3364 struct kvm *kvm = filp->private_data; 3365 void __user *argp = (void __user *)arg; 3366 int r; 3367 3368 if (kvm->mm != current->mm) 3369 return -EIO; 3370 switch (ioctl) { 3371 case KVM_CREATE_VCPU: 3372 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 3373 break; 3374 case KVM_ENABLE_CAP: { 3375 struct kvm_enable_cap cap; 3376 3377 r = -EFAULT; 3378 if (copy_from_user(&cap, argp, sizeof(cap))) 3379 goto out; 3380 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap); 3381 break; 3382 } 3383 case KVM_SET_USER_MEMORY_REGION: { 3384 struct kvm_userspace_memory_region kvm_userspace_mem; 3385 3386 r = -EFAULT; 3387 if (copy_from_user(&kvm_userspace_mem, argp, 3388 sizeof(kvm_userspace_mem))) 3389 goto out; 3390 3391 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); 3392 break; 3393 } 3394 case KVM_GET_DIRTY_LOG: { 3395 struct kvm_dirty_log log; 3396 3397 r = -EFAULT; 3398 if (copy_from_user(&log, argp, sizeof(log))) 3399 goto out; 3400 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 3401 break; 3402 } 3403 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 3404 case KVM_CLEAR_DIRTY_LOG: { 3405 struct kvm_clear_dirty_log log; 3406 3407 r = -EFAULT; 3408 if (copy_from_user(&log, argp, sizeof(log))) 3409 goto out; 3410 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log); 3411 break; 3412 } 3413 #endif 3414 #ifdef CONFIG_KVM_MMIO 3415 case KVM_REGISTER_COALESCED_MMIO: { 3416 struct kvm_coalesced_mmio_zone zone; 3417 3418 r = -EFAULT; 3419 if (copy_from_user(&zone, argp, sizeof(zone))) 3420 goto out; 3421 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 3422 break; 3423 } 3424 case KVM_UNREGISTER_COALESCED_MMIO: { 3425 struct kvm_coalesced_mmio_zone zone; 3426 3427 r = -EFAULT; 3428 if (copy_from_user(&zone, argp, sizeof(zone))) 3429 goto out; 3430 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 3431 break; 3432 } 3433 #endif 3434 case KVM_IRQFD: { 3435 struct kvm_irqfd data; 3436 3437 r = -EFAULT; 3438 if (copy_from_user(&data, argp, sizeof(data))) 3439 goto out; 3440 r = kvm_irqfd(kvm, &data); 3441 break; 3442 } 3443 case KVM_IOEVENTFD: { 3444 struct kvm_ioeventfd data; 3445 3446 r = -EFAULT; 3447 if (copy_from_user(&data, argp, sizeof(data))) 3448 goto out; 3449 r = kvm_ioeventfd(kvm, &data); 3450 break; 3451 } 3452 #ifdef CONFIG_HAVE_KVM_MSI 3453 case KVM_SIGNAL_MSI: { 3454 struct kvm_msi msi; 3455 3456 r = -EFAULT; 3457 if (copy_from_user(&msi, argp, sizeof(msi))) 3458 goto out; 3459 r = kvm_send_userspace_msi(kvm, &msi); 3460 break; 3461 } 3462 #endif 3463 #ifdef __KVM_HAVE_IRQ_LINE 3464 case KVM_IRQ_LINE_STATUS: 3465 case KVM_IRQ_LINE: { 3466 struct kvm_irq_level irq_event; 3467 3468 r = -EFAULT; 3469 if (copy_from_user(&irq_event, argp, sizeof(irq_event))) 3470 goto out; 3471 3472 r = kvm_vm_ioctl_irq_line(kvm, &irq_event, 3473 ioctl == KVM_IRQ_LINE_STATUS); 3474 if (r) 3475 goto out; 3476 3477 r = -EFAULT; 3478 if (ioctl == KVM_IRQ_LINE_STATUS) { 3479 if (copy_to_user(argp, &irq_event, sizeof(irq_event))) 
3480 goto out; 3481 } 3482 3483 r = 0; 3484 break; 3485 } 3486 #endif 3487 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 3488 case KVM_SET_GSI_ROUTING: { 3489 struct kvm_irq_routing routing; 3490 struct kvm_irq_routing __user *urouting; 3491 struct kvm_irq_routing_entry *entries = NULL; 3492 3493 r = -EFAULT; 3494 if (copy_from_user(&routing, argp, sizeof(routing))) 3495 goto out; 3496 r = -EINVAL; 3497 if (!kvm_arch_can_set_irq_routing(kvm)) 3498 goto out; 3499 if (routing.nr > KVM_MAX_IRQ_ROUTES) 3500 goto out; 3501 if (routing.flags) 3502 goto out; 3503 if (routing.nr) { 3504 r = -ENOMEM; 3505 entries = vmalloc(array_size(sizeof(*entries), 3506 routing.nr)); 3507 if (!entries) 3508 goto out; 3509 r = -EFAULT; 3510 urouting = argp; 3511 if (copy_from_user(entries, urouting->entries, 3512 routing.nr * sizeof(*entries))) 3513 goto out_free_irq_routing; 3514 } 3515 r = kvm_set_irq_routing(kvm, entries, routing.nr, 3516 routing.flags); 3517 out_free_irq_routing: 3518 vfree(entries); 3519 break; 3520 } 3521 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ 3522 case KVM_CREATE_DEVICE: { 3523 struct kvm_create_device cd; 3524 3525 r = -EFAULT; 3526 if (copy_from_user(&cd, argp, sizeof(cd))) 3527 goto out; 3528 3529 r = kvm_ioctl_create_device(kvm, &cd); 3530 if (r) 3531 goto out; 3532 3533 r = -EFAULT; 3534 if (copy_to_user(argp, &cd, sizeof(cd))) 3535 goto out; 3536 3537 r = 0; 3538 break; 3539 } 3540 case KVM_CHECK_EXTENSION: 3541 r = kvm_vm_ioctl_check_extension_generic(kvm, arg); 3542 break; 3543 default: 3544 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 3545 } 3546 out: 3547 return r; 3548 } 3549 3550 #ifdef CONFIG_KVM_COMPAT 3551 struct compat_kvm_dirty_log { 3552 __u32 slot; 3553 __u32 padding1; 3554 union { 3555 compat_uptr_t dirty_bitmap; /* one bit per page */ 3556 __u64 padding2; 3557 }; 3558 }; 3559 3560 static long kvm_vm_compat_ioctl(struct file *filp, 3561 unsigned int ioctl, unsigned long arg) 3562 { 3563 struct kvm *kvm = filp->private_data; 3564 int r; 3565 3566 if (kvm->mm != current->mm) 3567 return -EIO; 3568 switch (ioctl) { 3569 case KVM_GET_DIRTY_LOG: { 3570 struct compat_kvm_dirty_log compat_log; 3571 struct kvm_dirty_log log; 3572 3573 if (copy_from_user(&compat_log, (void __user *)arg, 3574 sizeof(compat_log))) 3575 return -EFAULT; 3576 log.slot = compat_log.slot; 3577 log.padding1 = compat_log.padding1; 3578 log.padding2 = compat_log.padding2; 3579 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); 3580 3581 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 3582 break; 3583 } 3584 default: 3585 r = kvm_vm_ioctl(filp, ioctl, arg); 3586 } 3587 return r; 3588 } 3589 #endif 3590 3591 static struct file_operations kvm_vm_fops = { 3592 .release = kvm_vm_release, 3593 .unlocked_ioctl = kvm_vm_ioctl, 3594 .llseek = noop_llseek, 3595 KVM_COMPAT(kvm_vm_compat_ioctl), 3596 }; 3597 3598 static int kvm_dev_ioctl_create_vm(unsigned long type) 3599 { 3600 int r; 3601 struct kvm *kvm; 3602 struct file *file; 3603 3604 kvm = kvm_create_vm(type); 3605 if (IS_ERR(kvm)) 3606 return PTR_ERR(kvm); 3607 #ifdef CONFIG_KVM_MMIO 3608 r = kvm_coalesced_mmio_init(kvm); 3609 if (r < 0) 3610 goto put_kvm; 3611 #endif 3612 r = get_unused_fd_flags(O_CLOEXEC); 3613 if (r < 0) 3614 goto put_kvm; 3615 3616 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); 3617 if (IS_ERR(file)) { 3618 put_unused_fd(r); 3619 r = PTR_ERR(file); 3620 goto put_kvm; 3621 } 3622 3623 /* 3624 * Don't call kvm_put_kvm anymore at this point; file->f_op is 3625 * already set, with ->release() being kvm_vm_release(). 
In error 3626 * cases it will be called by the final fput(file) and will take 3627 * care of doing kvm_put_kvm(kvm). 3628 */ 3629 if (kvm_create_vm_debugfs(kvm, r) < 0) { 3630 put_unused_fd(r); 3631 fput(file); 3632 return -ENOMEM; 3633 } 3634 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm); 3635 3636 fd_install(r, file); 3637 return r; 3638 3639 put_kvm: 3640 kvm_put_kvm(kvm); 3641 return r; 3642 } 3643 3644 static long kvm_dev_ioctl(struct file *filp, 3645 unsigned int ioctl, unsigned long arg) 3646 { 3647 long r = -EINVAL; 3648 3649 switch (ioctl) { 3650 case KVM_GET_API_VERSION: 3651 if (arg) 3652 goto out; 3653 r = KVM_API_VERSION; 3654 break; 3655 case KVM_CREATE_VM: 3656 r = kvm_dev_ioctl_create_vm(arg); 3657 break; 3658 case KVM_CHECK_EXTENSION: 3659 r = kvm_vm_ioctl_check_extension_generic(NULL, arg); 3660 break; 3661 case KVM_GET_VCPU_MMAP_SIZE: 3662 if (arg) 3663 goto out; 3664 r = PAGE_SIZE; /* struct kvm_run */ 3665 #ifdef CONFIG_X86 3666 r += PAGE_SIZE; /* pio data page */ 3667 #endif 3668 #ifdef CONFIG_KVM_MMIO 3669 r += PAGE_SIZE; /* coalesced mmio ring page */ 3670 #endif 3671 break; 3672 case KVM_TRACE_ENABLE: 3673 case KVM_TRACE_PAUSE: 3674 case KVM_TRACE_DISABLE: 3675 r = -EOPNOTSUPP; 3676 break; 3677 default: 3678 return kvm_arch_dev_ioctl(filp, ioctl, arg); 3679 } 3680 out: 3681 return r; 3682 } 3683 3684 static struct file_operations kvm_chardev_ops = { 3685 .unlocked_ioctl = kvm_dev_ioctl, 3686 .llseek = noop_llseek, 3687 KVM_COMPAT(kvm_dev_ioctl), 3688 }; 3689 3690 static struct miscdevice kvm_dev = { 3691 KVM_MINOR, 3692 "kvm", 3693 &kvm_chardev_ops, 3694 }; 3695 3696 static void hardware_enable_nolock(void *junk) 3697 { 3698 int cpu = raw_smp_processor_id(); 3699 int r; 3700 3701 if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) 3702 return; 3703 3704 cpumask_set_cpu(cpu, cpus_hardware_enabled); 3705 3706 r = kvm_arch_hardware_enable(); 3707 3708 if (r) { 3709 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 3710 atomic_inc(&hardware_enable_failed); 3711 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu); 3712 } 3713 } 3714 3715 static int kvm_starting_cpu(unsigned int cpu) 3716 { 3717 raw_spin_lock(&kvm_count_lock); 3718 if (kvm_usage_count) 3719 hardware_enable_nolock(NULL); 3720 raw_spin_unlock(&kvm_count_lock); 3721 return 0; 3722 } 3723 3724 static void hardware_disable_nolock(void *junk) 3725 { 3726 int cpu = raw_smp_processor_id(); 3727 3728 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) 3729 return; 3730 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 3731 kvm_arch_hardware_disable(); 3732 } 3733 3734 static int kvm_dying_cpu(unsigned int cpu) 3735 { 3736 raw_spin_lock(&kvm_count_lock); 3737 if (kvm_usage_count) 3738 hardware_disable_nolock(NULL); 3739 raw_spin_unlock(&kvm_count_lock); 3740 return 0; 3741 } 3742 3743 static void hardware_disable_all_nolock(void) 3744 { 3745 BUG_ON(!kvm_usage_count); 3746 3747 kvm_usage_count--; 3748 if (!kvm_usage_count) 3749 on_each_cpu(hardware_disable_nolock, NULL, 1); 3750 } 3751 3752 static void hardware_disable_all(void) 3753 { 3754 raw_spin_lock(&kvm_count_lock); 3755 hardware_disable_all_nolock(); 3756 raw_spin_unlock(&kvm_count_lock); 3757 } 3758 3759 static int hardware_enable_all(void) 3760 { 3761 int r = 0; 3762 3763 raw_spin_lock(&kvm_count_lock); 3764 3765 kvm_usage_count++; 3766 if (kvm_usage_count == 1) { 3767 atomic_set(&hardware_enable_failed, 0); 3768 on_each_cpu(hardware_enable_nolock, NULL, 1); 3769 3770 if (atomic_read(&hardware_enable_failed)) { 3771 hardware_disable_all_nolock(); 
3772 r = -EBUSY; 3773 } 3774 } 3775 3776 raw_spin_unlock(&kvm_count_lock); 3777 3778 return r; 3779 } 3780 3781 static int kvm_reboot(struct notifier_block *notifier, unsigned long val, 3782 void *v) 3783 { 3784 /* 3785 * Some (well, at least mine) BIOSes hang on reboot if 3786 * in vmx root mode. 3787 * 3788 * And Intel TXT required VMX off for all cpu when system shutdown. 3789 */ 3790 pr_info("kvm: exiting hardware virtualization\n"); 3791 kvm_rebooting = true; 3792 on_each_cpu(hardware_disable_nolock, NULL, 1); 3793 return NOTIFY_OK; 3794 } 3795 3796 static struct notifier_block kvm_reboot_notifier = { 3797 .notifier_call = kvm_reboot, 3798 .priority = 0, 3799 }; 3800 3801 static void kvm_io_bus_destroy(struct kvm_io_bus *bus) 3802 { 3803 int i; 3804 3805 for (i = 0; i < bus->dev_count; i++) { 3806 struct kvm_io_device *pos = bus->range[i].dev; 3807 3808 kvm_iodevice_destructor(pos); 3809 } 3810 kfree(bus); 3811 } 3812 3813 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1, 3814 const struct kvm_io_range *r2) 3815 { 3816 gpa_t addr1 = r1->addr; 3817 gpa_t addr2 = r2->addr; 3818 3819 if (addr1 < addr2) 3820 return -1; 3821 3822 /* If r2->len == 0, match the exact address. If r2->len != 0, 3823 * accept any overlapping write. Any order is acceptable for 3824 * overlapping ranges, because kvm_io_bus_get_first_dev ensures 3825 * we process all of them. 3826 */ 3827 if (r2->len) { 3828 addr1 += r1->len; 3829 addr2 += r2->len; 3830 } 3831 3832 if (addr1 > addr2) 3833 return 1; 3834 3835 return 0; 3836 } 3837 3838 static int kvm_io_bus_sort_cmp(const void *p1, const void *p2) 3839 { 3840 return kvm_io_bus_cmp(p1, p2); 3841 } 3842 3843 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, 3844 gpa_t addr, int len) 3845 { 3846 struct kvm_io_range *range, key; 3847 int off; 3848 3849 key = (struct kvm_io_range) { 3850 .addr = addr, 3851 .len = len, 3852 }; 3853 3854 range = bsearch(&key, bus->range, bus->dev_count, 3855 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp); 3856 if (range == NULL) 3857 return -ENOENT; 3858 3859 off = range - bus->range; 3860 3861 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0) 3862 off--; 3863 3864 return off; 3865 } 3866 3867 static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 3868 struct kvm_io_range *range, const void *val) 3869 { 3870 int idx; 3871 3872 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 3873 if (idx < 0) 3874 return -EOPNOTSUPP; 3875 3876 while (idx < bus->dev_count && 3877 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 3878 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr, 3879 range->len, val)) 3880 return idx; 3881 idx++; 3882 } 3883 3884 return -EOPNOTSUPP; 3885 } 3886 3887 /* kvm_io_bus_write - called under kvm->slots_lock */ 3888 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 3889 int len, const void *val) 3890 { 3891 struct kvm_io_bus *bus; 3892 struct kvm_io_range range; 3893 int r; 3894 3895 range = (struct kvm_io_range) { 3896 .addr = addr, 3897 .len = len, 3898 }; 3899 3900 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 3901 if (!bus) 3902 return -ENOMEM; 3903 r = __kvm_io_bus_write(vcpu, bus, &range, val); 3904 return r < 0 ? 
r : 0; 3905 } 3906 EXPORT_SYMBOL_GPL(kvm_io_bus_write); 3907 3908 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */ 3909 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, 3910 gpa_t addr, int len, const void *val, long cookie) 3911 { 3912 struct kvm_io_bus *bus; 3913 struct kvm_io_range range; 3914 3915 range = (struct kvm_io_range) { 3916 .addr = addr, 3917 .len = len, 3918 }; 3919 3920 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 3921 if (!bus) 3922 return -ENOMEM; 3923 3924 /* First try the device referenced by cookie. */ 3925 if ((cookie >= 0) && (cookie < bus->dev_count) && 3926 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0)) 3927 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len, 3928 val)) 3929 return cookie; 3930 3931 /* 3932 * cookie contained garbage; fall back to search and return the 3933 * correct cookie value. 3934 */ 3935 return __kvm_io_bus_write(vcpu, bus, &range, val); 3936 } 3937 3938 static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 3939 struct kvm_io_range *range, void *val) 3940 { 3941 int idx; 3942 3943 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 3944 if (idx < 0) 3945 return -EOPNOTSUPP; 3946 3947 while (idx < bus->dev_count && 3948 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 3949 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr, 3950 range->len, val)) 3951 return idx; 3952 idx++; 3953 } 3954 3955 return -EOPNOTSUPP; 3956 } 3957 3958 /* kvm_io_bus_read - called under kvm->slots_lock */ 3959 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 3960 int len, void *val) 3961 { 3962 struct kvm_io_bus *bus; 3963 struct kvm_io_range range; 3964 int r; 3965 3966 range = (struct kvm_io_range) { 3967 .addr = addr, 3968 .len = len, 3969 }; 3970 3971 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 3972 if (!bus) 3973 return -ENOMEM; 3974 r = __kvm_io_bus_read(vcpu, bus, &range, val); 3975 return r < 0 ? r : 0; 3976 } 3977 3978 /* Caller must hold slots_lock. */ 3979 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 3980 int len, struct kvm_io_device *dev) 3981 { 3982 int i; 3983 struct kvm_io_bus *new_bus, *bus; 3984 struct kvm_io_range range; 3985 3986 bus = kvm_get_bus(kvm, bus_idx); 3987 if (!bus) 3988 return -ENOMEM; 3989 3990 /* exclude ioeventfd which is limited by maximum fd */ 3991 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) 3992 return -ENOSPC; 3993 3994 new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1), 3995 GFP_KERNEL_ACCOUNT); 3996 if (!new_bus) 3997 return -ENOMEM; 3998 3999 range = (struct kvm_io_range) { 4000 .addr = addr, 4001 .len = len, 4002 .dev = dev, 4003 }; 4004 4005 for (i = 0; i < bus->dev_count; i++) 4006 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0) 4007 break; 4008 4009 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); 4010 new_bus->dev_count++; 4011 new_bus->range[i] = range; 4012 memcpy(new_bus->range + i + 1, bus->range + i, 4013 (bus->dev_count - i) * sizeof(struct kvm_io_range)); 4014 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 4015 synchronize_srcu_expedited(&kvm->srcu); 4016 kfree(bus); 4017 4018 return 0; 4019 } 4020 4021 /* Caller must hold slots_lock. 
*/ 4022 void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, 4023 struct kvm_io_device *dev) 4024 { 4025 int i; 4026 struct kvm_io_bus *new_bus, *bus; 4027 4028 bus = kvm_get_bus(kvm, bus_idx); 4029 if (!bus) 4030 return; 4031 4032 for (i = 0; i < bus->dev_count; i++) 4033 if (bus->range[i].dev == dev) { 4034 break; 4035 } 4036 4037 if (i == bus->dev_count) 4038 return; 4039 4040 new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1), 4041 GFP_KERNEL_ACCOUNT); 4042 if (!new_bus) { 4043 pr_err("kvm: failed to shrink bus, removing it completely\n"); 4044 goto broken; 4045 } 4046 4047 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); 4048 new_bus->dev_count--; 4049 memcpy(new_bus->range + i, bus->range + i + 1, 4050 (new_bus->dev_count - i) * sizeof(struct kvm_io_range)); 4051 4052 broken: 4053 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 4054 synchronize_srcu_expedited(&kvm->srcu); 4055 kfree(bus); 4056 return; 4057 } 4058 4059 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, 4060 gpa_t addr) 4061 { 4062 struct kvm_io_bus *bus; 4063 int dev_idx, srcu_idx; 4064 struct kvm_io_device *iodev = NULL; 4065 4066 srcu_idx = srcu_read_lock(&kvm->srcu); 4067 4068 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); 4069 if (!bus) 4070 goto out_unlock; 4071 4072 dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1); 4073 if (dev_idx < 0) 4074 goto out_unlock; 4075 4076 iodev = bus->range[dev_idx].dev; 4077 4078 out_unlock: 4079 srcu_read_unlock(&kvm->srcu, srcu_idx); 4080 4081 return iodev; 4082 } 4083 EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev); 4084 4085 static int kvm_debugfs_open(struct inode *inode, struct file *file, 4086 int (*get)(void *, u64 *), int (*set)(void *, u64), 4087 const char *fmt) 4088 { 4089 struct kvm_stat_data *stat_data = (struct kvm_stat_data *) 4090 inode->i_private; 4091 4092 /* The debugfs files are a reference to the kvm struct which 4093 * is still valid when kvm_destroy_vm is called. 4094 * To avoid the race between open and the removal of the debugfs 4095 * directory we test against the users count. 4096 */ 4097 if (!refcount_inc_not_zero(&stat_data->kvm->users_count)) 4098 return -ENOENT; 4099 4100 if (simple_attr_open(inode, file, get, 4101 KVM_DBGFS_GET_MODE(stat_data->dbgfs_item) & 0222 4102 ? 
set : NULL, 4103 fmt)) { 4104 kvm_put_kvm(stat_data->kvm); 4105 return -ENOMEM; 4106 } 4107 4108 return 0; 4109 } 4110 4111 static int kvm_debugfs_release(struct inode *inode, struct file *file) 4112 { 4113 struct kvm_stat_data *stat_data = (struct kvm_stat_data *) 4114 inode->i_private; 4115 4116 simple_attr_release(inode, file); 4117 kvm_put_kvm(stat_data->kvm); 4118 4119 return 0; 4120 } 4121 4122 static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val) 4123 { 4124 *val = *(ulong *)((void *)kvm + offset); 4125 4126 return 0; 4127 } 4128 4129 static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset) 4130 { 4131 *(ulong *)((void *)kvm + offset) = 0; 4132 4133 return 0; 4134 } 4135 4136 static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val) 4137 { 4138 int i; 4139 struct kvm_vcpu *vcpu; 4140 4141 *val = 0; 4142 4143 kvm_for_each_vcpu(i, vcpu, kvm) 4144 *val += *(u64 *)((void *)vcpu + offset); 4145 4146 return 0; 4147 } 4148 4149 static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset) 4150 { 4151 int i; 4152 struct kvm_vcpu *vcpu; 4153 4154 kvm_for_each_vcpu(i, vcpu, kvm) 4155 *(u64 *)((void *)vcpu + offset) = 0; 4156 4157 return 0; 4158 } 4159 4160 static int kvm_stat_data_get(void *data, u64 *val) 4161 { 4162 int r = -EFAULT; 4163 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 4164 4165 switch (stat_data->dbgfs_item->kind) { 4166 case KVM_STAT_VM: 4167 r = kvm_get_stat_per_vm(stat_data->kvm, 4168 stat_data->dbgfs_item->offset, val); 4169 break; 4170 case KVM_STAT_VCPU: 4171 r = kvm_get_stat_per_vcpu(stat_data->kvm, 4172 stat_data->dbgfs_item->offset, val); 4173 break; 4174 } 4175 4176 return r; 4177 } 4178 4179 static int kvm_stat_data_clear(void *data, u64 val) 4180 { 4181 int r = -EFAULT; 4182 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 4183 4184 if (val) 4185 return -EINVAL; 4186 4187 switch (stat_data->dbgfs_item->kind) { 4188 case KVM_STAT_VM: 4189 r = kvm_clear_stat_per_vm(stat_data->kvm, 4190 stat_data->dbgfs_item->offset); 4191 break; 4192 case KVM_STAT_VCPU: 4193 r = kvm_clear_stat_per_vcpu(stat_data->kvm, 4194 stat_data->dbgfs_item->offset); 4195 break; 4196 } 4197 4198 return r; 4199 } 4200 4201 static int kvm_stat_data_open(struct inode *inode, struct file *file) 4202 { 4203 __simple_attr_check_format("%llu\n", 0ull); 4204 return kvm_debugfs_open(inode, file, kvm_stat_data_get, 4205 kvm_stat_data_clear, "%llu\n"); 4206 } 4207 4208 static const struct file_operations stat_fops_per_vm = { 4209 .owner = THIS_MODULE, 4210 .open = kvm_stat_data_open, 4211 .release = kvm_debugfs_release, 4212 .read = simple_attr_read, 4213 .write = simple_attr_write, 4214 .llseek = no_llseek, 4215 }; 4216 4217 static int vm_stat_get(void *_offset, u64 *val) 4218 { 4219 unsigned offset = (long)_offset; 4220 struct kvm *kvm; 4221 u64 tmp_val; 4222 4223 *val = 0; 4224 mutex_lock(&kvm_lock); 4225 list_for_each_entry(kvm, &vm_list, vm_list) { 4226 kvm_get_stat_per_vm(kvm, offset, &tmp_val); 4227 *val += tmp_val; 4228 } 4229 mutex_unlock(&kvm_lock); 4230 return 0; 4231 } 4232 4233 static int vm_stat_clear(void *_offset, u64 val) 4234 { 4235 unsigned offset = (long)_offset; 4236 struct kvm *kvm; 4237 4238 if (val) 4239 return -EINVAL; 4240 4241 mutex_lock(&kvm_lock); 4242 list_for_each_entry(kvm, &vm_list, vm_list) { 4243 kvm_clear_stat_per_vm(kvm, offset); 4244 } 4245 mutex_unlock(&kvm_lock); 4246 4247 return 0; 4248 } 4249 4250 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n"); 4251 4252 
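/*
 * Illustrative sketch of how the attribute above gets wired up.  The entry
 * below is hypothetical (the real table is the architecture's
 * debugfs_entries[], consumed by kvm_init_debug() further down); the field
 * names match the struct kvm_stats_debugfs_item accesses used in this file:
 *
 *	static struct kvm_stats_debugfs_item debugfs_entries[] = {
 *		{ .name   = "demo_vm_stat",
 *		  .offset = offsetof(struct kvm, stat.demo_vm_stat),
 *		  .kind   = KVM_STAT_VM },
 *		{ NULL }
 *	};
 *
 * Reading /sys/kernel/debug/kvm/demo_vm_stat then reaches vm_stat_get()
 * with _offset == offsetof(...), which walks vm_list and accumulates
 * *(ulong *)((void *)kvm + offset) for every VM via kvm_get_stat_per_vm().
 * The per-VM copies of the same file use stat_fops_per_vm with a struct
 * kvm_stat_data instead, reading a single VM (or summing its vCPUs for
 * KVM_STAT_VCPU items).
 */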
static int vcpu_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;
	u64 tmp_val;

	*val = 0;
	mutex_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list) {
		kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
		*val += tmp_val;
	}
	mutex_unlock(&kvm_lock);
	return 0;
}

static int vcpu_stat_clear(void *_offset, u64 val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;

	if (val)
		return -EINVAL;

	mutex_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list) {
		kvm_clear_stat_per_vcpu(kvm, offset);
	}
	mutex_unlock(&kvm_lock);

	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
			"%llu\n");

static const struct file_operations *stat_fops[] = {
	[KVM_STAT_VCPU] = &vcpu_stat_fops,
	[KVM_STAT_VM] = &vm_stat_fops,
};

static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
{
	struct kobj_uevent_env *env;
	unsigned long long created, active;

	if (!kvm_dev.this_device || !kvm)
		return;

	mutex_lock(&kvm_lock);
	if (type == KVM_EVENT_CREATE_VM) {
		kvm_createvm_count++;
		kvm_active_vms++;
	} else if (type == KVM_EVENT_DESTROY_VM) {
		kvm_active_vms--;
	}
	created = kvm_createvm_count;
	active = kvm_active_vms;
	mutex_unlock(&kvm_lock);

	env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
	if (!env)
		return;

	add_uevent_var(env, "CREATED=%llu", created);
	add_uevent_var(env, "COUNT=%llu", active);

	if (type == KVM_EVENT_CREATE_VM) {
		add_uevent_var(env, "EVENT=create");
		kvm->userspace_pid = task_pid_nr(current);
	} else if (type == KVM_EVENT_DESTROY_VM) {
		add_uevent_var(env, "EVENT=destroy");
	}
	add_uevent_var(env, "PID=%d", kvm->userspace_pid);

	if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) {
		char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);

		if (p) {
			tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
			if (!IS_ERR(tmp))
				add_uevent_var(env, "STATS_PATH=%s", tmp);
			kfree(p);
		}
	}
	/* no need for checks, since we are adding at most 5 keys */
	env->envp[env->envp_idx++] = NULL;
	kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
	kfree(env);
}

static void kvm_init_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);

	kvm_debugfs_num_entries = 0;
	for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
		debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
				    kvm_debugfs_dir, (void *)(long)p->offset,
				    stat_fops[p->kind]);
	}
}

static int kvm_suspend(void)
{
	if (kvm_usage_count)
		hardware_disable_nolock(NULL);
	return 0;
}

static void kvm_resume(void)
{
	if (kvm_usage_count) {
#ifdef CONFIG_LOCKDEP
		WARN_ON(lockdep_is_held(&kvm_count_lock));
#endif
		hardware_enable_nolock(NULL);
	}
}

static struct syscore_ops kvm_syscore_ops = {
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
	return container_of(pn, struct kvm_vcpu, preempt_notifier);
}
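
/*
 * Preempt notifier hooks: kvm_sched_in() runs when a task that has a vCPU
 * loaded is scheduled onto a CPU, kvm_sched_out() when it is preempted or
 * blocks.  They keep the kvm_running_vcpu per-CPU pointer up to date and let
 * the architecture code save and restore guest state around the context
 * switch.
 */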
static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	WRITE_ONCE(vcpu->preempted, false);
	WRITE_ONCE(vcpu->ready, false);

	__this_cpu_write(kvm_running_vcpu, vcpu);
	kvm_arch_sched_in(vcpu, cpu);
	kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
			  struct task_struct *next)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	if (current->state == TASK_RUNNING) {
		WRITE_ONCE(vcpu->preempted, true);
		WRITE_ONCE(vcpu->ready, true);
	}
	kvm_arch_vcpu_put(vcpu);
	__this_cpu_write(kvm_running_vcpu, NULL);
}

/**
 * kvm_get_running_vcpu - get the vcpu running on the current CPU.
 *
 * We can safely disable preemption only around the access to the per-CPU
 * variable and keep using the resolved vcpu pointer after re-enabling
 * preemption: even if the current thread migrates to another CPU, the
 * preempt notifier handlers update kvm_running_vcpu on the new CPU as well,
 * so reading the per-CPU value later still yields the same vcpu.
 */
struct kvm_vcpu *kvm_get_running_vcpu(void)
{
	struct kvm_vcpu *vcpu;

	preempt_disable();
	vcpu = __this_cpu_read(kvm_running_vcpu);
	preempt_enable();

	return vcpu;
}

/**
 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
 */
struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
{
	return &kvm_running_vcpu;
}

static void check_processor_compat(void *rtn)
{
	*(int *)rtn = kvm_arch_check_processor_compat();
}
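
/*
 * kvm_init - register KVM for the architecture backend that calls it.
 * @opaque: opaque pointer passed through to kvm_arch_init()
 * @vcpu_size: size of the architecture's vcpu structure, used to size the
 *	"kvm_vcpu" kmem cache
 * @vcpu_align: required alignment of that structure; 0 selects the default
 * @module: the calling module, installed as the owner of the KVM file
 *	operations so it stays pinned while /dev/kvm or any VM/vCPU fd is open
 *
 * On x86 this is called by the kvm-intel or kvm-amd module; kvm_exit()
 * reverses the setup.
 */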
int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
	     struct module *module)
{
	int r;
	int cpu;

	r = kvm_arch_init(opaque);
	if (r)
		goto out_fail;

	/*
	 * kvm_arch_init makes sure there's at most one caller
	 * for architectures that support multiple implementations,
	 * like Intel and AMD on x86.
	 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
	 * conflicts in case kvm is already set up for another implementation.
	 */
	r = kvm_irqfd_init();
	if (r)
		goto out_irqfd;

	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
		r = -ENOMEM;
		goto out_free_0;
	}

	r = kvm_arch_hardware_setup();
	if (r < 0)
		goto out_free_1;

	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu, check_processor_compat, &r, 1);
		if (r < 0)
			goto out_free_2;
	}

	r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
				      kvm_starting_cpu, kvm_dying_cpu);
	if (r)
		goto out_free_2;
	register_reboot_notifier(&kvm_reboot_notifier);

	/* A kmem cache lets us meet the alignment requirements of fx_save. */
	if (!vcpu_align)
		vcpu_align = __alignof__(struct kvm_vcpu);
	kvm_vcpu_cache =
		kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
					   SLAB_ACCOUNT,
					   offsetof(struct kvm_vcpu, arch),
					   sizeof_field(struct kvm_vcpu, arch),
					   NULL);
	if (!kvm_vcpu_cache) {
		r = -ENOMEM;
		goto out_free_3;
	}

	r = kvm_async_pf_init();
	if (r)
		goto out_free;

	kvm_chardev_ops.owner = module;
	kvm_vm_fops.owner = module;
	kvm_vcpu_fops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
		pr_err("kvm: misc device register failed\n");
		goto out_unreg;
	}

	register_syscore_ops(&kvm_syscore_ops);

	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

	kvm_init_debug();

	r = kvm_vfio_ops_init();
	WARN_ON(r);

	return 0;

out_unreg:
	kvm_async_pf_deinit();
out_free:
	kmem_cache_destroy(kvm_vcpu_cache);
out_free_3:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
out_free_2:
	kvm_arch_hardware_unsetup();
out_free_1:
	free_cpumask_var(cpus_hardware_enabled);
out_free_0:
	kvm_irqfd_exit();
out_irqfd:
	kvm_arch_exit();
out_fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

void kvm_exit(void)
{
	debugfs_remove_recursive(kvm_debugfs_dir);
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	kvm_async_pf_deinit();
	unregister_syscore_ops(&kvm_syscore_ops);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
	on_each_cpu(hardware_disable_nolock, NULL, 1);
	kvm_arch_hardware_unsetup();
	kvm_arch_exit();
	kvm_irqfd_exit();
	free_cpumask_var(cpus_hardware_enabled);
	kvm_vfio_ops_exit();
}
EXPORT_SYMBOL_GPL(kvm_exit);
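
/*
 * Helpers for spawning a per-VM worker kthread, used via
 * kvm_vm_create_worker_thread().  The context below lives on the creating
 * thread's stack only until init_done is completed, so the worker copies out
 * everything it needs before signalling completion.
 */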
struct kvm_vm_worker_thread_context {
	struct kvm *kvm;
	struct task_struct *parent;
	struct completion init_done;
	kvm_vm_thread_fn_t thread_fn;
	uintptr_t data;
	int err;
};

static int kvm_vm_worker_thread(void *context)
{
	/*
	 * The init_context is allocated on the stack of the parent thread, so
	 * we have to locally copy anything that is needed beyond
	 * initialization.
	 */
	struct kvm_vm_worker_thread_context *init_context = context;
	struct kvm *kvm = init_context->kvm;
	kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
	uintptr_t data = init_context->data;
	int err;

	err = kthread_park(current);
	/* kthread_park(current) is never supposed to return an error */
	WARN_ON(err != 0);
	if (err)
		goto init_complete;

	err = cgroup_attach_task_all(init_context->parent, current);
	if (err) {
		kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
			__func__, err);
		goto init_complete;
	}

	set_user_nice(current, task_nice(init_context->parent));

init_complete:
	init_context->err = err;
	complete(&init_context->init_done);
	init_context = NULL;

	if (err)
		return err;

	/* Wait to be woken up by the spawner before proceeding. */
	kthread_parkme();

	if (!kthread_should_stop())
		err = thread_fn(kvm, data);

	return err;
}

int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
				uintptr_t data, const char *name,
				struct task_struct **thread_ptr)
{
	struct kvm_vm_worker_thread_context init_context = {};
	struct task_struct *thread;

	*thread_ptr = NULL;
	init_context.kvm = kvm;
	init_context.parent = current;
	init_context.thread_fn = thread_fn;
	init_context.data = data;
	init_completion(&init_context.init_done);

	thread = kthread_run(kvm_vm_worker_thread, &init_context,
			     "%s-%d", name, task_pid_nr(current));
	if (IS_ERR(thread))
		return PTR_ERR(thread);

	/* kthread_run is never supposed to return NULL */
	WARN_ON(thread == NULL);

	wait_for_completion(&init_context.init_done);

	if (!init_context.err)
		*thread_ptr = thread;

	return init_context.err;
}
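
/*
 * Illustrative usage (the names here are hypothetical, not part of this
 * file): an architecture that wants a per-VM housekeeping thread could do
 *
 *	r = kvm_vm_create_worker_thread(kvm, my_recovery_fn, 0,
 *					"kvm-recovery", &my_thread);
 *	if (!r)
 *		kthread_unpark(my_thread);
 *
 * The worker starts parked, so the caller must unpark it before thread_fn
 * runs; it is stopped later with kthread_stop(), e.g. during VM destruction.
 */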