1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Kernel-based Virtual Machine driver for Linux 4 * 5 * This module enables machines with Intel VT-x extensions to run virtual 6 * machines without emulation or binary translation. 7 * 8 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 10 * 11 * Authors: 12 * Avi Kivity <avi@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com> 14 */ 15 16 #include <kvm/iodev.h> 17 18 #include <linux/kvm_host.h> 19 #include <linux/kvm.h> 20 #include <linux/module.h> 21 #include <linux/errno.h> 22 #include <linux/percpu.h> 23 #include <linux/mm.h> 24 #include <linux/miscdevice.h> 25 #include <linux/vmalloc.h> 26 #include <linux/reboot.h> 27 #include <linux/debugfs.h> 28 #include <linux/highmem.h> 29 #include <linux/file.h> 30 #include <linux/syscore_ops.h> 31 #include <linux/cpu.h> 32 #include <linux/sched/signal.h> 33 #include <linux/sched/mm.h> 34 #include <linux/sched/stat.h> 35 #include <linux/cpumask.h> 36 #include <linux/smp.h> 37 #include <linux/anon_inodes.h> 38 #include <linux/profile.h> 39 #include <linux/kvm_para.h> 40 #include <linux/pagemap.h> 41 #include <linux/mman.h> 42 #include <linux/swap.h> 43 #include <linux/bitops.h> 44 #include <linux/spinlock.h> 45 #include <linux/compat.h> 46 #include <linux/srcu.h> 47 #include <linux/hugetlb.h> 48 #include <linux/slab.h> 49 #include <linux/sort.h> 50 #include <linux/bsearch.h> 51 #include <linux/io.h> 52 #include <linux/lockdep.h> 53 #include <linux/kthread.h> 54 55 #include <asm/processor.h> 56 #include <asm/ioctl.h> 57 #include <linux/uaccess.h> 58 #include <asm/pgtable.h> 59 60 #include "coalesced_mmio.h" 61 #include "async_pf.h" 62 #include "vfio.h" 63 64 #define CREATE_TRACE_POINTS 65 #include <trace/events/kvm.h> 66 67 /* Worst case buffer size needed for holding an integer. */ 68 #define ITOA_MAX_LEN 12 69 70 MODULE_AUTHOR("Qumranet"); 71 MODULE_LICENSE("GPL"); 72 73 /* Architectures should define their poll value according to the halt latency */ 74 unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT; 75 module_param(halt_poll_ns, uint, 0644); 76 EXPORT_SYMBOL_GPL(halt_poll_ns); 77 78 /* Default doubles per-vcpu halt_poll_ns. */ 79 unsigned int halt_poll_ns_grow = 2; 80 module_param(halt_poll_ns_grow, uint, 0644); 81 EXPORT_SYMBOL_GPL(halt_poll_ns_grow); 82 83 /* The start value to grow halt_poll_ns from */ 84 unsigned int halt_poll_ns_grow_start = 10000; /* 10us */ 85 module_param(halt_poll_ns_grow_start, uint, 0644); 86 EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start); 87 88 /* Default resets per-vcpu halt_poll_ns . 
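 *
 * For illustration (based on the grow/shrink helpers used by kvm_vcpu_block();
 * details vary by architecture and kernel version): with the defaults above a
 * vCPU's poll window grows from 0 to halt_poll_ns_grow_start (10us) and then
 * doubles on each grow, capped near halt_poll_ns, while halt_poll_ns_shrink == 0
 * makes an unsuccessful poll reset the window to 0 instead of dividing it.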
*/ 89 unsigned int halt_poll_ns_shrink; 90 module_param(halt_poll_ns_shrink, uint, 0644); 91 EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); 92 93 /* 94 * Ordering of locks: 95 * 96 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock 97 */ 98 99 DEFINE_MUTEX(kvm_lock); 100 static DEFINE_RAW_SPINLOCK(kvm_count_lock); 101 LIST_HEAD(vm_list); 102 103 static cpumask_var_t cpus_hardware_enabled; 104 static int kvm_usage_count; 105 static atomic_t hardware_enable_failed; 106 107 static struct kmem_cache *kvm_vcpu_cache; 108 109 static __read_mostly struct preempt_ops kvm_preempt_ops; 110 static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu); 111 112 struct dentry *kvm_debugfs_dir; 113 EXPORT_SYMBOL_GPL(kvm_debugfs_dir); 114 115 static int kvm_debugfs_num_entries; 116 static const struct file_operations stat_fops_per_vm; 117 118 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 119 unsigned long arg); 120 #ifdef CONFIG_KVM_COMPAT 121 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, 122 unsigned long arg); 123 #define KVM_COMPAT(c) .compat_ioctl = (c) 124 #else 125 /* 126 * For architectures that don't implement a compat infrastructure, 127 * adopt a double line of defense: 128 * - Prevent a compat task from opening /dev/kvm 129 * - If the open has been done by a 64bit task, and the KVM fd 130 * passed to a compat task, let the ioctls fail. 131 */ 132 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl, 133 unsigned long arg) { return -EINVAL; } 134 135 static int kvm_no_compat_open(struct inode *inode, struct file *file) 136 { 137 return is_compat_task() ? -ENODEV : 0; 138 } 139 #define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \ 140 .open = kvm_no_compat_open 141 #endif 142 static int hardware_enable_all(void); 143 static void hardware_disable_all(void); 144 145 static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 146 147 static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn); 148 149 __visible bool kvm_rebooting; 150 EXPORT_SYMBOL_GPL(kvm_rebooting); 151 152 #define KVM_EVENT_CREATE_VM 0 153 #define KVM_EVENT_DESTROY_VM 1 154 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); 155 static unsigned long long kvm_createvm_count; 156 static unsigned long long kvm_active_vms; 157 158 __weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, 159 unsigned long start, unsigned long end, bool blockable) 160 { 161 return 0; 162 } 163 164 bool kvm_is_zone_device_pfn(kvm_pfn_t pfn) 165 { 166 /* 167 * The metadata used by is_zone_device_page() to determine whether or 168 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if 169 * the device has been pinned, e.g. by get_user_pages(). WARN if the 170 * page_count() is zero to help detect bad usage of this helper. 171 */ 172 if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn)))) 173 return false; 174 175 return is_zone_device_page(pfn_to_page(pfn)); 176 } 177 178 bool kvm_is_reserved_pfn(kvm_pfn_t pfn) 179 { 180 /* 181 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting 182 * perspective they are "normal" pages, albeit with slightly different 183 * usage rules. 
184 */ 185 if (pfn_valid(pfn)) 186 return PageReserved(pfn_to_page(pfn)) && 187 !is_zero_pfn(pfn) && 188 !kvm_is_zone_device_pfn(pfn); 189 190 return true; 191 } 192 193 bool kvm_is_transparent_hugepage(kvm_pfn_t pfn) 194 { 195 struct page *page = pfn_to_page(pfn); 196 197 if (!PageTransCompoundMap(page)) 198 return false; 199 200 return is_transparent_hugepage(compound_head(page)); 201 } 202 203 /* 204 * Switches to specified vcpu, until a matching vcpu_put() 205 */ 206 void vcpu_load(struct kvm_vcpu *vcpu) 207 { 208 int cpu = get_cpu(); 209 210 __this_cpu_write(kvm_running_vcpu, vcpu); 211 preempt_notifier_register(&vcpu->preempt_notifier); 212 kvm_arch_vcpu_load(vcpu, cpu); 213 put_cpu(); 214 } 215 EXPORT_SYMBOL_GPL(vcpu_load); 216 217 void vcpu_put(struct kvm_vcpu *vcpu) 218 { 219 preempt_disable(); 220 kvm_arch_vcpu_put(vcpu); 221 preempt_notifier_unregister(&vcpu->preempt_notifier); 222 __this_cpu_write(kvm_running_vcpu, NULL); 223 preempt_enable(); 224 } 225 EXPORT_SYMBOL_GPL(vcpu_put); 226 227 /* TODO: merge with kvm_arch_vcpu_should_kick */ 228 static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req) 229 { 230 int mode = kvm_vcpu_exiting_guest_mode(vcpu); 231 232 /* 233 * We need to wait for the VCPU to reenable interrupts and get out of 234 * READING_SHADOW_PAGE_TABLES mode. 235 */ 236 if (req & KVM_REQUEST_WAIT) 237 return mode != OUTSIDE_GUEST_MODE; 238 239 /* 240 * Need to kick a running VCPU, but otherwise there is nothing to do. 241 */ 242 return mode == IN_GUEST_MODE; 243 } 244 245 static void ack_flush(void *_completed) 246 { 247 } 248 249 static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait) 250 { 251 if (unlikely(!cpus)) 252 cpus = cpu_online_mask; 253 254 if (cpumask_empty(cpus)) 255 return false; 256 257 smp_call_function_many(cpus, ack_flush, NULL, wait); 258 return true; 259 } 260 261 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, 262 struct kvm_vcpu *except, 263 unsigned long *vcpu_bitmap, cpumask_var_t tmp) 264 { 265 int i, cpu, me; 266 struct kvm_vcpu *vcpu; 267 bool called; 268 269 me = get_cpu(); 270 271 kvm_for_each_vcpu(i, vcpu, kvm) { 272 if ((vcpu_bitmap && !test_bit(i, vcpu_bitmap)) || 273 vcpu == except) 274 continue; 275 276 kvm_make_request(req, vcpu); 277 cpu = vcpu->cpu; 278 279 if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu)) 280 continue; 281 282 if (tmp != NULL && cpu != -1 && cpu != me && 283 kvm_request_needs_ipi(vcpu, req)) 284 __cpumask_set_cpu(cpu, tmp); 285 } 286 287 called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT)); 288 put_cpu(); 289 290 return called; 291 } 292 293 bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, 294 struct kvm_vcpu *except) 295 { 296 cpumask_var_t cpus; 297 bool called; 298 299 zalloc_cpumask_var(&cpus, GFP_ATOMIC); 300 301 called = kvm_make_vcpus_request_mask(kvm, req, except, NULL, cpus); 302 303 free_cpumask_var(cpus); 304 return called; 305 } 306 307 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) 308 { 309 return kvm_make_all_cpus_request_except(kvm, req, NULL); 310 } 311 312 #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL 313 void kvm_flush_remote_tlbs(struct kvm *kvm) 314 { 315 /* 316 * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in 317 * kvm_make_all_cpus_request. 318 */ 319 long dirty_count = smp_load_acquire(&kvm->tlbs_dirty); 320 321 /* 322 * We want to publish modifications to the page tables before reading 323 * mode. Pairs with a memory barrier in arch-specific code. 
324 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest 325 * and smp_mb in walk_shadow_page_lockless_begin/end. 326 * - powerpc: smp_mb in kvmppc_prepare_to_enter. 327 * 328 * There is already an smp_mb__after_atomic() before 329 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that 330 * barrier here. 331 */ 332 if (!kvm_arch_flush_remote_tlb(kvm) 333 || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 334 ++kvm->stat.remote_tlb_flush; 335 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); 336 } 337 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); 338 #endif 339 340 void kvm_reload_remote_mmus(struct kvm *kvm) 341 { 342 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 343 } 344 345 static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 346 { 347 mutex_init(&vcpu->mutex); 348 vcpu->cpu = -1; 349 vcpu->kvm = kvm; 350 vcpu->vcpu_id = id; 351 vcpu->pid = NULL; 352 init_swait_queue_head(&vcpu->wq); 353 kvm_async_pf_vcpu_init(vcpu); 354 355 vcpu->pre_pcpu = -1; 356 INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); 357 358 kvm_vcpu_set_in_spin_loop(vcpu, false); 359 kvm_vcpu_set_dy_eligible(vcpu, false); 360 vcpu->preempted = false; 361 vcpu->ready = false; 362 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 363 } 364 365 void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) 366 { 367 kvm_arch_vcpu_destroy(vcpu); 368 369 /* 370 * No need for rcu_read_lock as VCPU_RUN is the only place that changes 371 * the vcpu->pid pointer, and at destruction time all file descriptors 372 * are already gone. 373 */ 374 put_pid(rcu_dereference_protected(vcpu->pid, 1)); 375 376 free_page((unsigned long)vcpu->run); 377 kmem_cache_free(kvm_vcpu_cache, vcpu); 378 } 379 EXPORT_SYMBOL_GPL(kvm_vcpu_destroy); 380 381 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 382 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) 383 { 384 return container_of(mn, struct kvm, mmu_notifier); 385 } 386 387 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, 388 struct mm_struct *mm, 389 unsigned long address, 390 pte_t pte) 391 { 392 struct kvm *kvm = mmu_notifier_to_kvm(mn); 393 int idx; 394 395 idx = srcu_read_lock(&kvm->srcu); 396 spin_lock(&kvm->mmu_lock); 397 kvm->mmu_notifier_seq++; 398 399 if (kvm_set_spte_hva(kvm, address, pte)) 400 kvm_flush_remote_tlbs(kvm); 401 402 spin_unlock(&kvm->mmu_lock); 403 srcu_read_unlock(&kvm->srcu, idx); 404 } 405 406 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, 407 const struct mmu_notifier_range *range) 408 { 409 struct kvm *kvm = mmu_notifier_to_kvm(mn); 410 int need_tlb_flush = 0, idx; 411 int ret; 412 413 idx = srcu_read_lock(&kvm->srcu); 414 spin_lock(&kvm->mmu_lock); 415 /* 416 * The count increase must become visible at unlock time as no 417 * spte can be established without taking the mmu_lock and 418 * count is also read inside the mmu_lock critical section. 
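	 *
	 * (Illustrative sketch, not verbatim: the page fault side pairs with
	 * this via mmu_notifier_retry() -- it samples mmu_notifier_seq before
	 * translating the hva and then, under mmu_lock, refuses to install the
	 * spte if mmu_notifier_count is non-zero or the seq has changed, so
	 * faults racing with this invalidation are forced to retry.)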
419 */ 420 kvm->mmu_notifier_count++; 421 need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end); 422 need_tlb_flush |= kvm->tlbs_dirty; 423 /* we've to flush the tlb before the pages can be freed */ 424 if (need_tlb_flush) 425 kvm_flush_remote_tlbs(kvm); 426 427 spin_unlock(&kvm->mmu_lock); 428 429 ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start, 430 range->end, 431 mmu_notifier_range_blockable(range)); 432 433 srcu_read_unlock(&kvm->srcu, idx); 434 435 return ret; 436 } 437 438 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 439 const struct mmu_notifier_range *range) 440 { 441 struct kvm *kvm = mmu_notifier_to_kvm(mn); 442 443 spin_lock(&kvm->mmu_lock); 444 /* 445 * This sequence increase will notify the kvm page fault that 446 * the page that is going to be mapped in the spte could have 447 * been freed. 448 */ 449 kvm->mmu_notifier_seq++; 450 smp_wmb(); 451 /* 452 * The above sequence increase must be visible before the 453 * below count decrease, which is ensured by the smp_wmb above 454 * in conjunction with the smp_rmb in mmu_notifier_retry(). 455 */ 456 kvm->mmu_notifier_count--; 457 spin_unlock(&kvm->mmu_lock); 458 459 BUG_ON(kvm->mmu_notifier_count < 0); 460 } 461 462 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, 463 struct mm_struct *mm, 464 unsigned long start, 465 unsigned long end) 466 { 467 struct kvm *kvm = mmu_notifier_to_kvm(mn); 468 int young, idx; 469 470 idx = srcu_read_lock(&kvm->srcu); 471 spin_lock(&kvm->mmu_lock); 472 473 young = kvm_age_hva(kvm, start, end); 474 if (young) 475 kvm_flush_remote_tlbs(kvm); 476 477 spin_unlock(&kvm->mmu_lock); 478 srcu_read_unlock(&kvm->srcu, idx); 479 480 return young; 481 } 482 483 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, 484 struct mm_struct *mm, 485 unsigned long start, 486 unsigned long end) 487 { 488 struct kvm *kvm = mmu_notifier_to_kvm(mn); 489 int young, idx; 490 491 idx = srcu_read_lock(&kvm->srcu); 492 spin_lock(&kvm->mmu_lock); 493 /* 494 * Even though we do not flush TLB, this will still adversely 495 * affect performance on pre-Haswell Intel EPT, where there is 496 * no EPT Access Bit to clear so that we have to tear down EPT 497 * tables instead. If we find this unacceptable, we can always 498 * add a parameter to kvm_age_hva so that it effectively doesn't 499 * do anything on clear_young. 500 * 501 * Also note that currently we never issue secondary TLB flushes 502 * from clear_young, leaving this job up to the regular system 503 * cadence. If we find this inaccurate, we might come up with a 504 * more sophisticated heuristic later. 
505 */ 506 young = kvm_age_hva(kvm, start, end); 507 spin_unlock(&kvm->mmu_lock); 508 srcu_read_unlock(&kvm->srcu, idx); 509 510 return young; 511 } 512 513 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, 514 struct mm_struct *mm, 515 unsigned long address) 516 { 517 struct kvm *kvm = mmu_notifier_to_kvm(mn); 518 int young, idx; 519 520 idx = srcu_read_lock(&kvm->srcu); 521 spin_lock(&kvm->mmu_lock); 522 young = kvm_test_age_hva(kvm, address); 523 spin_unlock(&kvm->mmu_lock); 524 srcu_read_unlock(&kvm->srcu, idx); 525 526 return young; 527 } 528 529 static void kvm_mmu_notifier_release(struct mmu_notifier *mn, 530 struct mm_struct *mm) 531 { 532 struct kvm *kvm = mmu_notifier_to_kvm(mn); 533 int idx; 534 535 idx = srcu_read_lock(&kvm->srcu); 536 kvm_arch_flush_shadow_all(kvm); 537 srcu_read_unlock(&kvm->srcu, idx); 538 } 539 540 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { 541 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 542 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 543 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 544 .clear_young = kvm_mmu_notifier_clear_young, 545 .test_young = kvm_mmu_notifier_test_young, 546 .change_pte = kvm_mmu_notifier_change_pte, 547 .release = kvm_mmu_notifier_release, 548 }; 549 550 static int kvm_init_mmu_notifier(struct kvm *kvm) 551 { 552 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; 553 return mmu_notifier_register(&kvm->mmu_notifier, current->mm); 554 } 555 556 #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ 557 558 static int kvm_init_mmu_notifier(struct kvm *kvm) 559 { 560 return 0; 561 } 562 563 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ 564 565 static struct kvm_memslots *kvm_alloc_memslots(void) 566 { 567 int i; 568 struct kvm_memslots *slots; 569 570 slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT); 571 if (!slots) 572 return NULL; 573 574 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) 575 slots->id_to_index[i] = -1; 576 577 return slots; 578 } 579 580 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) 581 { 582 if (!memslot->dirty_bitmap) 583 return; 584 585 kvfree(memslot->dirty_bitmap); 586 memslot->dirty_bitmap = NULL; 587 } 588 589 static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) 590 { 591 kvm_destroy_dirty_bitmap(slot); 592 593 kvm_arch_free_memslot(kvm, slot); 594 595 slot->flags = 0; 596 slot->npages = 0; 597 } 598 599 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots) 600 { 601 struct kvm_memory_slot *memslot; 602 603 if (!slots) 604 return; 605 606 kvm_for_each_memslot(memslot, slots) 607 kvm_free_memslot(kvm, memslot); 608 609 kvfree(slots); 610 } 611 612 static void kvm_destroy_vm_debugfs(struct kvm *kvm) 613 { 614 int i; 615 616 if (!kvm->debugfs_dentry) 617 return; 618 619 debugfs_remove_recursive(kvm->debugfs_dentry); 620 621 if (kvm->debugfs_stat_data) { 622 for (i = 0; i < kvm_debugfs_num_entries; i++) 623 kfree(kvm->debugfs_stat_data[i]); 624 kfree(kvm->debugfs_stat_data); 625 } 626 } 627 628 static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) 629 { 630 char dir_name[ITOA_MAX_LEN * 2]; 631 struct kvm_stat_data *stat_data; 632 struct kvm_stats_debugfs_item *p; 633 634 if (!debugfs_initialized()) 635 return 0; 636 637 snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd); 638 kvm->debugfs_dentry = debugfs_create_dir(dir_name, kvm_debugfs_dir); 639 640 kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries, 
641 sizeof(*kvm->debugfs_stat_data), 642 GFP_KERNEL_ACCOUNT); 643 if (!kvm->debugfs_stat_data) 644 return -ENOMEM; 645 646 for (p = debugfs_entries; p->name; p++) { 647 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT); 648 if (!stat_data) 649 return -ENOMEM; 650 651 stat_data->kvm = kvm; 652 stat_data->dbgfs_item = p; 653 kvm->debugfs_stat_data[p - debugfs_entries] = stat_data; 654 debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p), 655 kvm->debugfs_dentry, stat_data, 656 &stat_fops_per_vm); 657 } 658 return 0; 659 } 660 661 /* 662 * Called after the VM is otherwise initialized, but just before adding it to 663 * the vm_list. 664 */ 665 int __weak kvm_arch_post_init_vm(struct kvm *kvm) 666 { 667 return 0; 668 } 669 670 /* 671 * Called just after removing the VM from the vm_list, but before doing any 672 * other destruction. 673 */ 674 void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm) 675 { 676 } 677 678 static struct kvm *kvm_create_vm(unsigned long type) 679 { 680 struct kvm *kvm = kvm_arch_alloc_vm(); 681 int r = -ENOMEM; 682 int i; 683 684 if (!kvm) 685 return ERR_PTR(-ENOMEM); 686 687 spin_lock_init(&kvm->mmu_lock); 688 mmgrab(current->mm); 689 kvm->mm = current->mm; 690 kvm_eventfd_init(kvm); 691 mutex_init(&kvm->lock); 692 mutex_init(&kvm->irq_lock); 693 mutex_init(&kvm->slots_lock); 694 INIT_LIST_HEAD(&kvm->devices); 695 696 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); 697 698 if (init_srcu_struct(&kvm->srcu)) 699 goto out_err_no_srcu; 700 if (init_srcu_struct(&kvm->irq_srcu)) 701 goto out_err_no_irq_srcu; 702 703 refcount_set(&kvm->users_count, 1); 704 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 705 struct kvm_memslots *slots = kvm_alloc_memslots(); 706 707 if (!slots) 708 goto out_err_no_arch_destroy_vm; 709 /* Generations must be different for each address space. 
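		 *
		 * For example, with KVM_ADDRESS_SPACE_NUM == 2, address space 0
		 * starts at generation 0 and address space 1 at generation 1;
		 * install_new_memslots() later advances a generation by
		 * KVM_ADDRESS_SPACE_NUM, so the address spaces keep disjoint
		 * (even/odd) generation numbers.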
*/ 710 slots->generation = i; 711 rcu_assign_pointer(kvm->memslots[i], slots); 712 } 713 714 for (i = 0; i < KVM_NR_BUSES; i++) { 715 rcu_assign_pointer(kvm->buses[i], 716 kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT)); 717 if (!kvm->buses[i]) 718 goto out_err_no_arch_destroy_vm; 719 } 720 721 r = kvm_arch_init_vm(kvm, type); 722 if (r) 723 goto out_err_no_arch_destroy_vm; 724 725 r = hardware_enable_all(); 726 if (r) 727 goto out_err_no_disable; 728 729 #ifdef CONFIG_HAVE_KVM_IRQFD 730 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); 731 #endif 732 733 r = kvm_init_mmu_notifier(kvm); 734 if (r) 735 goto out_err_no_mmu_notifier; 736 737 r = kvm_arch_post_init_vm(kvm); 738 if (r) 739 goto out_err; 740 741 mutex_lock(&kvm_lock); 742 list_add(&kvm->vm_list, &vm_list); 743 mutex_unlock(&kvm_lock); 744 745 preempt_notifier_inc(); 746 747 return kvm; 748 749 out_err: 750 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 751 if (kvm->mmu_notifier.ops) 752 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); 753 #endif 754 out_err_no_mmu_notifier: 755 hardware_disable_all(); 756 out_err_no_disable: 757 kvm_arch_destroy_vm(kvm); 758 out_err_no_arch_destroy_vm: 759 WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count)); 760 for (i = 0; i < KVM_NR_BUSES; i++) 761 kfree(kvm_get_bus(kvm, i)); 762 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 763 kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); 764 cleanup_srcu_struct(&kvm->irq_srcu); 765 out_err_no_irq_srcu: 766 cleanup_srcu_struct(&kvm->srcu); 767 out_err_no_srcu: 768 kvm_arch_free_vm(kvm); 769 mmdrop(current->mm); 770 return ERR_PTR(r); 771 } 772 773 static void kvm_destroy_devices(struct kvm *kvm) 774 { 775 struct kvm_device *dev, *tmp; 776 777 /* 778 * We do not need to take the kvm->lock here, because nobody else 779 * has a reference to the struct kvm at this point and therefore 780 * cannot access the devices list anyhow. 
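	 *
	 * Note that the _safe iterator below is still required: each
	 * dev->ops->destroy() call is expected to free the device, and with it
	 * the vm_node we are iterating over.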
781 */ 782 list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) { 783 list_del(&dev->vm_node); 784 dev->ops->destroy(dev); 785 } 786 } 787 788 static void kvm_destroy_vm(struct kvm *kvm) 789 { 790 int i; 791 struct mm_struct *mm = kvm->mm; 792 793 kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); 794 kvm_destroy_vm_debugfs(kvm); 795 kvm_arch_sync_events(kvm); 796 mutex_lock(&kvm_lock); 797 list_del(&kvm->vm_list); 798 mutex_unlock(&kvm_lock); 799 kvm_arch_pre_destroy_vm(kvm); 800 801 kvm_free_irq_routing(kvm); 802 for (i = 0; i < KVM_NR_BUSES; i++) { 803 struct kvm_io_bus *bus = kvm_get_bus(kvm, i); 804 805 if (bus) 806 kvm_io_bus_destroy(bus); 807 kvm->buses[i] = NULL; 808 } 809 kvm_coalesced_mmio_free(kvm); 810 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 811 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 812 #else 813 kvm_arch_flush_shadow_all(kvm); 814 #endif 815 kvm_arch_destroy_vm(kvm); 816 kvm_destroy_devices(kvm); 817 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 818 kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); 819 cleanup_srcu_struct(&kvm->irq_srcu); 820 cleanup_srcu_struct(&kvm->srcu); 821 kvm_arch_free_vm(kvm); 822 preempt_notifier_dec(); 823 hardware_disable_all(); 824 mmdrop(mm); 825 } 826 827 void kvm_get_kvm(struct kvm *kvm) 828 { 829 refcount_inc(&kvm->users_count); 830 } 831 EXPORT_SYMBOL_GPL(kvm_get_kvm); 832 833 void kvm_put_kvm(struct kvm *kvm) 834 { 835 if (refcount_dec_and_test(&kvm->users_count)) 836 kvm_destroy_vm(kvm); 837 } 838 EXPORT_SYMBOL_GPL(kvm_put_kvm); 839 840 /* 841 * Used to put a reference that was taken on behalf of an object associated 842 * with a user-visible file descriptor, e.g. a vcpu or device, if installation 843 * of the new file descriptor fails and the reference cannot be transferred to 844 * its final owner. In such cases, the caller is still actively using @kvm and 845 * will fail miserably if the refcount unexpectedly hits zero. 846 */ 847 void kvm_put_kvm_no_destroy(struct kvm *kvm) 848 { 849 WARN_ON(refcount_dec_and_test(&kvm->users_count)); 850 } 851 EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy); 852 853 static int kvm_vm_release(struct inode *inode, struct file *filp) 854 { 855 struct kvm *kvm = filp->private_data; 856 857 kvm_irqfd_release(kvm); 858 859 kvm_put_kvm(kvm); 860 return 0; 861 } 862 863 /* 864 * Allocation size is twice as large as the actual dirty bitmap size. 865 * See kvm_vm_ioctl_get_dirty_log() why this is needed. 866 */ 867 static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot) 868 { 869 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); 870 871 memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT); 872 if (!memslot->dirty_bitmap) 873 return -ENOMEM; 874 875 return 0; 876 } 877 878 /* 879 * Delete a memslot by decrementing the number of used slots and shifting all 880 * other entries in the array forward one spot. 
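 * The deleted memslot itself is copied to the now-unused entry at the tail of
 * the array and its id_to_index[] entry is set to -1 (see the shift loop
 * below), so it no longer takes part in lookups.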
881 */ 882 static inline void kvm_memslot_delete(struct kvm_memslots *slots, 883 struct kvm_memory_slot *memslot) 884 { 885 struct kvm_memory_slot *mslots = slots->memslots; 886 int i; 887 888 if (WARN_ON(slots->id_to_index[memslot->id] == -1)) 889 return; 890 891 slots->used_slots--; 892 893 if (atomic_read(&slots->lru_slot) >= slots->used_slots) 894 atomic_set(&slots->lru_slot, 0); 895 896 for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) { 897 mslots[i] = mslots[i + 1]; 898 slots->id_to_index[mslots[i].id] = i; 899 } 900 mslots[i] = *memslot; 901 slots->id_to_index[memslot->id] = -1; 902 } 903 904 /* 905 * "Insert" a new memslot by incrementing the number of used slots. Returns 906 * the new slot's initial index into the memslots array. 907 */ 908 static inline int kvm_memslot_insert_back(struct kvm_memslots *slots) 909 { 910 return slots->used_slots++; 911 } 912 913 /* 914 * Move a changed memslot backwards in the array by shifting existing slots 915 * with a higher GFN toward the front of the array. Note, the changed memslot 916 * itself is not preserved in the array, i.e. not swapped at this time, only 917 * its new index into the array is tracked. Returns the changed memslot's 918 * current index into the memslots array. 919 */ 920 static inline int kvm_memslot_move_backward(struct kvm_memslots *slots, 921 struct kvm_memory_slot *memslot) 922 { 923 struct kvm_memory_slot *mslots = slots->memslots; 924 int i; 925 926 if (WARN_ON_ONCE(slots->id_to_index[memslot->id] == -1) || 927 WARN_ON_ONCE(!slots->used_slots)) 928 return -1; 929 930 /* 931 * Move the target memslot backward in the array by shifting existing 932 * memslots with a higher GFN (than the target memslot) towards the 933 * front of the array. 934 */ 935 for (i = slots->id_to_index[memslot->id]; i < slots->used_slots - 1; i++) { 936 if (memslot->base_gfn > mslots[i + 1].base_gfn) 937 break; 938 939 WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn); 940 941 /* Shift the next memslot forward one and update its index. */ 942 mslots[i] = mslots[i + 1]; 943 slots->id_to_index[mslots[i].id] = i; 944 } 945 return i; 946 } 947 948 /* 949 * Move a changed memslot forwards in the array by shifting existing slots with 950 * a lower GFN toward the back of the array. Note, the changed memslot itself 951 * is not preserved in the array, i.e. not swapped at this time, only its new 952 * index into the array is tracked. Returns the changed memslot's final index 953 * into the memslots array. 954 */ 955 static inline int kvm_memslot_move_forward(struct kvm_memslots *slots, 956 struct kvm_memory_slot *memslot, 957 int start) 958 { 959 struct kvm_memory_slot *mslots = slots->memslots; 960 int i; 961 962 for (i = start; i > 0; i--) { 963 if (memslot->base_gfn < mslots[i - 1].base_gfn) 964 break; 965 966 WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn); 967 968 /* Shift the next memslot back one and update its index. */ 969 mslots[i] = mslots[i - 1]; 970 slots->id_to_index[mslots[i].id] = i; 971 } 972 return i; 973 } 974 975 /* 976 * Re-sort memslots based on their GFN to account for an added, deleted, or 977 * moved memslot. Sorting memslots by GFN allows using a binary search during 978 * memslot lookup. 979 * 980 * IMPORTANT: Slots are sorted from highest GFN to lowest GFN! I.e. the entry 981 * at memslots[0] has the highest GFN. 982 * 983 * The sorting algorithm takes advantage of having initially sorted memslots 984 * and knowing the position of the changed memslot. 
Sorting is also optimized
 * by not swapping the updated memslot and instead only shifting other memslots
 * and tracking the new index for the updated memslot. Only once its final
 * index is known is the updated memslot copied into its position in the array.
 *
 * - When deleting a memslot, the deleted memslot simply needs to be moved to
 *   the end of the array.
 *
 * - When creating a memslot, the algorithm "inserts" the new memslot at the
 *   end of the array and then shifts it forward to its correct location.
 *
 * - When moving a memslot, the algorithm first moves the updated memslot
 *   backward to handle the scenario where the memslot's GFN was changed to a
 *   lower value. update_memslots() then falls through and runs the same flow
 *   as creating a memslot to move the memslot forward to handle the scenario
 *   where its GFN was changed to a higher value.
 *
 * Note, slots are sorted from highest->lowest instead of lowest->highest for
 * historical reasons. Originally, invalid memslots were denoted by having
 * GFN=0, thus sorting from highest->lowest naturally sorted invalid memslots
 * to the end of the array. The current algorithm uses dedicated logic to
 * delete a memslot and thus does not rely on invalid memslots having GFN=0.
 *
 * The other historical motivation for highest->lowest was to improve the
 * performance of memslot lookup. KVM originally used a linear search starting
 * at memslots[0]. On x86, the largest memslot usually has one of the highest,
 * if not *the* highest, GFN, as the bulk of the guest's RAM is located in a
 * single memslot above the 4gb boundary. As the largest memslot is also the
 * most likely to be referenced, sorting it to the front of the array was
 * advantageous. The current binary search starts from the middle of the array
 * and uses an LRU pointer to improve performance for all memslots and GFNs.
 */
static void update_memslots(struct kvm_memslots *slots,
			    struct kvm_memory_slot *memslot,
			    enum kvm_mr_change change)
{
	int i;

	if (change == KVM_MR_DELETE) {
		kvm_memslot_delete(slots, memslot);
	} else {
		if (change == KVM_MR_CREATE)
			i = kvm_memslot_insert_back(slots);
		else
			i = kvm_memslot_move_backward(slots, memslot);
		i = kvm_memslot_move_forward(slots, memslot, i);

		/*
		 * Copy the memslot to its new position in memslots and update
		 * its index accordingly.
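		 *
		 * E.g. for KVM_MR_CREATE the new memslot starts at the tail
		 * index returned by kvm_memslot_insert_back() and
		 * kvm_memslot_move_forward() then shifts lower-GFN memslots
		 * back until the descending-GFN order is restored; a memslot
		 * with the highest base_gfn ends up at memslots[0].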
1034 */ 1035 slots->memslots[i] = *memslot; 1036 slots->id_to_index[memslot->id] = i; 1037 } 1038 } 1039 1040 static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem) 1041 { 1042 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; 1043 1044 #ifdef __KVM_HAVE_READONLY_MEM 1045 valid_flags |= KVM_MEM_READONLY; 1046 #endif 1047 1048 if (mem->flags & ~valid_flags) 1049 return -EINVAL; 1050 1051 return 0; 1052 } 1053 1054 static struct kvm_memslots *install_new_memslots(struct kvm *kvm, 1055 int as_id, struct kvm_memslots *slots) 1056 { 1057 struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id); 1058 u64 gen = old_memslots->generation; 1059 1060 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); 1061 slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; 1062 1063 rcu_assign_pointer(kvm->memslots[as_id], slots); 1064 synchronize_srcu_expedited(&kvm->srcu); 1065 1066 /* 1067 * Increment the new memslot generation a second time, dropping the 1068 * update in-progress flag and incrementing the generation based on 1069 * the number of address spaces. This provides a unique and easily 1070 * identifiable generation number while the memslots are in flux. 1071 */ 1072 gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; 1073 1074 /* 1075 * Generations must be unique even across address spaces. We do not need 1076 * a global counter for that, instead the generation space is evenly split 1077 * across address spaces. For example, with two address spaces, address 1078 * space 0 will use generations 0, 2, 4, ... while address space 1 will 1079 * use generations 1, 3, 5, ... 1080 */ 1081 gen += KVM_ADDRESS_SPACE_NUM; 1082 1083 kvm_arch_memslots_updated(kvm, gen); 1084 1085 slots->generation = gen; 1086 1087 return old_memslots; 1088 } 1089 1090 /* 1091 * Note, at a minimum, the current number of used slots must be allocated, even 1092 * when deleting a memslot, as we need a complete duplicate of the memslots for 1093 * use when invalidating a memslot prior to deleting/moving the memslot. 1094 */ 1095 static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old, 1096 enum kvm_mr_change change) 1097 { 1098 struct kvm_memslots *slots; 1099 size_t old_size, new_size; 1100 1101 old_size = sizeof(struct kvm_memslots) + 1102 (sizeof(struct kvm_memory_slot) * old->used_slots); 1103 1104 if (change == KVM_MR_CREATE) 1105 new_size = old_size + sizeof(struct kvm_memory_slot); 1106 else 1107 new_size = old_size; 1108 1109 slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT); 1110 if (likely(slots)) 1111 memcpy(slots, old, old_size); 1112 1113 return slots; 1114 } 1115 1116 static int kvm_set_memslot(struct kvm *kvm, 1117 const struct kvm_userspace_memory_region *mem, 1118 struct kvm_memory_slot *old, 1119 struct kvm_memory_slot *new, int as_id, 1120 enum kvm_mr_change change) 1121 { 1122 struct kvm_memory_slot *slot; 1123 struct kvm_memslots *slots; 1124 int r; 1125 1126 slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change); 1127 if (!slots) 1128 return -ENOMEM; 1129 1130 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) { 1131 /* 1132 * Note, the INVALID flag needs to be in the appropriate entry 1133 * in the freshly allocated memslots, not in @old or @new. 1134 */ 1135 slot = id_to_memslot(slots, old->id); 1136 slot->flags |= KVM_MEMSLOT_INVALID; 1137 1138 /* 1139 * We can re-use the old memslots, the only difference from the 1140 * newly installed memslots is the invalid flag, which will get 1141 * dropped by update_memslots anyway. 
We'll also revert to the 1142 * old memslots if preparing the new memory region fails. 1143 */ 1144 slots = install_new_memslots(kvm, as_id, slots); 1145 1146 /* From this point no new shadow pages pointing to a deleted, 1147 * or moved, memslot will be created. 1148 * 1149 * validation of sp->gfn happens in: 1150 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) 1151 * - kvm_is_visible_gfn (mmu_check_root) 1152 */ 1153 kvm_arch_flush_shadow_memslot(kvm, slot); 1154 } 1155 1156 r = kvm_arch_prepare_memory_region(kvm, new, mem, change); 1157 if (r) 1158 goto out_slots; 1159 1160 update_memslots(slots, new, change); 1161 slots = install_new_memslots(kvm, as_id, slots); 1162 1163 kvm_arch_commit_memory_region(kvm, mem, old, new, change); 1164 1165 kvfree(slots); 1166 return 0; 1167 1168 out_slots: 1169 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) 1170 slots = install_new_memslots(kvm, as_id, slots); 1171 kvfree(slots); 1172 return r; 1173 } 1174 1175 static int kvm_delete_memslot(struct kvm *kvm, 1176 const struct kvm_userspace_memory_region *mem, 1177 struct kvm_memory_slot *old, int as_id) 1178 { 1179 struct kvm_memory_slot new; 1180 int r; 1181 1182 if (!old->npages) 1183 return -EINVAL; 1184 1185 memset(&new, 0, sizeof(new)); 1186 new.id = old->id; 1187 1188 r = kvm_set_memslot(kvm, mem, old, &new, as_id, KVM_MR_DELETE); 1189 if (r) 1190 return r; 1191 1192 kvm_free_memslot(kvm, old); 1193 return 0; 1194 } 1195 1196 /* 1197 * Allocate some memory and give it an address in the guest physical address 1198 * space. 1199 * 1200 * Discontiguous memory is allowed, mostly for framebuffers. 1201 * 1202 * Must be called holding kvm->slots_lock for write. 1203 */ 1204 int __kvm_set_memory_region(struct kvm *kvm, 1205 const struct kvm_userspace_memory_region *mem) 1206 { 1207 struct kvm_memory_slot old, new; 1208 struct kvm_memory_slot *tmp; 1209 enum kvm_mr_change change; 1210 int as_id, id; 1211 int r; 1212 1213 r = check_memory_region_flags(mem); 1214 if (r) 1215 return r; 1216 1217 as_id = mem->slot >> 16; 1218 id = (u16)mem->slot; 1219 1220 /* General sanity checks */ 1221 if (mem->memory_size & (PAGE_SIZE - 1)) 1222 return -EINVAL; 1223 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 1224 return -EINVAL; 1225 /* We can read the guest memory with __xxx_user() later on. */ 1226 if ((id < KVM_USER_MEM_SLOTS) && 1227 ((mem->userspace_addr & (PAGE_SIZE - 1)) || 1228 !access_ok((void __user *)(unsigned long)mem->userspace_addr, 1229 mem->memory_size))) 1230 return -EINVAL; 1231 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM) 1232 return -EINVAL; 1233 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 1234 return -EINVAL; 1235 1236 /* 1237 * Make a full copy of the old memslot, the pointer will become stale 1238 * when the memslots are re-sorted by update_memslots(), and the old 1239 * memslot needs to be referenced after calling update_memslots(), e.g. 1240 * to free its resources and for arch specific behavior. 
1241 */ 1242 tmp = id_to_memslot(__kvm_memslots(kvm, as_id), id); 1243 if (tmp) { 1244 old = *tmp; 1245 tmp = NULL; 1246 } else { 1247 memset(&old, 0, sizeof(old)); 1248 old.id = id; 1249 } 1250 1251 if (!mem->memory_size) 1252 return kvm_delete_memslot(kvm, mem, &old, as_id); 1253 1254 new.id = id; 1255 new.base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 1256 new.npages = mem->memory_size >> PAGE_SHIFT; 1257 new.flags = mem->flags; 1258 new.userspace_addr = mem->userspace_addr; 1259 1260 if (new.npages > KVM_MEM_MAX_NR_PAGES) 1261 return -EINVAL; 1262 1263 if (!old.npages) { 1264 change = KVM_MR_CREATE; 1265 new.dirty_bitmap = NULL; 1266 memset(&new.arch, 0, sizeof(new.arch)); 1267 } else { /* Modify an existing slot. */ 1268 if ((new.userspace_addr != old.userspace_addr) || 1269 (new.npages != old.npages) || 1270 ((new.flags ^ old.flags) & KVM_MEM_READONLY)) 1271 return -EINVAL; 1272 1273 if (new.base_gfn != old.base_gfn) 1274 change = KVM_MR_MOVE; 1275 else if (new.flags != old.flags) 1276 change = KVM_MR_FLAGS_ONLY; 1277 else /* Nothing to change. */ 1278 return 0; 1279 1280 /* Copy dirty_bitmap and arch from the current memslot. */ 1281 new.dirty_bitmap = old.dirty_bitmap; 1282 memcpy(&new.arch, &old.arch, sizeof(new.arch)); 1283 } 1284 1285 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { 1286 /* Check for overlaps */ 1287 kvm_for_each_memslot(tmp, __kvm_memslots(kvm, as_id)) { 1288 if (tmp->id == id) 1289 continue; 1290 if (!((new.base_gfn + new.npages <= tmp->base_gfn) || 1291 (new.base_gfn >= tmp->base_gfn + tmp->npages))) 1292 return -EEXIST; 1293 } 1294 } 1295 1296 /* Allocate/free page dirty bitmap as needed */ 1297 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 1298 new.dirty_bitmap = NULL; 1299 else if (!new.dirty_bitmap) { 1300 r = kvm_alloc_dirty_bitmap(&new); 1301 if (r) 1302 return r; 1303 1304 if (kvm_dirty_log_manual_protect_and_init_set(kvm)) 1305 bitmap_set(new.dirty_bitmap, 0, new.npages); 1306 } 1307 1308 r = kvm_set_memslot(kvm, mem, &old, &new, as_id, change); 1309 if (r) 1310 goto out_bitmap; 1311 1312 if (old.dirty_bitmap && !new.dirty_bitmap) 1313 kvm_destroy_dirty_bitmap(&old); 1314 return 0; 1315 1316 out_bitmap: 1317 if (new.dirty_bitmap && !old.dirty_bitmap) 1318 kvm_destroy_dirty_bitmap(&new); 1319 return r; 1320 } 1321 EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 1322 1323 int kvm_set_memory_region(struct kvm *kvm, 1324 const struct kvm_userspace_memory_region *mem) 1325 { 1326 int r; 1327 1328 mutex_lock(&kvm->slots_lock); 1329 r = __kvm_set_memory_region(kvm, mem); 1330 mutex_unlock(&kvm->slots_lock); 1331 return r; 1332 } 1333 EXPORT_SYMBOL_GPL(kvm_set_memory_region); 1334 1335 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 1336 struct kvm_userspace_memory_region *mem) 1337 { 1338 if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) 1339 return -EINVAL; 1340 1341 return kvm_set_memory_region(kvm, mem); 1342 } 1343 1344 #ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 1345 /** 1346 * kvm_get_dirty_log - get a snapshot of dirty pages 1347 * @kvm: pointer to kvm instance 1348 * @log: slot id and address to which we copy the log 1349 * @is_dirty: set to '1' if any dirty pages were found 1350 * @memslot: set to the associated memslot, always valid on success 1351 */ 1352 int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, 1353 int *is_dirty, struct kvm_memory_slot **memslot) 1354 { 1355 struct kvm_memslots *slots; 1356 int i, as_id, id; 1357 unsigned long n; 1358 unsigned long any = 0; 1359 1360 *memslot = NULL; 1361 *is_dirty = 0; 1362 
1363 as_id = log->slot >> 16; 1364 id = (u16)log->slot; 1365 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1366 return -EINVAL; 1367 1368 slots = __kvm_memslots(kvm, as_id); 1369 *memslot = id_to_memslot(slots, id); 1370 if (!(*memslot) || !(*memslot)->dirty_bitmap) 1371 return -ENOENT; 1372 1373 kvm_arch_sync_dirty_log(kvm, *memslot); 1374 1375 n = kvm_dirty_bitmap_bytes(*memslot); 1376 1377 for (i = 0; !any && i < n/sizeof(long); ++i) 1378 any = (*memslot)->dirty_bitmap[i]; 1379 1380 if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n)) 1381 return -EFAULT; 1382 1383 if (any) 1384 *is_dirty = 1; 1385 return 0; 1386 } 1387 EXPORT_SYMBOL_GPL(kvm_get_dirty_log); 1388 1389 #else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ 1390 /** 1391 * kvm_get_dirty_log_protect - get a snapshot of dirty pages 1392 * and reenable dirty page tracking for the corresponding pages. 1393 * @kvm: pointer to kvm instance 1394 * @log: slot id and address to which we copy the log 1395 * 1396 * We need to keep it in mind that VCPU threads can write to the bitmap 1397 * concurrently. So, to avoid losing track of dirty pages we keep the 1398 * following order: 1399 * 1400 * 1. Take a snapshot of the bit and clear it if needed. 1401 * 2. Write protect the corresponding page. 1402 * 3. Copy the snapshot to the userspace. 1403 * 4. Upon return caller flushes TLB's if needed. 1404 * 1405 * Between 2 and 4, the guest may write to the page using the remaining TLB 1406 * entry. This is not a problem because the page is reported dirty using 1407 * the snapshot taken before and step 4 ensures that writes done after 1408 * exiting to userspace will be logged for the next call. 1409 * 1410 */ 1411 static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log) 1412 { 1413 struct kvm_memslots *slots; 1414 struct kvm_memory_slot *memslot; 1415 int i, as_id, id; 1416 unsigned long n; 1417 unsigned long *dirty_bitmap; 1418 unsigned long *dirty_bitmap_buffer; 1419 bool flush; 1420 1421 as_id = log->slot >> 16; 1422 id = (u16)log->slot; 1423 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1424 return -EINVAL; 1425 1426 slots = __kvm_memslots(kvm, as_id); 1427 memslot = id_to_memslot(slots, id); 1428 if (!memslot || !memslot->dirty_bitmap) 1429 return -ENOENT; 1430 1431 dirty_bitmap = memslot->dirty_bitmap; 1432 1433 kvm_arch_sync_dirty_log(kvm, memslot); 1434 1435 n = kvm_dirty_bitmap_bytes(memslot); 1436 flush = false; 1437 if (kvm->manual_dirty_log_protect) { 1438 /* 1439 * Unlike kvm_get_dirty_log, we always return false in *flush, 1440 * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There 1441 * is some code duplication between this function and 1442 * kvm_get_dirty_log, but hopefully all architecture 1443 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log 1444 * can be eliminated. 
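		 *
		 * In this mode the bitmap is copied out as-is below; clearing
		 * the bits and write-protecting the pages again is deferred to
		 * kvm_clear_dirty_log_protect(), i.e. the KVM_CLEAR_DIRTY_LOG
		 * ioctl.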
1445 */ 1446 dirty_bitmap_buffer = dirty_bitmap; 1447 } else { 1448 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); 1449 memset(dirty_bitmap_buffer, 0, n); 1450 1451 spin_lock(&kvm->mmu_lock); 1452 for (i = 0; i < n / sizeof(long); i++) { 1453 unsigned long mask; 1454 gfn_t offset; 1455 1456 if (!dirty_bitmap[i]) 1457 continue; 1458 1459 flush = true; 1460 mask = xchg(&dirty_bitmap[i], 0); 1461 dirty_bitmap_buffer[i] = mask; 1462 1463 offset = i * BITS_PER_LONG; 1464 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, 1465 offset, mask); 1466 } 1467 spin_unlock(&kvm->mmu_lock); 1468 } 1469 1470 if (flush) 1471 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); 1472 1473 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) 1474 return -EFAULT; 1475 return 0; 1476 } 1477 1478 1479 /** 1480 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot 1481 * @kvm: kvm instance 1482 * @log: slot id and address to which we copy the log 1483 * 1484 * Steps 1-4 below provide general overview of dirty page logging. See 1485 * kvm_get_dirty_log_protect() function description for additional details. 1486 * 1487 * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we 1488 * always flush the TLB (step 4) even if previous step failed and the dirty 1489 * bitmap may be corrupt. Regardless of previous outcome the KVM logging API 1490 * does not preclude user space subsequent dirty log read. Flushing TLB ensures 1491 * writes will be marked dirty for next log read. 1492 * 1493 * 1. Take a snapshot of the bit and clear it if needed. 1494 * 2. Write protect the corresponding page. 1495 * 3. Copy the snapshot to the userspace. 1496 * 4. Flush TLB's if needed. 1497 */ 1498 static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 1499 struct kvm_dirty_log *log) 1500 { 1501 int r; 1502 1503 mutex_lock(&kvm->slots_lock); 1504 1505 r = kvm_get_dirty_log_protect(kvm, log); 1506 1507 mutex_unlock(&kvm->slots_lock); 1508 return r; 1509 } 1510 1511 /** 1512 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap 1513 * and reenable dirty page tracking for the corresponding pages. 
1514 * @kvm: pointer to kvm instance 1515 * @log: slot id and address from which to fetch the bitmap of dirty pages 1516 */ 1517 static int kvm_clear_dirty_log_protect(struct kvm *kvm, 1518 struct kvm_clear_dirty_log *log) 1519 { 1520 struct kvm_memslots *slots; 1521 struct kvm_memory_slot *memslot; 1522 int as_id, id; 1523 gfn_t offset; 1524 unsigned long i, n; 1525 unsigned long *dirty_bitmap; 1526 unsigned long *dirty_bitmap_buffer; 1527 bool flush; 1528 1529 as_id = log->slot >> 16; 1530 id = (u16)log->slot; 1531 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1532 return -EINVAL; 1533 1534 if (log->first_page & 63) 1535 return -EINVAL; 1536 1537 slots = __kvm_memslots(kvm, as_id); 1538 memslot = id_to_memslot(slots, id); 1539 if (!memslot || !memslot->dirty_bitmap) 1540 return -ENOENT; 1541 1542 dirty_bitmap = memslot->dirty_bitmap; 1543 1544 n = ALIGN(log->num_pages, BITS_PER_LONG) / 8; 1545 1546 if (log->first_page > memslot->npages || 1547 log->num_pages > memslot->npages - log->first_page || 1548 (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63))) 1549 return -EINVAL; 1550 1551 kvm_arch_sync_dirty_log(kvm, memslot); 1552 1553 flush = false; 1554 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); 1555 if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n)) 1556 return -EFAULT; 1557 1558 spin_lock(&kvm->mmu_lock); 1559 for (offset = log->first_page, i = offset / BITS_PER_LONG, 1560 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--; 1561 i++, offset += BITS_PER_LONG) { 1562 unsigned long mask = *dirty_bitmap_buffer++; 1563 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i]; 1564 if (!mask) 1565 continue; 1566 1567 mask &= atomic_long_fetch_andnot(mask, p); 1568 1569 /* 1570 * mask contains the bits that really have been cleared. This 1571 * never includes any bits beyond the length of the memslot (if 1572 * the length is not aligned to 64 pages), therefore it is not 1573 * a problem if userspace sets them in log->dirty_bitmap. 
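		 *
		 * E.g. if userspace asks to clear a page that was never marked
		 * dirty, atomic_long_fetch_andnot() returns 0 for that bit, the
		 * bit is filtered out of mask above, and the page is not
		 * needlessly write-protected again.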
		 */
		if (mask) {
			flush = true;
			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
								offset, mask);
		}
	}
	spin_unlock(&kvm->mmu_lock);

	if (flush)
		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);

	return 0;
}

static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
					struct kvm_clear_dirty_log *log)
{
	int r;

	mutex_lock(&kvm->slots_lock);

	r = kvm_clear_dirty_log_protect(kvm, log);

	mutex_unlock(&kvm->slots_lock);
	return r;
}
#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */

struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
}
EXPORT_SYMBOL_GPL(gfn_to_memslot);

struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
}

bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);

	if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS ||
	    memslot->flags & KVM_MEMSLOT_INVALID)
		return false;

	return true;
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);

unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	struct vm_area_struct *vma;
	unsigned long addr, size;

	size = PAGE_SIZE;

	addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
	if (kvm_is_error_hva(addr))
		return PAGE_SIZE;

	down_read(&current->mm->mmap_sem);
	vma = find_vma(current->mm, addr);
	if (!vma)
		goto out;

	size = vma_kernel_pagesize(vma);

out:
	up_read(&current->mm->mmap_sem);

	return size;
}

static bool memslot_is_readonly(struct kvm_memory_slot *slot)
{
	return slot->flags & KVM_MEM_READONLY;
}

static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
				       gfn_t *nr_pages, bool write)
{
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return KVM_HVA_ERR_BAD;

	if (memslot_is_readonly(slot) && write)
		return KVM_HVA_ERR_RO_BAD;

	if (nr_pages)
		*nr_pages = slot->npages - (gfn - slot->base_gfn);

	return __gfn_to_hva_memslot(slot, gfn);
}

static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
				     gfn_t *nr_pages)
{
	return __gfn_to_hva_many(slot, gfn, nr_pages, true);
}

unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
				 gfn_t gfn)
{
	return gfn_to_hva_many(slot, gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);

unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);

unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);

/*
 * Return the hva of a @gfn and the R/W attribute if possible.
 *
 * @slot: the kvm_memory_slot which contains @gfn
 * @gfn: the gfn to be translated
 * @writable: used to return the read/write attribute of the @slot if the hva
 * is valid and @writable is not NULL
 */
unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
				      gfn_t gfn, bool *writable)
{
	unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);

	if (!kvm_is_error_hva(hva) && writable)
		*writable = !memslot_is_readonly(slot);

	return hva;
}

unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
{
	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);

	return gfn_to_hva_memslot_prot(slot, gfn, writable);
}

unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
{
	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);

	return gfn_to_hva_memslot_prot(slot, gfn, writable);
}

static inline int check_user_page_hwpoison(unsigned long addr)
{
	int rc, flags = FOLL_HWPOISON | FOLL_WRITE;

	rc = get_user_pages(addr, 1, flags, NULL, NULL);
	return rc == -EHWPOISON;
}

/*
 * The fast path to get the writable pfn which will be stored in @pfn,
 * true indicates success, otherwise false is returned. It's also the
 * only part that runs when we are in atomic context.
 */
static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
			    bool *writable, kvm_pfn_t *pfn)
{
	struct page *page[1];
	int npages;

	/*
	 * Fast pin a writable pfn only if it is a write fault request
	 * or the caller allows mapping a writable pfn for a read fault
	 * request.
	 */
	if (!(write_fault || writable))
		return false;

	npages = __get_user_pages_fast(addr, 1, 1, page);
	if (npages == 1) {
		*pfn = page_to_pfn(page[0]);

		if (writable)
			*writable = true;
		return true;
	}

	return false;
}

/*
 * The slow path to get the pfn of the specified host virtual address,
 * 1 indicates success, -errno is returned if error is detected.
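 *
 * The implementation below uses get_user_pages_unlocked() with FOLL_HWPOISON,
 * adds FOLL_WRITE for write faults and FOLL_NOWAIT when an async fault is
 * acceptable, and may opportunistically upgrade a read fault to a writable
 * mapping when the caller passed @writable.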
 */
static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
			   bool *writable, kvm_pfn_t *pfn)
{
	unsigned int flags = FOLL_HWPOISON;
	struct page *page;
	int npages = 0;

	might_sleep();

	if (writable)
		*writable = write_fault;

	if (write_fault)
		flags |= FOLL_WRITE;
	if (async)
		flags |= FOLL_NOWAIT;

	npages = get_user_pages_unlocked(addr, 1, &page, flags);
	if (npages != 1)
		return npages;

	/* map read fault as writable if possible */
	if (unlikely(!write_fault) && writable) {
		struct page *wpage;

		if (__get_user_pages_fast(addr, 1, 1, &wpage) == 1) {
			*writable = true;
			put_page(page);
			page = wpage;
		}
	}
	*pfn = page_to_pfn(page);
	return npages;
}

static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
{
	if (unlikely(!(vma->vm_flags & VM_READ)))
		return false;

	if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
		return false;

	return true;
}

static int hva_to_pfn_remapped(struct vm_area_struct *vma,
			       unsigned long addr, bool *async,
			       bool write_fault, bool *writable,
			       kvm_pfn_t *p_pfn)
{
	unsigned long pfn;
	int r;

	r = follow_pfn(vma, addr, &pfn);
	if (r) {
		/*
		 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
		 * not call the fault handler, so do it here.
		 */
		bool unlocked = false;
		r = fixup_user_fault(current, current->mm, addr,
				     (write_fault ? FAULT_FLAG_WRITE : 0),
				     &unlocked);
		if (unlocked)
			return -EAGAIN;
		if (r)
			return r;

		r = follow_pfn(vma, addr, &pfn);
		if (r)
			return r;

	}

	if (writable)
		*writable = true;

	/*
	 * Get a reference here because callers of *hva_to_pfn* and
	 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
	 * returned pfn. This is only needed if the VMA has VM_MIXEDMAP
	 * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
	 * simply do nothing for reserved pfns.
	 *
	 * Whoever called remap_pfn_range is also going to call e.g.
	 * unmap_mapping_range before the underlying pages are freed,
	 * causing a call to our MMU notifier.
	 */
	kvm_get_pfn(pfn);

	*p_pfn = pfn;
	return 0;
}

/*
 * Pin guest page in memory and return its pfn.
 * @addr: host virtual address which maps memory to the guest
 * @atomic: whether the caller is in atomic context, i.e. this function
 *          must not sleep
 * @async: whether to allow the fault to be handled asynchronously, instead of
 *         waiting for IO to complete, if the host page is not in memory
 * @write_fault: whether we should get a writable host page
 * @writable: whether mapping a writable host page is allowed for !@write_fault
 *
 * The function will map a writable host page for these two cases:
 * 1): @write_fault = true
 * 2): @write_fault = false && @writable, @writable will tell the caller
 *     whether the mapping is writable.
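 *
 * Roughly, the lookup below tries hva_to_pfn_fast() first (no sleeping), bails
 * out with KVM_PFN_ERR_FAULT if @atomic, then falls back to hva_to_pfn_slow(),
 * and finally handles VM_IO/VM_PFNMAP mappings via hva_to_pfn_remapped().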
 */
static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
			    bool write_fault, bool *writable)
{
	struct vm_area_struct *vma;
	kvm_pfn_t pfn = 0;
	int npages, r;

	/* we can do it either atomically or asynchronously, not both */
	BUG_ON(atomic && async);

	if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
		return pfn;

	if (atomic)
		return KVM_PFN_ERR_FAULT;

	npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
	if (npages == 1)
		return pfn;

	down_read(&current->mm->mmap_sem);
	if (npages == -EHWPOISON ||
	    (!async && check_user_page_hwpoison(addr))) {
		pfn = KVM_PFN_ERR_HWPOISON;
		goto exit;
	}

retry:
	vma = find_vma_intersection(current->mm, addr, addr + 1);

	if (vma == NULL)
		pfn = KVM_PFN_ERR_FAULT;
	else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
		r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
		if (r == -EAGAIN)
			goto retry;
		if (r < 0)
			pfn = KVM_PFN_ERR_FAULT;
	} else {
		if (async && vma_is_valid(vma, write_fault))
			*async = true;
		pfn = KVM_PFN_ERR_FAULT;
	}
exit:
	up_read(&current->mm->mmap_sem);
	return pfn;
}

kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
			       bool atomic, bool *async, bool write_fault,
			       bool *writable)
{
	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);

	if (addr == KVM_HVA_ERR_RO_BAD) {
		if (writable)
			*writable = false;
		return KVM_PFN_ERR_RO_FAULT;
	}

	if (kvm_is_error_hva(addr)) {
		if (writable)
			*writable = false;
		return KVM_PFN_NOSLOT;
	}

	/* Do not map writable pfn in the readonly memslot.
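	 * Clear *writable for the caller and pass writable == NULL down so
	 * that hva_to_pfn() will not upgrade a read fault into a writable
	 * mapping.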
*/ 1947 if (writable && memslot_is_readonly(slot)) { 1948 *writable = false; 1949 writable = NULL; 1950 } 1951 1952 return hva_to_pfn(addr, atomic, async, write_fault, 1953 writable); 1954 } 1955 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); 1956 1957 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, 1958 bool *writable) 1959 { 1960 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, 1961 write_fault, writable); 1962 } 1963 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 1964 1965 kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) 1966 { 1967 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); 1968 } 1969 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); 1970 1971 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) 1972 { 1973 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); 1974 } 1975 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); 1976 1977 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn) 1978 { 1979 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 1980 } 1981 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic); 1982 1983 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 1984 { 1985 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); 1986 } 1987 EXPORT_SYMBOL_GPL(gfn_to_pfn); 1988 1989 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) 1990 { 1991 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 1992 } 1993 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn); 1994 1995 int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 1996 struct page **pages, int nr_pages) 1997 { 1998 unsigned long addr; 1999 gfn_t entry = 0; 2000 2001 addr = gfn_to_hva_many(slot, gfn, &entry); 2002 if (kvm_is_error_hva(addr)) 2003 return -1; 2004 2005 if (entry < nr_pages) 2006 return 0; 2007 2008 return __get_user_pages_fast(addr, nr_pages, 1, pages); 2009 } 2010 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 2011 2012 static struct page *kvm_pfn_to_page(kvm_pfn_t pfn) 2013 { 2014 if (is_error_noslot_pfn(pfn)) 2015 return KVM_ERR_PTR_BAD_PAGE; 2016 2017 if (kvm_is_reserved_pfn(pfn)) { 2018 WARN_ON(1); 2019 return KVM_ERR_PTR_BAD_PAGE; 2020 } 2021 2022 return pfn_to_page(pfn); 2023 } 2024 2025 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 2026 { 2027 kvm_pfn_t pfn; 2028 2029 pfn = gfn_to_pfn(kvm, gfn); 2030 2031 return kvm_pfn_to_page(pfn); 2032 } 2033 EXPORT_SYMBOL_GPL(gfn_to_page); 2034 2035 void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache) 2036 { 2037 if (pfn == 0) 2038 return; 2039 2040 if (cache) 2041 cache->pfn = cache->gfn = 0; 2042 2043 if (dirty) 2044 kvm_release_pfn_dirty(pfn); 2045 else 2046 kvm_release_pfn_clean(pfn); 2047 } 2048 2049 static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn, 2050 struct gfn_to_pfn_cache *cache, u64 gen) 2051 { 2052 kvm_release_pfn(cache->pfn, cache->dirty, cache); 2053 2054 cache->pfn = gfn_to_pfn_memslot(slot, gfn); 2055 cache->gfn = gfn; 2056 cache->dirty = false; 2057 cache->generation = gen; 2058 } 2059 2060 static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn, 2061 struct kvm_host_map *map, 2062 struct gfn_to_pfn_cache *cache, 2063 bool atomic) 2064 { 2065 kvm_pfn_t pfn; 2066 void *hva = NULL; 2067 struct page *page = KVM_UNMAPPED_PAGE; 2068 struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn); 2069 u64 gen = slots->generation; 2070 2071 if (!map) 2072 return -EINVAL; 2073 2074 if (cache) { 2075 if (!cache->pfn || cache->gfn != gfn || 
2076 cache->generation != gen) { 2077 if (atomic) 2078 return -EAGAIN; 2079 kvm_cache_gfn_to_pfn(slot, gfn, cache, gen); 2080 } 2081 pfn = cache->pfn; 2082 } else { 2083 if (atomic) 2084 return -EAGAIN; 2085 pfn = gfn_to_pfn_memslot(slot, gfn); 2086 } 2087 if (is_error_noslot_pfn(pfn)) 2088 return -EINVAL; 2089 2090 if (pfn_valid(pfn)) { 2091 page = pfn_to_page(pfn); 2092 if (atomic) 2093 hva = kmap_atomic(page); 2094 else 2095 hva = kmap(page); 2096 #ifdef CONFIG_HAS_IOMEM 2097 } else if (!atomic) { 2098 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB); 2099 } else { 2100 return -EINVAL; 2101 #endif 2102 } 2103 2104 if (!hva) 2105 return -EFAULT; 2106 2107 map->page = page; 2108 map->hva = hva; 2109 map->pfn = pfn; 2110 map->gfn = gfn; 2111 2112 return 0; 2113 } 2114 2115 int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, 2116 struct gfn_to_pfn_cache *cache, bool atomic) 2117 { 2118 return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map, 2119 cache, atomic); 2120 } 2121 EXPORT_SYMBOL_GPL(kvm_map_gfn); 2122 2123 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) 2124 { 2125 return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map, 2126 NULL, false); 2127 } 2128 EXPORT_SYMBOL_GPL(kvm_vcpu_map); 2129 2130 static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot, 2131 struct kvm_host_map *map, 2132 struct gfn_to_pfn_cache *cache, 2133 bool dirty, bool atomic) 2134 { 2135 if (!map) 2136 return; 2137 2138 if (!map->hva) 2139 return; 2140 2141 if (map->page != KVM_UNMAPPED_PAGE) { 2142 if (atomic) 2143 kunmap_atomic(map->hva); 2144 else 2145 kunmap(map->page); 2146 } 2147 #ifdef CONFIG_HAS_IOMEM 2148 else if (!atomic) 2149 memunmap(map->hva); 2150 else 2151 WARN_ONCE(1, "Unexpected unmapping in atomic context"); 2152 #endif 2153 2154 if (dirty) 2155 mark_page_dirty_in_slot(memslot, map->gfn); 2156 2157 if (cache) 2158 cache->dirty |= dirty; 2159 else 2160 kvm_release_pfn(map->pfn, dirty, NULL); 2161 2162 map->hva = NULL; 2163 map->page = NULL; 2164 } 2165 2166 int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, 2167 struct gfn_to_pfn_cache *cache, bool dirty, bool atomic) 2168 { 2169 __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map, 2170 cache, dirty, atomic); 2171 return 0; 2172 } 2173 EXPORT_SYMBOL_GPL(kvm_unmap_gfn); 2174 2175 void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) 2176 { 2177 __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, NULL, 2178 dirty, false); 2179 } 2180 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); 2181 2182 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn) 2183 { 2184 kvm_pfn_t pfn; 2185 2186 pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn); 2187 2188 return kvm_pfn_to_page(pfn); 2189 } 2190 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page); 2191 2192 void kvm_release_page_clean(struct page *page) 2193 { 2194 WARN_ON(is_error_page(page)); 2195 2196 kvm_release_pfn_clean(page_to_pfn(page)); 2197 } 2198 EXPORT_SYMBOL_GPL(kvm_release_page_clean); 2199 2200 void kvm_release_pfn_clean(kvm_pfn_t pfn) 2201 { 2202 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) 2203 put_page(pfn_to_page(pfn)); 2204 } 2205 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 2206 2207 void kvm_release_page_dirty(struct page *page) 2208 { 2209 WARN_ON(is_error_page(page)); 2210 2211 kvm_release_pfn_dirty(page_to_pfn(page)); 2212 } 2213 EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 2214 2215 void kvm_release_pfn_dirty(kvm_pfn_t pfn) 2216 { 2217 kvm_set_pfn_dirty(pfn); 2218 
kvm_release_pfn_clean(pfn); 2219 } 2220 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); 2221 2222 void kvm_set_pfn_dirty(kvm_pfn_t pfn) 2223 { 2224 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) 2225 SetPageDirty(pfn_to_page(pfn)); 2226 } 2227 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 2228 2229 void kvm_set_pfn_accessed(kvm_pfn_t pfn) 2230 { 2231 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) 2232 mark_page_accessed(pfn_to_page(pfn)); 2233 } 2234 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 2235 2236 void kvm_get_pfn(kvm_pfn_t pfn) 2237 { 2238 if (!kvm_is_reserved_pfn(pfn)) 2239 get_page(pfn_to_page(pfn)); 2240 } 2241 EXPORT_SYMBOL_GPL(kvm_get_pfn); 2242 2243 static int next_segment(unsigned long len, int offset) 2244 { 2245 if (len > PAGE_SIZE - offset) 2246 return PAGE_SIZE - offset; 2247 else 2248 return len; 2249 } 2250 2251 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, 2252 void *data, int offset, int len) 2253 { 2254 int r; 2255 unsigned long addr; 2256 2257 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 2258 if (kvm_is_error_hva(addr)) 2259 return -EFAULT; 2260 r = __copy_from_user(data, (void __user *)addr + offset, len); 2261 if (r) 2262 return -EFAULT; 2263 return 0; 2264 } 2265 2266 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 2267 int len) 2268 { 2269 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 2270 2271 return __kvm_read_guest_page(slot, gfn, data, offset, len); 2272 } 2273 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 2274 2275 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, 2276 int offset, int len) 2277 { 2278 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2279 2280 return __kvm_read_guest_page(slot, gfn, data, offset, len); 2281 } 2282 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page); 2283 2284 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) 2285 { 2286 gfn_t gfn = gpa >> PAGE_SHIFT; 2287 int seg; 2288 int offset = offset_in_page(gpa); 2289 int ret; 2290 2291 while ((seg = next_segment(len, offset)) != 0) { 2292 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 2293 if (ret < 0) 2294 return ret; 2295 offset = 0; 2296 len -= seg; 2297 data += seg; 2298 ++gfn; 2299 } 2300 return 0; 2301 } 2302 EXPORT_SYMBOL_GPL(kvm_read_guest); 2303 2304 int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) 2305 { 2306 gfn_t gfn = gpa >> PAGE_SHIFT; 2307 int seg; 2308 int offset = offset_in_page(gpa); 2309 int ret; 2310 2311 while ((seg = next_segment(len, offset)) != 0) { 2312 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg); 2313 if (ret < 0) 2314 return ret; 2315 offset = 0; 2316 len -= seg; 2317 data += seg; 2318 ++gfn; 2319 } 2320 return 0; 2321 } 2322 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest); 2323 2324 static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 2325 void *data, int offset, unsigned long len) 2326 { 2327 int r; 2328 unsigned long addr; 2329 2330 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 2331 if (kvm_is_error_hva(addr)) 2332 return -EFAULT; 2333 pagefault_disable(); 2334 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 2335 pagefault_enable(); 2336 if (r) 2337 return -EFAULT; 2338 return 0; 2339 } 2340 2341 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, 2342 void *data, unsigned long len) 2343 { 2344 gfn_t gfn = gpa >> PAGE_SHIFT; 2345 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 
2346 int offset = offset_in_page(gpa); 2347 2348 return __kvm_read_guest_atomic(slot, gfn, data, offset, len); 2349 } 2350 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); 2351 2352 static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn, 2353 const void *data, int offset, int len) 2354 { 2355 int r; 2356 unsigned long addr; 2357 2358 addr = gfn_to_hva_memslot(memslot, gfn); 2359 if (kvm_is_error_hva(addr)) 2360 return -EFAULT; 2361 r = __copy_to_user((void __user *)addr + offset, data, len); 2362 if (r) 2363 return -EFAULT; 2364 mark_page_dirty_in_slot(memslot, gfn); 2365 return 0; 2366 } 2367 2368 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, 2369 const void *data, int offset, int len) 2370 { 2371 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 2372 2373 return __kvm_write_guest_page(slot, gfn, data, offset, len); 2374 } 2375 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 2376 2377 int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, 2378 const void *data, int offset, int len) 2379 { 2380 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2381 2382 return __kvm_write_guest_page(slot, gfn, data, offset, len); 2383 } 2384 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page); 2385 2386 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 2387 unsigned long len) 2388 { 2389 gfn_t gfn = gpa >> PAGE_SHIFT; 2390 int seg; 2391 int offset = offset_in_page(gpa); 2392 int ret; 2393 2394 while ((seg = next_segment(len, offset)) != 0) { 2395 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 2396 if (ret < 0) 2397 return ret; 2398 offset = 0; 2399 len -= seg; 2400 data += seg; 2401 ++gfn; 2402 } 2403 return 0; 2404 } 2405 EXPORT_SYMBOL_GPL(kvm_write_guest); 2406 2407 int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, 2408 unsigned long len) 2409 { 2410 gfn_t gfn = gpa >> PAGE_SHIFT; 2411 int seg; 2412 int offset = offset_in_page(gpa); 2413 int ret; 2414 2415 while ((seg = next_segment(len, offset)) != 0) { 2416 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg); 2417 if (ret < 0) 2418 return ret; 2419 offset = 0; 2420 len -= seg; 2421 data += seg; 2422 ++gfn; 2423 } 2424 return 0; 2425 } 2426 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); 2427 2428 static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, 2429 struct gfn_to_hva_cache *ghc, 2430 gpa_t gpa, unsigned long len) 2431 { 2432 int offset = offset_in_page(gpa); 2433 gfn_t start_gfn = gpa >> PAGE_SHIFT; 2434 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; 2435 gfn_t nr_pages_needed = end_gfn - start_gfn + 1; 2436 gfn_t nr_pages_avail; 2437 2438 /* Update ghc->generation before performing any error checks. */ 2439 ghc->generation = slots->generation; 2440 2441 if (start_gfn > end_gfn) { 2442 ghc->hva = KVM_HVA_ERR_BAD; 2443 return -EINVAL; 2444 } 2445 2446 /* 2447 * If the requested region crosses two memslots, we still 2448 * verify that the entire region is valid here. 2449 */ 2450 for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) { 2451 ghc->memslot = __gfn_to_memslot(slots, start_gfn); 2452 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, 2453 &nr_pages_avail); 2454 if (kvm_is_error_hva(ghc->hva)) 2455 return -EFAULT; 2456 } 2457 2458 /* Use the slow path for cross page reads and writes. 
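 * (If the region spans more than one page, ghc->memslot is left NULL below;
 * kvm_read_guest_cached() and kvm_write_guest_offset_cached() check for that
 * and fall back to the uncached kvm_read_guest()/kvm_write_guest() helpers.)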
*/ 2459 if (nr_pages_needed == 1) 2460 ghc->hva += offset; 2461 else 2462 ghc->memslot = NULL; 2463 2464 ghc->gpa = gpa; 2465 ghc->len = len; 2466 return 0; 2467 } 2468 2469 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2470 gpa_t gpa, unsigned long len) 2471 { 2472 struct kvm_memslots *slots = kvm_memslots(kvm); 2473 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); 2474 } 2475 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); 2476 2477 int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2478 void *data, unsigned int offset, 2479 unsigned long len) 2480 { 2481 struct kvm_memslots *slots = kvm_memslots(kvm); 2482 int r; 2483 gpa_t gpa = ghc->gpa + offset; 2484 2485 BUG_ON(len + offset > ghc->len); 2486 2487 if (slots->generation != ghc->generation) { 2488 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) 2489 return -EFAULT; 2490 } 2491 2492 if (kvm_is_error_hva(ghc->hva)) 2493 return -EFAULT; 2494 2495 if (unlikely(!ghc->memslot)) 2496 return kvm_write_guest(kvm, gpa, data, len); 2497 2498 r = __copy_to_user((void __user *)ghc->hva + offset, data, len); 2499 if (r) 2500 return -EFAULT; 2501 mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT); 2502 2503 return 0; 2504 } 2505 EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); 2506 2507 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2508 void *data, unsigned long len) 2509 { 2510 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); 2511 } 2512 EXPORT_SYMBOL_GPL(kvm_write_guest_cached); 2513 2514 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2515 void *data, unsigned long len) 2516 { 2517 struct kvm_memslots *slots = kvm_memslots(kvm); 2518 int r; 2519 2520 BUG_ON(len > ghc->len); 2521 2522 if (slots->generation != ghc->generation) { 2523 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) 2524 return -EFAULT; 2525 } 2526 2527 if (kvm_is_error_hva(ghc->hva)) 2528 return -EFAULT; 2529 2530 if (unlikely(!ghc->memslot)) 2531 return kvm_read_guest(kvm, ghc->gpa, data, len); 2532 2533 r = __copy_from_user(data, (void __user *)ghc->hva, len); 2534 if (r) 2535 return -EFAULT; 2536 2537 return 0; 2538 } 2539 EXPORT_SYMBOL_GPL(kvm_read_guest_cached); 2540 2541 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 2542 { 2543 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 2544 2545 return kvm_write_guest_page(kvm, gfn, zero_page, offset, len); 2546 } 2547 EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 2548 2549 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 2550 { 2551 gfn_t gfn = gpa >> PAGE_SHIFT; 2552 int seg; 2553 int offset = offset_in_page(gpa); 2554 int ret; 2555 2556 while ((seg = next_segment(len, offset)) != 0) { 2557 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 2558 if (ret < 0) 2559 return ret; 2560 offset = 0; 2561 len -= seg; 2562 ++gfn; 2563 } 2564 return 0; 2565 } 2566 EXPORT_SYMBOL_GPL(kvm_clear_guest); 2567 2568 static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, 2569 gfn_t gfn) 2570 { 2571 if (memslot && memslot->dirty_bitmap) { 2572 unsigned long rel_gfn = gfn - memslot->base_gfn; 2573 2574 set_bit_le(rel_gfn, memslot->dirty_bitmap); 2575 } 2576 } 2577 2578 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 2579 { 2580 struct kvm_memory_slot *memslot; 2581 2582 memslot = gfn_to_memslot(kvm, gfn); 2583 mark_page_dirty_in_slot(memslot, gfn); 2584 } 2585 EXPORT_SYMBOL_GPL(mark_page_dirty); 2586 2587 
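/*
 * Illustrative usage sketch (not part of the original source): a caller that
 * writes guest memory through a host mapping it set up itself, bypassing the
 * kvm_write_guest*() helpers above, is expected to flag the page so that the
 * write is visible to dirty logging (KVM_GET_DIRTY_LOG), e.g.:
 *
 *	memcpy(hva, data, len);                   // raw write via a host mapping
 *	mark_page_dirty(kvm, gpa >> PAGE_SHIFT);  // record it in the dirty bitmap
 *
 * Here hva, data, len and gpa are hypothetical local variables. The
 * kvm_write_guest*() and cached-write helpers already call
 * mark_page_dirty_in_slot() on success, so no extra call is needed on those
 * paths.
 */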
void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) 2588 { 2589 struct kvm_memory_slot *memslot; 2590 2591 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2592 mark_page_dirty_in_slot(memslot, gfn); 2593 } 2594 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); 2595 2596 void kvm_sigset_activate(struct kvm_vcpu *vcpu) 2597 { 2598 if (!vcpu->sigset_active) 2599 return; 2600 2601 /* 2602 * This does a lockless modification of ->real_blocked, which is fine 2603 * because only current can change ->real_blocked and all readers of 2604 * ->real_blocked don't care as long as ->real_blocked is always a subset 2605 * of ->blocked. 2606 */ 2607 sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked); 2608 } 2609 2610 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu) 2611 { 2612 if (!vcpu->sigset_active) 2613 return; 2614 2615 sigprocmask(SIG_SETMASK, &current->real_blocked, NULL); 2616 sigemptyset(&current->real_blocked); 2617 } 2618 2619 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) 2620 { 2621 unsigned int old, val, grow, grow_start; 2622 2623 old = val = vcpu->halt_poll_ns; 2624 grow_start = READ_ONCE(halt_poll_ns_grow_start); 2625 grow = READ_ONCE(halt_poll_ns_grow); 2626 if (!grow) 2627 goto out; 2628 2629 val *= grow; 2630 if (val < grow_start) 2631 val = grow_start; 2632 2633 if (val > halt_poll_ns) 2634 val = halt_poll_ns; 2635 2636 vcpu->halt_poll_ns = val; 2637 out: 2638 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old); 2639 } 2640 2641 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu) 2642 { 2643 unsigned int old, val, shrink; 2644 2645 old = val = vcpu->halt_poll_ns; 2646 shrink = READ_ONCE(halt_poll_ns_shrink); 2647 if (shrink == 0) 2648 val = 0; 2649 else 2650 val /= shrink; 2651 2652 vcpu->halt_poll_ns = val; 2653 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old); 2654 } 2655 2656 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) 2657 { 2658 int ret = -EINTR; 2659 int idx = srcu_read_lock(&vcpu->kvm->srcu); 2660 2661 if (kvm_arch_vcpu_runnable(vcpu)) { 2662 kvm_make_request(KVM_REQ_UNHALT, vcpu); 2663 goto out; 2664 } 2665 if (kvm_cpu_has_pending_timer(vcpu)) 2666 goto out; 2667 if (signal_pending(current)) 2668 goto out; 2669 2670 ret = 0; 2671 out: 2672 srcu_read_unlock(&vcpu->kvm->srcu, idx); 2673 return ret; 2674 } 2675 2676 /* 2677 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 2678 */ 2679 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 2680 { 2681 ktime_t start, cur; 2682 DECLARE_SWAITQUEUE(wait); 2683 bool waited = false; 2684 u64 block_ns; 2685 2686 kvm_arch_vcpu_blocking(vcpu); 2687 2688 start = cur = ktime_get(); 2689 if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) { 2690 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns); 2691 2692 ++vcpu->stat.halt_attempted_poll; 2693 do { 2694 /* 2695 * This sets KVM_REQ_UNHALT if an interrupt 2696 * arrives.
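 * (A negative return from kvm_vcpu_check_block() then ends the polling loop
 * right away, before the vCPU ever schedules out, and the wakeup is counted
 * in halt_successful_poll.)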
2697 */ 2698 if (kvm_vcpu_check_block(vcpu) < 0) { 2699 ++vcpu->stat.halt_successful_poll; 2700 if (!vcpu_valid_wakeup(vcpu)) 2701 ++vcpu->stat.halt_poll_invalid; 2702 goto out; 2703 } 2704 cur = ktime_get(); 2705 } while (single_task_running() && ktime_before(cur, stop)); 2706 } 2707 2708 for (;;) { 2709 prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 2710 2711 if (kvm_vcpu_check_block(vcpu) < 0) 2712 break; 2713 2714 waited = true; 2715 schedule(); 2716 } 2717 2718 finish_swait(&vcpu->wq, &wait); 2719 cur = ktime_get(); 2720 out: 2721 kvm_arch_vcpu_unblocking(vcpu); 2722 block_ns = ktime_to_ns(cur) - ktime_to_ns(start); 2723 2724 if (!kvm_arch_no_poll(vcpu)) { 2725 if (!vcpu_valid_wakeup(vcpu)) { 2726 shrink_halt_poll_ns(vcpu); 2727 } else if (halt_poll_ns) { 2728 if (block_ns <= vcpu->halt_poll_ns) 2729 ; 2730 /* we had a long block, shrink polling */ 2731 else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns) 2732 shrink_halt_poll_ns(vcpu); 2733 /* we had a short halt and our poll time is too small */ 2734 else if (vcpu->halt_poll_ns < halt_poll_ns && 2735 block_ns < halt_poll_ns) 2736 grow_halt_poll_ns(vcpu); 2737 } else { 2738 vcpu->halt_poll_ns = 0; 2739 } 2740 } 2741 2742 trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu)); 2743 kvm_arch_vcpu_block_finish(vcpu); 2744 } 2745 EXPORT_SYMBOL_GPL(kvm_vcpu_block); 2746 2747 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) 2748 { 2749 struct swait_queue_head *wqp; 2750 2751 wqp = kvm_arch_vcpu_wq(vcpu); 2752 if (swq_has_sleeper(wqp)) { 2753 swake_up_one(wqp); 2754 WRITE_ONCE(vcpu->ready, true); 2755 ++vcpu->stat.halt_wakeup; 2756 return true; 2757 } 2758 2759 return false; 2760 } 2761 EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); 2762 2763 #ifndef CONFIG_S390 2764 /* 2765 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. 2766 */ 2767 void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 2768 { 2769 int me; 2770 int cpu = vcpu->cpu; 2771 2772 if (kvm_vcpu_wake_up(vcpu)) 2773 return; 2774 2775 me = get_cpu(); 2776 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 2777 if (kvm_arch_vcpu_should_kick(vcpu)) 2778 smp_send_reschedule(cpu); 2779 put_cpu(); 2780 } 2781 EXPORT_SYMBOL_GPL(kvm_vcpu_kick); 2782 #endif /* !CONFIG_S390 */ 2783 2784 int kvm_vcpu_yield_to(struct kvm_vcpu *target) 2785 { 2786 struct pid *pid; 2787 struct task_struct *task = NULL; 2788 int ret = 0; 2789 2790 rcu_read_lock(); 2791 pid = rcu_dereference(target->pid); 2792 if (pid) 2793 task = get_pid_task(pid, PIDTYPE_PID); 2794 rcu_read_unlock(); 2795 if (!task) 2796 return ret; 2797 ret = yield_to(task, 1); 2798 put_task_struct(task); 2799 2800 return ret; 2801 } 2802 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); 2803 2804 /* 2805 * Helper that checks whether a VCPU is eligible for directed yield. 2806 * Most eligible candidate to yield is decided by following heuristics: 2807 * 2808 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently 2809 * (preempted lock holder), indicated by @in_spin_loop. 2810 * Set at the beiginning and cleared at the end of interception/PLE handler. 2811 * 2812 * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get 2813 * chance last time (mostly it has become eligible now since we have probably 2814 * yielded to lockholder in last iteration. This is done by toggling 2815 * @dy_eligible each time a VCPU checked for eligibility.) 
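 * Worked example: a VCPU seen with in_spin_loop set and dy_eligible clear is
 * skipped on this scan, but the check toggles dy_eligible, so the same VCPU
 * becomes an eligible target on the next directed yield.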
2816 * 2817 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding 2818 * to preempted lock-holder could result in wrong VCPU selection and CPU 2819 * burning. Giving priority for a potential lock-holder increases lock 2820 * progress. 2821 * 2822 * Since algorithm is based on heuristics, accessing another VCPU data without 2823 * locking does not harm. It may result in trying to yield to same VCPU, fail 2824 * and continue with next VCPU and so on. 2825 */ 2826 static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) 2827 { 2828 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT 2829 bool eligible; 2830 2831 eligible = !vcpu->spin_loop.in_spin_loop || 2832 vcpu->spin_loop.dy_eligible; 2833 2834 if (vcpu->spin_loop.in_spin_loop) 2835 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); 2836 2837 return eligible; 2838 #else 2839 return true; 2840 #endif 2841 } 2842 2843 /* 2844 * Unlike kvm_arch_vcpu_runnable, this function is called outside 2845 * a vcpu_load/vcpu_put pair. However, for most architectures 2846 * kvm_arch_vcpu_runnable does not require vcpu_load. 2847 */ 2848 bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) 2849 { 2850 return kvm_arch_vcpu_runnable(vcpu); 2851 } 2852 2853 static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu) 2854 { 2855 if (kvm_arch_dy_runnable(vcpu)) 2856 return true; 2857 2858 #ifdef CONFIG_KVM_ASYNC_PF 2859 if (!list_empty_careful(&vcpu->async_pf.done)) 2860 return true; 2861 #endif 2862 2863 return false; 2864 } 2865 2866 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) 2867 { 2868 struct kvm *kvm = me->kvm; 2869 struct kvm_vcpu *vcpu; 2870 int last_boosted_vcpu = me->kvm->last_boosted_vcpu; 2871 int yielded = 0; 2872 int try = 3; 2873 int pass; 2874 int i; 2875 2876 kvm_vcpu_set_in_spin_loop(me, true); 2877 /* 2878 * We boost the priority of a VCPU that is runnable but not 2879 * currently running, because it got preempted by something 2880 * else and called schedule in __vcpu_run. Hopefully that 2881 * VCPU is holding the lock that we need and will release it. 2882 * We approximate round-robin by starting at the last boosted VCPU. 
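 * Concretely, the loop below makes at most two passes over the vcpu array:
 * pass 0 scans the VCPUs after last_boosted_vcpu, and pass 1 wraps around
 * and stops once it moves past last_boosted_vcpu again.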
2883 */ 2884 for (pass = 0; pass < 2 && !yielded && try; pass++) { 2885 kvm_for_each_vcpu(i, vcpu, kvm) { 2886 if (!pass && i <= last_boosted_vcpu) { 2887 i = last_boosted_vcpu; 2888 continue; 2889 } else if (pass && i > last_boosted_vcpu) 2890 break; 2891 if (!READ_ONCE(vcpu->ready)) 2892 continue; 2893 if (vcpu == me) 2894 continue; 2895 if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu)) 2896 continue; 2897 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && 2898 !kvm_arch_vcpu_in_kernel(vcpu)) 2899 continue; 2900 if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) 2901 continue; 2902 2903 yielded = kvm_vcpu_yield_to(vcpu); 2904 if (yielded > 0) { 2905 kvm->last_boosted_vcpu = i; 2906 break; 2907 } else if (yielded < 0) { 2908 try--; 2909 if (!try) 2910 break; 2911 } 2912 } 2913 } 2914 kvm_vcpu_set_in_spin_loop(me, false); 2915 2916 /* Ensure vcpu is not eligible during next spinloop */ 2917 kvm_vcpu_set_dy_eligible(me, false); 2918 } 2919 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); 2920 2921 static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf) 2922 { 2923 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data; 2924 struct page *page; 2925 2926 if (vmf->pgoff == 0) 2927 page = virt_to_page(vcpu->run); 2928 #ifdef CONFIG_X86 2929 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 2930 page = virt_to_page(vcpu->arch.pio_data); 2931 #endif 2932 #ifdef CONFIG_KVM_MMIO 2933 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 2934 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 2935 #endif 2936 else 2937 return kvm_arch_vcpu_fault(vcpu, vmf); 2938 get_page(page); 2939 vmf->page = page; 2940 return 0; 2941 } 2942 2943 static const struct vm_operations_struct kvm_vcpu_vm_ops = { 2944 .fault = kvm_vcpu_fault, 2945 }; 2946 2947 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 2948 { 2949 vma->vm_ops = &kvm_vcpu_vm_ops; 2950 return 0; 2951 } 2952 2953 static int kvm_vcpu_release(struct inode *inode, struct file *filp) 2954 { 2955 struct kvm_vcpu *vcpu = filp->private_data; 2956 2957 debugfs_remove_recursive(vcpu->debugfs_dentry); 2958 kvm_put_kvm(vcpu->kvm); 2959 return 0; 2960 } 2961 2962 static struct file_operations kvm_vcpu_fops = { 2963 .release = kvm_vcpu_release, 2964 .unlocked_ioctl = kvm_vcpu_ioctl, 2965 .mmap = kvm_vcpu_mmap, 2966 .llseek = noop_llseek, 2967 KVM_COMPAT(kvm_vcpu_compat_ioctl), 2968 }; 2969 2970 /* 2971 * Allocates an inode for the vcpu. 2972 */ 2973 static int create_vcpu_fd(struct kvm_vcpu *vcpu) 2974 { 2975 char name[8 + 1 + ITOA_MAX_LEN + 1]; 2976 2977 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id); 2978 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); 2979 } 2980 2981 static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) 2982 { 2983 #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS 2984 char dir_name[ITOA_MAX_LEN * 2]; 2985 2986 if (!debugfs_initialized()) 2987 return; 2988 2989 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id); 2990 vcpu->debugfs_dentry = debugfs_create_dir(dir_name, 2991 vcpu->kvm->debugfs_dentry); 2992 2993 kvm_arch_create_vcpu_debugfs(vcpu); 2994 #endif 2995 } 2996 2997 /* 2998 * Creates some virtual cpus. Good luck creating more than one. 
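 * (The requested id must be below KVM_MAX_VCPU_ID and the VM must still be
 * under KVM_MAX_VCPUS created vCPUs; exceeding either limit fails with
 * -EINVAL, as the checks below show.)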
2999 */ 3000 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) 3001 { 3002 int r; 3003 struct kvm_vcpu *vcpu; 3004 struct page *page; 3005 3006 if (id >= KVM_MAX_VCPU_ID) 3007 return -EINVAL; 3008 3009 mutex_lock(&kvm->lock); 3010 if (kvm->created_vcpus == KVM_MAX_VCPUS) { 3011 mutex_unlock(&kvm->lock); 3012 return -EINVAL; 3013 } 3014 3015 kvm->created_vcpus++; 3016 mutex_unlock(&kvm->lock); 3017 3018 r = kvm_arch_vcpu_precreate(kvm, id); 3019 if (r) 3020 goto vcpu_decrement; 3021 3022 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 3023 if (!vcpu) { 3024 r = -ENOMEM; 3025 goto vcpu_decrement; 3026 } 3027 3028 BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE); 3029 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 3030 if (!page) { 3031 r = -ENOMEM; 3032 goto vcpu_free; 3033 } 3034 vcpu->run = page_address(page); 3035 3036 kvm_vcpu_init(vcpu, kvm, id); 3037 3038 r = kvm_arch_vcpu_create(vcpu); 3039 if (r) 3040 goto vcpu_free_run_page; 3041 3042 kvm_create_vcpu_debugfs(vcpu); 3043 3044 mutex_lock(&kvm->lock); 3045 if (kvm_get_vcpu_by_id(kvm, id)) { 3046 r = -EEXIST; 3047 goto unlock_vcpu_destroy; 3048 } 3049 3050 vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus); 3051 BUG_ON(kvm->vcpus[vcpu->vcpu_idx]); 3052 3053 /* Now it's all set up, let userspace reach it */ 3054 kvm_get_kvm(kvm); 3055 r = create_vcpu_fd(vcpu); 3056 if (r < 0) { 3057 kvm_put_kvm_no_destroy(kvm); 3058 goto unlock_vcpu_destroy; 3059 } 3060 3061 kvm->vcpus[vcpu->vcpu_idx] = vcpu; 3062 3063 /* 3064 * Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus 3065 * before kvm->online_vcpu's incremented value. 3066 */ 3067 smp_wmb(); 3068 atomic_inc(&kvm->online_vcpus); 3069 3070 mutex_unlock(&kvm->lock); 3071 kvm_arch_vcpu_postcreate(vcpu); 3072 return r; 3073 3074 unlock_vcpu_destroy: 3075 mutex_unlock(&kvm->lock); 3076 debugfs_remove_recursive(vcpu->debugfs_dentry); 3077 kvm_arch_vcpu_destroy(vcpu); 3078 vcpu_free_run_page: 3079 free_page((unsigned long)vcpu->run); 3080 vcpu_free: 3081 kmem_cache_free(kvm_vcpu_cache, vcpu); 3082 vcpu_decrement: 3083 mutex_lock(&kvm->lock); 3084 kvm->created_vcpus--; 3085 mutex_unlock(&kvm->lock); 3086 return r; 3087 } 3088 3089 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 3090 { 3091 if (sigset) { 3092 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 3093 vcpu->sigset_active = 1; 3094 vcpu->sigset = *sigset; 3095 } else 3096 vcpu->sigset_active = 0; 3097 return 0; 3098 } 3099 3100 static long kvm_vcpu_ioctl(struct file *filp, 3101 unsigned int ioctl, unsigned long arg) 3102 { 3103 struct kvm_vcpu *vcpu = filp->private_data; 3104 void __user *argp = (void __user *)arg; 3105 int r; 3106 struct kvm_fpu *fpu = NULL; 3107 struct kvm_sregs *kvm_sregs = NULL; 3108 3109 if (vcpu->kvm->mm != current->mm) 3110 return -EIO; 3111 3112 if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) 3113 return -EINVAL; 3114 3115 /* 3116 * Some architectures have vcpu ioctls that are asynchronous to vcpu 3117 * execution; mutex_lock() would break them. 3118 */ 3119 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg); 3120 if (r != -ENOIOCTLCMD) 3121 return r; 3122 3123 if (mutex_lock_killable(&vcpu->mutex)) 3124 return -EINTR; 3125 switch (ioctl) { 3126 case KVM_RUN: { 3127 struct pid *oldpid; 3128 r = -EINVAL; 3129 if (arg) 3130 goto out; 3131 oldpid = rcu_access_pointer(vcpu->pid); 3132 if (unlikely(oldpid != task_pid(current))) { 3133 /* The thread running this VCPU changed. 
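 * Point vcpu->pid at the new task so readers such as kvm_vcpu_yield_to()
 * see it; the old pid is only put after synchronize_rcu(), because readers
 * dereference it under rcu_read_lock().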
*/ 3134 struct pid *newpid; 3135 3136 r = kvm_arch_vcpu_run_pid_change(vcpu); 3137 if (r) 3138 break; 3139 3140 newpid = get_task_pid(current, PIDTYPE_PID); 3141 rcu_assign_pointer(vcpu->pid, newpid); 3142 if (oldpid) 3143 synchronize_rcu(); 3144 put_pid(oldpid); 3145 } 3146 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 3147 trace_kvm_userspace_exit(vcpu->run->exit_reason, r); 3148 break; 3149 } 3150 case KVM_GET_REGS: { 3151 struct kvm_regs *kvm_regs; 3152 3153 r = -ENOMEM; 3154 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT); 3155 if (!kvm_regs) 3156 goto out; 3157 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 3158 if (r) 3159 goto out_free1; 3160 r = -EFAULT; 3161 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 3162 goto out_free1; 3163 r = 0; 3164 out_free1: 3165 kfree(kvm_regs); 3166 break; 3167 } 3168 case KVM_SET_REGS: { 3169 struct kvm_regs *kvm_regs; 3170 3171 r = -ENOMEM; 3172 kvm_regs = memdup_user(argp, sizeof(*kvm_regs)); 3173 if (IS_ERR(kvm_regs)) { 3174 r = PTR_ERR(kvm_regs); 3175 goto out; 3176 } 3177 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 3178 kfree(kvm_regs); 3179 break; 3180 } 3181 case KVM_GET_SREGS: { 3182 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), 3183 GFP_KERNEL_ACCOUNT); 3184 r = -ENOMEM; 3185 if (!kvm_sregs) 3186 goto out; 3187 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 3188 if (r) 3189 goto out; 3190 r = -EFAULT; 3191 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 3192 goto out; 3193 r = 0; 3194 break; 3195 } 3196 case KVM_SET_SREGS: { 3197 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); 3198 if (IS_ERR(kvm_sregs)) { 3199 r = PTR_ERR(kvm_sregs); 3200 kvm_sregs = NULL; 3201 goto out; 3202 } 3203 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 3204 break; 3205 } 3206 case KVM_GET_MP_STATE: { 3207 struct kvm_mp_state mp_state; 3208 3209 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 3210 if (r) 3211 goto out; 3212 r = -EFAULT; 3213 if (copy_to_user(argp, &mp_state, sizeof(mp_state))) 3214 goto out; 3215 r = 0; 3216 break; 3217 } 3218 case KVM_SET_MP_STATE: { 3219 struct kvm_mp_state mp_state; 3220 3221 r = -EFAULT; 3222 if (copy_from_user(&mp_state, argp, sizeof(mp_state))) 3223 goto out; 3224 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 3225 break; 3226 } 3227 case KVM_TRANSLATE: { 3228 struct kvm_translation tr; 3229 3230 r = -EFAULT; 3231 if (copy_from_user(&tr, argp, sizeof(tr))) 3232 goto out; 3233 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 3234 if (r) 3235 goto out; 3236 r = -EFAULT; 3237 if (copy_to_user(argp, &tr, sizeof(tr))) 3238 goto out; 3239 r = 0; 3240 break; 3241 } 3242 case KVM_SET_GUEST_DEBUG: { 3243 struct kvm_guest_debug dbg; 3244 3245 r = -EFAULT; 3246 if (copy_from_user(&dbg, argp, sizeof(dbg))) 3247 goto out; 3248 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 3249 break; 3250 } 3251 case KVM_SET_SIGNAL_MASK: { 3252 struct kvm_signal_mask __user *sigmask_arg = argp; 3253 struct kvm_signal_mask kvm_sigmask; 3254 sigset_t sigset, *p; 3255 3256 p = NULL; 3257 if (argp) { 3258 r = -EFAULT; 3259 if (copy_from_user(&kvm_sigmask, argp, 3260 sizeof(kvm_sigmask))) 3261 goto out; 3262 r = -EINVAL; 3263 if (kvm_sigmask.len != sizeof(sigset)) 3264 goto out; 3265 r = -EFAULT; 3266 if (copy_from_user(&sigset, sigmask_arg->sigset, 3267 sizeof(sigset))) 3268 goto out; 3269 p = &sigset; 3270 } 3271 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); 3272 break; 3273 } 3274 case KVM_GET_FPU: { 3275 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT); 3276 r = -ENOMEM; 
3277 if (!fpu) 3278 goto out; 3279 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); 3280 if (r) 3281 goto out; 3282 r = -EFAULT; 3283 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) 3284 goto out; 3285 r = 0; 3286 break; 3287 } 3288 case KVM_SET_FPU: { 3289 fpu = memdup_user(argp, sizeof(*fpu)); 3290 if (IS_ERR(fpu)) { 3291 r = PTR_ERR(fpu); 3292 fpu = NULL; 3293 goto out; 3294 } 3295 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 3296 break; 3297 } 3298 default: 3299 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 3300 } 3301 out: 3302 mutex_unlock(&vcpu->mutex); 3303 kfree(fpu); 3304 kfree(kvm_sregs); 3305 return r; 3306 } 3307 3308 #ifdef CONFIG_KVM_COMPAT 3309 static long kvm_vcpu_compat_ioctl(struct file *filp, 3310 unsigned int ioctl, unsigned long arg) 3311 { 3312 struct kvm_vcpu *vcpu = filp->private_data; 3313 void __user *argp = compat_ptr(arg); 3314 int r; 3315 3316 if (vcpu->kvm->mm != current->mm) 3317 return -EIO; 3318 3319 switch (ioctl) { 3320 case KVM_SET_SIGNAL_MASK: { 3321 struct kvm_signal_mask __user *sigmask_arg = argp; 3322 struct kvm_signal_mask kvm_sigmask; 3323 sigset_t sigset; 3324 3325 if (argp) { 3326 r = -EFAULT; 3327 if (copy_from_user(&kvm_sigmask, argp, 3328 sizeof(kvm_sigmask))) 3329 goto out; 3330 r = -EINVAL; 3331 if (kvm_sigmask.len != sizeof(compat_sigset_t)) 3332 goto out; 3333 r = -EFAULT; 3334 if (get_compat_sigset(&sigset, (void *)sigmask_arg->sigset)) 3335 goto out; 3336 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 3337 } else 3338 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL); 3339 break; 3340 } 3341 default: 3342 r = kvm_vcpu_ioctl(filp, ioctl, arg); 3343 } 3344 3345 out: 3346 return r; 3347 } 3348 #endif 3349 3350 static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma) 3351 { 3352 struct kvm_device *dev = filp->private_data; 3353 3354 if (dev->ops->mmap) 3355 return dev->ops->mmap(dev, vma); 3356 3357 return -ENODEV; 3358 } 3359 3360 static int kvm_device_ioctl_attr(struct kvm_device *dev, 3361 int (*accessor)(struct kvm_device *dev, 3362 struct kvm_device_attr *attr), 3363 unsigned long arg) 3364 { 3365 struct kvm_device_attr attr; 3366 3367 if (!accessor) 3368 return -EPERM; 3369 3370 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) 3371 return -EFAULT; 3372 3373 return accessor(dev, &attr); 3374 } 3375 3376 static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, 3377 unsigned long arg) 3378 { 3379 struct kvm_device *dev = filp->private_data; 3380 3381 if (dev->kvm->mm != current->mm) 3382 return -EIO; 3383 3384 switch (ioctl) { 3385 case KVM_SET_DEVICE_ATTR: 3386 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg); 3387 case KVM_GET_DEVICE_ATTR: 3388 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg); 3389 case KVM_HAS_DEVICE_ATTR: 3390 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg); 3391 default: 3392 if (dev->ops->ioctl) 3393 return dev->ops->ioctl(dev, ioctl, arg); 3394 3395 return -ENOTTY; 3396 } 3397 } 3398 3399 static int kvm_device_release(struct inode *inode, struct file *filp) 3400 { 3401 struct kvm_device *dev = filp->private_data; 3402 struct kvm *kvm = dev->kvm; 3403 3404 if (dev->ops->release) { 3405 mutex_lock(&kvm->lock); 3406 list_del(&dev->vm_node); 3407 dev->ops->release(dev); 3408 mutex_unlock(&kvm->lock); 3409 } 3410 3411 kvm_put_kvm(kvm); 3412 return 0; 3413 } 3414 3415 static const struct file_operations kvm_device_fops = { 3416 .unlocked_ioctl = kvm_device_ioctl, 3417 .release = kvm_device_release, 3418 KVM_COMPAT(kvm_device_ioctl), 3419 .mmap = 
kvm_device_mmap, 3420 }; 3421 3422 struct kvm_device *kvm_device_from_filp(struct file *filp) 3423 { 3424 if (filp->f_op != &kvm_device_fops) 3425 return NULL; 3426 3427 return filp->private_data; 3428 } 3429 3430 static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { 3431 #ifdef CONFIG_KVM_MPIC 3432 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, 3433 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, 3434 #endif 3435 }; 3436 3437 int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type) 3438 { 3439 if (type >= ARRAY_SIZE(kvm_device_ops_table)) 3440 return -ENOSPC; 3441 3442 if (kvm_device_ops_table[type] != NULL) 3443 return -EEXIST; 3444 3445 kvm_device_ops_table[type] = ops; 3446 return 0; 3447 } 3448 3449 void kvm_unregister_device_ops(u32 type) 3450 { 3451 if (kvm_device_ops_table[type] != NULL) 3452 kvm_device_ops_table[type] = NULL; 3453 } 3454 3455 static int kvm_ioctl_create_device(struct kvm *kvm, 3456 struct kvm_create_device *cd) 3457 { 3458 const struct kvm_device_ops *ops = NULL; 3459 struct kvm_device *dev; 3460 bool test = cd->flags & KVM_CREATE_DEVICE_TEST; 3461 int type; 3462 int ret; 3463 3464 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) 3465 return -ENODEV; 3466 3467 type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table)); 3468 ops = kvm_device_ops_table[type]; 3469 if (ops == NULL) 3470 return -ENODEV; 3471 3472 if (test) 3473 return 0; 3474 3475 dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT); 3476 if (!dev) 3477 return -ENOMEM; 3478 3479 dev->ops = ops; 3480 dev->kvm = kvm; 3481 3482 mutex_lock(&kvm->lock); 3483 ret = ops->create(dev, type); 3484 if (ret < 0) { 3485 mutex_unlock(&kvm->lock); 3486 kfree(dev); 3487 return ret; 3488 } 3489 list_add(&dev->vm_node, &kvm->devices); 3490 mutex_unlock(&kvm->lock); 3491 3492 if (ops->init) 3493 ops->init(dev); 3494 3495 kvm_get_kvm(kvm); 3496 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); 3497 if (ret < 0) { 3498 kvm_put_kvm_no_destroy(kvm); 3499 mutex_lock(&kvm->lock); 3500 list_del(&dev->vm_node); 3501 mutex_unlock(&kvm->lock); 3502 ops->destroy(dev); 3503 return ret; 3504 } 3505 3506 cd->fd = ret; 3507 return 0; 3508 } 3509 3510 static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) 3511 { 3512 switch (arg) { 3513 case KVM_CAP_USER_MEMORY: 3514 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 3515 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 3516 case KVM_CAP_INTERNAL_ERROR_DATA: 3517 #ifdef CONFIG_HAVE_KVM_MSI 3518 case KVM_CAP_SIGNAL_MSI: 3519 #endif 3520 #ifdef CONFIG_HAVE_KVM_IRQFD 3521 case KVM_CAP_IRQFD: 3522 case KVM_CAP_IRQFD_RESAMPLE: 3523 #endif 3524 case KVM_CAP_IOEVENTFD_ANY_LENGTH: 3525 case KVM_CAP_CHECK_EXTENSION_VM: 3526 case KVM_CAP_ENABLE_CAP_VM: 3527 return 1; 3528 #ifdef CONFIG_KVM_MMIO 3529 case KVM_CAP_COALESCED_MMIO: 3530 return KVM_COALESCED_MMIO_PAGE_OFFSET; 3531 case KVM_CAP_COALESCED_PIO: 3532 return 1; 3533 #endif 3534 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 3535 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: 3536 return KVM_DIRTY_LOG_MANUAL_CAPS; 3537 #endif 3538 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 3539 case KVM_CAP_IRQ_ROUTING: 3540 return KVM_MAX_IRQ_ROUTES; 3541 #endif 3542 #if KVM_ADDRESS_SPACE_NUM > 1 3543 case KVM_CAP_MULTI_ADDRESS_SPACE: 3544 return KVM_ADDRESS_SPACE_NUM; 3545 #endif 3546 case KVM_CAP_NR_MEMSLOTS: 3547 return KVM_USER_MEM_SLOTS; 3548 default: 3549 break; 3550 } 3551 return kvm_vm_ioctl_check_extension(kvm, arg); 3552 } 3553 3554 int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct 
kvm *kvm, 3555 struct kvm_enable_cap *cap) 3556 { 3557 return -EINVAL; 3558 } 3559 3560 static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, 3561 struct kvm_enable_cap *cap) 3562 { 3563 switch (cap->cap) { 3564 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 3565 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: { 3566 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE; 3567 3568 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE) 3569 allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS; 3570 3571 if (cap->flags || (cap->args[0] & ~allowed_options)) 3572 return -EINVAL; 3573 kvm->manual_dirty_log_protect = cap->args[0]; 3574 return 0; 3575 } 3576 #endif 3577 default: 3578 return kvm_vm_ioctl_enable_cap(kvm, cap); 3579 } 3580 } 3581 3582 static long kvm_vm_ioctl(struct file *filp, 3583 unsigned int ioctl, unsigned long arg) 3584 { 3585 struct kvm *kvm = filp->private_data; 3586 void __user *argp = (void __user *)arg; 3587 int r; 3588 3589 if (kvm->mm != current->mm) 3590 return -EIO; 3591 switch (ioctl) { 3592 case KVM_CREATE_VCPU: 3593 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 3594 break; 3595 case KVM_ENABLE_CAP: { 3596 struct kvm_enable_cap cap; 3597 3598 r = -EFAULT; 3599 if (copy_from_user(&cap, argp, sizeof(cap))) 3600 goto out; 3601 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap); 3602 break; 3603 } 3604 case KVM_SET_USER_MEMORY_REGION: { 3605 struct kvm_userspace_memory_region kvm_userspace_mem; 3606 3607 r = -EFAULT; 3608 if (copy_from_user(&kvm_userspace_mem, argp, 3609 sizeof(kvm_userspace_mem))) 3610 goto out; 3611 3612 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); 3613 break; 3614 } 3615 case KVM_GET_DIRTY_LOG: { 3616 struct kvm_dirty_log log; 3617 3618 r = -EFAULT; 3619 if (copy_from_user(&log, argp, sizeof(log))) 3620 goto out; 3621 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 3622 break; 3623 } 3624 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 3625 case KVM_CLEAR_DIRTY_LOG: { 3626 struct kvm_clear_dirty_log log; 3627 3628 r = -EFAULT; 3629 if (copy_from_user(&log, argp, sizeof(log))) 3630 goto out; 3631 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log); 3632 break; 3633 } 3634 #endif 3635 #ifdef CONFIG_KVM_MMIO 3636 case KVM_REGISTER_COALESCED_MMIO: { 3637 struct kvm_coalesced_mmio_zone zone; 3638 3639 r = -EFAULT; 3640 if (copy_from_user(&zone, argp, sizeof(zone))) 3641 goto out; 3642 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 3643 break; 3644 } 3645 case KVM_UNREGISTER_COALESCED_MMIO: { 3646 struct kvm_coalesced_mmio_zone zone; 3647 3648 r = -EFAULT; 3649 if (copy_from_user(&zone, argp, sizeof(zone))) 3650 goto out; 3651 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 3652 break; 3653 } 3654 #endif 3655 case KVM_IRQFD: { 3656 struct kvm_irqfd data; 3657 3658 r = -EFAULT; 3659 if (copy_from_user(&data, argp, sizeof(data))) 3660 goto out; 3661 r = kvm_irqfd(kvm, &data); 3662 break; 3663 } 3664 case KVM_IOEVENTFD: { 3665 struct kvm_ioeventfd data; 3666 3667 r = -EFAULT; 3668 if (copy_from_user(&data, argp, sizeof(data))) 3669 goto out; 3670 r = kvm_ioeventfd(kvm, &data); 3671 break; 3672 } 3673 #ifdef CONFIG_HAVE_KVM_MSI 3674 case KVM_SIGNAL_MSI: { 3675 struct kvm_msi msi; 3676 3677 r = -EFAULT; 3678 if (copy_from_user(&msi, argp, sizeof(msi))) 3679 goto out; 3680 r = kvm_send_userspace_msi(kvm, &msi); 3681 break; 3682 } 3683 #endif 3684 #ifdef __KVM_HAVE_IRQ_LINE 3685 case KVM_IRQ_LINE_STATUS: 3686 case KVM_IRQ_LINE: { 3687 struct kvm_irq_level irq_event; 3688 3689 r = -EFAULT; 3690 if (copy_from_user(&irq_event, argp, sizeof(irq_event))) 3691 goto 
out; 3692 3693 r = kvm_vm_ioctl_irq_line(kvm, &irq_event, 3694 ioctl == KVM_IRQ_LINE_STATUS); 3695 if (r) 3696 goto out; 3697 3698 r = -EFAULT; 3699 if (ioctl == KVM_IRQ_LINE_STATUS) { 3700 if (copy_to_user(argp, &irq_event, sizeof(irq_event))) 3701 goto out; 3702 } 3703 3704 r = 0; 3705 break; 3706 } 3707 #endif 3708 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 3709 case KVM_SET_GSI_ROUTING: { 3710 struct kvm_irq_routing routing; 3711 struct kvm_irq_routing __user *urouting; 3712 struct kvm_irq_routing_entry *entries = NULL; 3713 3714 r = -EFAULT; 3715 if (copy_from_user(&routing, argp, sizeof(routing))) 3716 goto out; 3717 r = -EINVAL; 3718 if (!kvm_arch_can_set_irq_routing(kvm)) 3719 goto out; 3720 if (routing.nr > KVM_MAX_IRQ_ROUTES) 3721 goto out; 3722 if (routing.flags) 3723 goto out; 3724 if (routing.nr) { 3725 r = -ENOMEM; 3726 entries = vmalloc(array_size(sizeof(*entries), 3727 routing.nr)); 3728 if (!entries) 3729 goto out; 3730 r = -EFAULT; 3731 urouting = argp; 3732 if (copy_from_user(entries, urouting->entries, 3733 routing.nr * sizeof(*entries))) 3734 goto out_free_irq_routing; 3735 } 3736 r = kvm_set_irq_routing(kvm, entries, routing.nr, 3737 routing.flags); 3738 out_free_irq_routing: 3739 vfree(entries); 3740 break; 3741 } 3742 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ 3743 case KVM_CREATE_DEVICE: { 3744 struct kvm_create_device cd; 3745 3746 r = -EFAULT; 3747 if (copy_from_user(&cd, argp, sizeof(cd))) 3748 goto out; 3749 3750 r = kvm_ioctl_create_device(kvm, &cd); 3751 if (r) 3752 goto out; 3753 3754 r = -EFAULT; 3755 if (copy_to_user(argp, &cd, sizeof(cd))) 3756 goto out; 3757 3758 r = 0; 3759 break; 3760 } 3761 case KVM_CHECK_EXTENSION: 3762 r = kvm_vm_ioctl_check_extension_generic(kvm, arg); 3763 break; 3764 default: 3765 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 3766 } 3767 out: 3768 return r; 3769 } 3770 3771 #ifdef CONFIG_KVM_COMPAT 3772 struct compat_kvm_dirty_log { 3773 __u32 slot; 3774 __u32 padding1; 3775 union { 3776 compat_uptr_t dirty_bitmap; /* one bit per page */ 3777 __u64 padding2; 3778 }; 3779 }; 3780 3781 static long kvm_vm_compat_ioctl(struct file *filp, 3782 unsigned int ioctl, unsigned long arg) 3783 { 3784 struct kvm *kvm = filp->private_data; 3785 int r; 3786 3787 if (kvm->mm != current->mm) 3788 return -EIO; 3789 switch (ioctl) { 3790 case KVM_GET_DIRTY_LOG: { 3791 struct compat_kvm_dirty_log compat_log; 3792 struct kvm_dirty_log log; 3793 3794 if (copy_from_user(&compat_log, (void __user *)arg, 3795 sizeof(compat_log))) 3796 return -EFAULT; 3797 log.slot = compat_log.slot; 3798 log.padding1 = compat_log.padding1; 3799 log.padding2 = compat_log.padding2; 3800 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); 3801 3802 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 3803 break; 3804 } 3805 default: 3806 r = kvm_vm_ioctl(filp, ioctl, arg); 3807 } 3808 return r; 3809 } 3810 #endif 3811 3812 static struct file_operations kvm_vm_fops = { 3813 .release = kvm_vm_release, 3814 .unlocked_ioctl = kvm_vm_ioctl, 3815 .llseek = noop_llseek, 3816 KVM_COMPAT(kvm_vm_compat_ioctl), 3817 }; 3818 3819 static int kvm_dev_ioctl_create_vm(unsigned long type) 3820 { 3821 int r; 3822 struct kvm *kvm; 3823 struct file *file; 3824 3825 kvm = kvm_create_vm(type); 3826 if (IS_ERR(kvm)) 3827 return PTR_ERR(kvm); 3828 #ifdef CONFIG_KVM_MMIO 3829 r = kvm_coalesced_mmio_init(kvm); 3830 if (r < 0) 3831 goto put_kvm; 3832 #endif 3833 r = get_unused_fd_flags(O_CLOEXEC); 3834 if (r < 0) 3835 goto put_kvm; 3836 3837 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); 3838 if 
(IS_ERR(file)) { 3839 put_unused_fd(r); 3840 r = PTR_ERR(file); 3841 goto put_kvm; 3842 } 3843 3844 /* 3845 * Don't call kvm_put_kvm anymore at this point; file->f_op is 3846 * already set, with ->release() being kvm_vm_release(). In error 3847 * cases it will be called by the final fput(file) and will take 3848 * care of doing kvm_put_kvm(kvm). 3849 */ 3850 if (kvm_create_vm_debugfs(kvm, r) < 0) { 3851 put_unused_fd(r); 3852 fput(file); 3853 return -ENOMEM; 3854 } 3855 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm); 3856 3857 fd_install(r, file); 3858 return r; 3859 3860 put_kvm: 3861 kvm_put_kvm(kvm); 3862 return r; 3863 } 3864 3865 static long kvm_dev_ioctl(struct file *filp, 3866 unsigned int ioctl, unsigned long arg) 3867 { 3868 long r = -EINVAL; 3869 3870 switch (ioctl) { 3871 case KVM_GET_API_VERSION: 3872 if (arg) 3873 goto out; 3874 r = KVM_API_VERSION; 3875 break; 3876 case KVM_CREATE_VM: 3877 r = kvm_dev_ioctl_create_vm(arg); 3878 break; 3879 case KVM_CHECK_EXTENSION: 3880 r = kvm_vm_ioctl_check_extension_generic(NULL, arg); 3881 break; 3882 case KVM_GET_VCPU_MMAP_SIZE: 3883 if (arg) 3884 goto out; 3885 r = PAGE_SIZE; /* struct kvm_run */ 3886 #ifdef CONFIG_X86 3887 r += PAGE_SIZE; /* pio data page */ 3888 #endif 3889 #ifdef CONFIG_KVM_MMIO 3890 r += PAGE_SIZE; /* coalesced mmio ring page */ 3891 #endif 3892 break; 3893 case KVM_TRACE_ENABLE: 3894 case KVM_TRACE_PAUSE: 3895 case KVM_TRACE_DISABLE: 3896 r = -EOPNOTSUPP; 3897 break; 3898 default: 3899 return kvm_arch_dev_ioctl(filp, ioctl, arg); 3900 } 3901 out: 3902 return r; 3903 } 3904 3905 static struct file_operations kvm_chardev_ops = { 3906 .unlocked_ioctl = kvm_dev_ioctl, 3907 .llseek = noop_llseek, 3908 KVM_COMPAT(kvm_dev_ioctl), 3909 }; 3910 3911 static struct miscdevice kvm_dev = { 3912 KVM_MINOR, 3913 "kvm", 3914 &kvm_chardev_ops, 3915 }; 3916 3917 static void hardware_enable_nolock(void *junk) 3918 { 3919 int cpu = raw_smp_processor_id(); 3920 int r; 3921 3922 if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) 3923 return; 3924 3925 cpumask_set_cpu(cpu, cpus_hardware_enabled); 3926 3927 r = kvm_arch_hardware_enable(); 3928 3929 if (r) { 3930 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 3931 atomic_inc(&hardware_enable_failed); 3932 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu); 3933 } 3934 } 3935 3936 static int kvm_starting_cpu(unsigned int cpu) 3937 { 3938 raw_spin_lock(&kvm_count_lock); 3939 if (kvm_usage_count) 3940 hardware_enable_nolock(NULL); 3941 raw_spin_unlock(&kvm_count_lock); 3942 return 0; 3943 } 3944 3945 static void hardware_disable_nolock(void *junk) 3946 { 3947 int cpu = raw_smp_processor_id(); 3948 3949 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) 3950 return; 3951 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 3952 kvm_arch_hardware_disable(); 3953 } 3954 3955 static int kvm_dying_cpu(unsigned int cpu) 3956 { 3957 raw_spin_lock(&kvm_count_lock); 3958 if (kvm_usage_count) 3959 hardware_disable_nolock(NULL); 3960 raw_spin_unlock(&kvm_count_lock); 3961 return 0; 3962 } 3963 3964 static void hardware_disable_all_nolock(void) 3965 { 3966 BUG_ON(!kvm_usage_count); 3967 3968 kvm_usage_count--; 3969 if (!kvm_usage_count) 3970 on_each_cpu(hardware_disable_nolock, NULL, 1); 3971 } 3972 3973 static void hardware_disable_all(void) 3974 { 3975 raw_spin_lock(&kvm_count_lock); 3976 hardware_disable_all_nolock(); 3977 raw_spin_unlock(&kvm_count_lock); 3978 } 3979 3980 static int hardware_enable_all(void) 3981 { 3982 int r = 0; 3983 3984 raw_spin_lock(&kvm_count_lock); 3985 3986 
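	/* The first user enables virtualization on every online CPU; if any CPU
	 * fails, the count is rolled back and -EBUSY is returned below. */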
kvm_usage_count++; 3987 if (kvm_usage_count == 1) { 3988 atomic_set(&hardware_enable_failed, 0); 3989 on_each_cpu(hardware_enable_nolock, NULL, 1); 3990 3991 if (atomic_read(&hardware_enable_failed)) { 3992 hardware_disable_all_nolock(); 3993 r = -EBUSY; 3994 } 3995 } 3996 3997 raw_spin_unlock(&kvm_count_lock); 3998 3999 return r; 4000 } 4001 4002 static int kvm_reboot(struct notifier_block *notifier, unsigned long val, 4003 void *v) 4004 { 4005 /* 4006 * Some (well, at least mine) BIOSes hang on reboot if 4007 * in vmx root mode. 4008 * 4009 * And Intel TXT required VMX off for all cpu when system shutdown. 4010 */ 4011 pr_info("kvm: exiting hardware virtualization\n"); 4012 kvm_rebooting = true; 4013 on_each_cpu(hardware_disable_nolock, NULL, 1); 4014 return NOTIFY_OK; 4015 } 4016 4017 static struct notifier_block kvm_reboot_notifier = { 4018 .notifier_call = kvm_reboot, 4019 .priority = 0, 4020 }; 4021 4022 static void kvm_io_bus_destroy(struct kvm_io_bus *bus) 4023 { 4024 int i; 4025 4026 for (i = 0; i < bus->dev_count; i++) { 4027 struct kvm_io_device *pos = bus->range[i].dev; 4028 4029 kvm_iodevice_destructor(pos); 4030 } 4031 kfree(bus); 4032 } 4033 4034 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1, 4035 const struct kvm_io_range *r2) 4036 { 4037 gpa_t addr1 = r1->addr; 4038 gpa_t addr2 = r2->addr; 4039 4040 if (addr1 < addr2) 4041 return -1; 4042 4043 /* If r2->len == 0, match the exact address. If r2->len != 0, 4044 * accept any overlapping write. Any order is acceptable for 4045 * overlapping ranges, because kvm_io_bus_get_first_dev ensures 4046 * we process all of them. 4047 */ 4048 if (r2->len) { 4049 addr1 += r1->len; 4050 addr2 += r2->len; 4051 } 4052 4053 if (addr1 > addr2) 4054 return 1; 4055 4056 return 0; 4057 } 4058 4059 static int kvm_io_bus_sort_cmp(const void *p1, const void *p2) 4060 { 4061 return kvm_io_bus_cmp(p1, p2); 4062 } 4063 4064 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, 4065 gpa_t addr, int len) 4066 { 4067 struct kvm_io_range *range, key; 4068 int off; 4069 4070 key = (struct kvm_io_range) { 4071 .addr = addr, 4072 .len = len, 4073 }; 4074 4075 range = bsearch(&key, bus->range, bus->dev_count, 4076 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp); 4077 if (range == NULL) 4078 return -ENOENT; 4079 4080 off = range - bus->range; 4081 4082 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0) 4083 off--; 4084 4085 return off; 4086 } 4087 4088 static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 4089 struct kvm_io_range *range, const void *val) 4090 { 4091 int idx; 4092 4093 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 4094 if (idx < 0) 4095 return -EOPNOTSUPP; 4096 4097 while (idx < bus->dev_count && 4098 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 4099 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr, 4100 range->len, val)) 4101 return idx; 4102 idx++; 4103 } 4104 4105 return -EOPNOTSUPP; 4106 } 4107 4108 /* kvm_io_bus_write - called under kvm->slots_lock */ 4109 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 4110 int len, const void *val) 4111 { 4112 struct kvm_io_bus *bus; 4113 struct kvm_io_range range; 4114 int r; 4115 4116 range = (struct kvm_io_range) { 4117 .addr = addr, 4118 .len = len, 4119 }; 4120 4121 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 4122 if (!bus) 4123 return -ENOMEM; 4124 r = __kvm_io_bus_write(vcpu, bus, &range, val); 4125 return r < 0 ? 
r : 0; 4126 } 4127 EXPORT_SYMBOL_GPL(kvm_io_bus_write); 4128 4129 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */ 4130 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, 4131 gpa_t addr, int len, const void *val, long cookie) 4132 { 4133 struct kvm_io_bus *bus; 4134 struct kvm_io_range range; 4135 4136 range = (struct kvm_io_range) { 4137 .addr = addr, 4138 .len = len, 4139 }; 4140 4141 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 4142 if (!bus) 4143 return -ENOMEM; 4144 4145 /* First try the device referenced by cookie. */ 4146 if ((cookie >= 0) && (cookie < bus->dev_count) && 4147 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0)) 4148 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len, 4149 val)) 4150 return cookie; 4151 4152 /* 4153 * cookie contained garbage; fall back to search and return the 4154 * correct cookie value. 4155 */ 4156 return __kvm_io_bus_write(vcpu, bus, &range, val); 4157 } 4158 4159 static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 4160 struct kvm_io_range *range, void *val) 4161 { 4162 int idx; 4163 4164 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 4165 if (idx < 0) 4166 return -EOPNOTSUPP; 4167 4168 while (idx < bus->dev_count && 4169 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 4170 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr, 4171 range->len, val)) 4172 return idx; 4173 idx++; 4174 } 4175 4176 return -EOPNOTSUPP; 4177 } 4178 4179 /* kvm_io_bus_read - called under kvm->slots_lock */ 4180 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 4181 int len, void *val) 4182 { 4183 struct kvm_io_bus *bus; 4184 struct kvm_io_range range; 4185 int r; 4186 4187 range = (struct kvm_io_range) { 4188 .addr = addr, 4189 .len = len, 4190 }; 4191 4192 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 4193 if (!bus) 4194 return -ENOMEM; 4195 r = __kvm_io_bus_read(vcpu, bus, &range, val); 4196 return r < 0 ? r : 0; 4197 } 4198 4199 /* Caller must hold slots_lock. */ 4200 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 4201 int len, struct kvm_io_device *dev) 4202 { 4203 int i; 4204 struct kvm_io_bus *new_bus, *bus; 4205 struct kvm_io_range range; 4206 4207 bus = kvm_get_bus(kvm, bus_idx); 4208 if (!bus) 4209 return -ENOMEM; 4210 4211 /* exclude ioeventfd which is limited by maximum fd */ 4212 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) 4213 return -ENOSPC; 4214 4215 new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1), 4216 GFP_KERNEL_ACCOUNT); 4217 if (!new_bus) 4218 return -ENOMEM; 4219 4220 range = (struct kvm_io_range) { 4221 .addr = addr, 4222 .len = len, 4223 .dev = dev, 4224 }; 4225 4226 for (i = 0; i < bus->dev_count; i++) 4227 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0) 4228 break; 4229 4230 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); 4231 new_bus->dev_count++; 4232 new_bus->range[i] = range; 4233 memcpy(new_bus->range + i + 1, bus->range + i, 4234 (bus->dev_count - i) * sizeof(struct kvm_io_range)); 4235 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 4236 synchronize_srcu_expedited(&kvm->srcu); 4237 kfree(bus); 4238 4239 return 0; 4240 } 4241 4242 /* Caller must hold slots_lock. 
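 * If allocating the shrunken copy of the bus fails, the whole bus is
 * dropped (replaced with NULL) so that readers can never see the
 * device that is being removed.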
*/ 4243 void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, 4244 struct kvm_io_device *dev) 4245 { 4246 int i; 4247 struct kvm_io_bus *new_bus, *bus; 4248 4249 bus = kvm_get_bus(kvm, bus_idx); 4250 if (!bus) 4251 return; 4252 4253 for (i = 0; i < bus->dev_count; i++) 4254 if (bus->range[i].dev == dev) { 4255 break; 4256 } 4257 4258 if (i == bus->dev_count) 4259 return; 4260 4261 new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1), 4262 GFP_KERNEL_ACCOUNT); 4263 if (!new_bus) { 4264 pr_err("kvm: failed to shrink bus, removing it completely\n"); 4265 goto broken; 4266 } 4267 4268 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); 4269 new_bus->dev_count--; 4270 memcpy(new_bus->range + i, bus->range + i + 1, 4271 (new_bus->dev_count - i) * sizeof(struct kvm_io_range)); 4272 4273 broken: 4274 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 4275 synchronize_srcu_expedited(&kvm->srcu); 4276 kfree(bus); 4277 return; 4278 } 4279 4280 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, 4281 gpa_t addr) 4282 { 4283 struct kvm_io_bus *bus; 4284 int dev_idx, srcu_idx; 4285 struct kvm_io_device *iodev = NULL; 4286 4287 srcu_idx = srcu_read_lock(&kvm->srcu); 4288 4289 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); 4290 if (!bus) 4291 goto out_unlock; 4292 4293 dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1); 4294 if (dev_idx < 0) 4295 goto out_unlock; 4296 4297 iodev = bus->range[dev_idx].dev; 4298 4299 out_unlock: 4300 srcu_read_unlock(&kvm->srcu, srcu_idx); 4301 4302 return iodev; 4303 } 4304 EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev); 4305 4306 static int kvm_debugfs_open(struct inode *inode, struct file *file, 4307 int (*get)(void *, u64 *), int (*set)(void *, u64), 4308 const char *fmt) 4309 { 4310 struct kvm_stat_data *stat_data = (struct kvm_stat_data *) 4311 inode->i_private; 4312 4313 /* The debugfs files are a reference to the kvm struct which 4314 * is still valid when kvm_destroy_vm is called. 4315 * To avoid the race between open and the removal of the debugfs 4316 * directory we test against the users count. 4317 */ 4318 if (!refcount_inc_not_zero(&stat_data->kvm->users_count)) 4319 return -ENOENT; 4320 4321 if (simple_attr_open(inode, file, get, 4322 KVM_DBGFS_GET_MODE(stat_data->dbgfs_item) & 0222 4323 ? 
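				/* any write bit set: expose the clear handler */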
set : NULL, 4324 fmt)) { 4325 kvm_put_kvm(stat_data->kvm); 4326 return -ENOMEM; 4327 } 4328 4329 return 0; 4330 } 4331 4332 static int kvm_debugfs_release(struct inode *inode, struct file *file) 4333 { 4334 struct kvm_stat_data *stat_data = (struct kvm_stat_data *) 4335 inode->i_private; 4336 4337 simple_attr_release(inode, file); 4338 kvm_put_kvm(stat_data->kvm); 4339 4340 return 0; 4341 } 4342 4343 static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val) 4344 { 4345 *val = *(ulong *)((void *)kvm + offset); 4346 4347 return 0; 4348 } 4349 4350 static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset) 4351 { 4352 *(ulong *)((void *)kvm + offset) = 0; 4353 4354 return 0; 4355 } 4356 4357 static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val) 4358 { 4359 int i; 4360 struct kvm_vcpu *vcpu; 4361 4362 *val = 0; 4363 4364 kvm_for_each_vcpu(i, vcpu, kvm) 4365 *val += *(u64 *)((void *)vcpu + offset); 4366 4367 return 0; 4368 } 4369 4370 static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset) 4371 { 4372 int i; 4373 struct kvm_vcpu *vcpu; 4374 4375 kvm_for_each_vcpu(i, vcpu, kvm) 4376 *(u64 *)((void *)vcpu + offset) = 0; 4377 4378 return 0; 4379 } 4380 4381 static int kvm_stat_data_get(void *data, u64 *val) 4382 { 4383 int r = -EFAULT; 4384 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 4385 4386 switch (stat_data->dbgfs_item->kind) { 4387 case KVM_STAT_VM: 4388 r = kvm_get_stat_per_vm(stat_data->kvm, 4389 stat_data->dbgfs_item->offset, val); 4390 break; 4391 case KVM_STAT_VCPU: 4392 r = kvm_get_stat_per_vcpu(stat_data->kvm, 4393 stat_data->dbgfs_item->offset, val); 4394 break; 4395 } 4396 4397 return r; 4398 } 4399 4400 static int kvm_stat_data_clear(void *data, u64 val) 4401 { 4402 int r = -EFAULT; 4403 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 4404 4405 if (val) 4406 return -EINVAL; 4407 4408 switch (stat_data->dbgfs_item->kind) { 4409 case KVM_STAT_VM: 4410 r = kvm_clear_stat_per_vm(stat_data->kvm, 4411 stat_data->dbgfs_item->offset); 4412 break; 4413 case KVM_STAT_VCPU: 4414 r = kvm_clear_stat_per_vcpu(stat_data->kvm, 4415 stat_data->dbgfs_item->offset); 4416 break; 4417 } 4418 4419 return r; 4420 } 4421 4422 static int kvm_stat_data_open(struct inode *inode, struct file *file) 4423 { 4424 __simple_attr_check_format("%llu\n", 0ull); 4425 return kvm_debugfs_open(inode, file, kvm_stat_data_get, 4426 kvm_stat_data_clear, "%llu\n"); 4427 } 4428 4429 static const struct file_operations stat_fops_per_vm = { 4430 .owner = THIS_MODULE, 4431 .open = kvm_stat_data_open, 4432 .release = kvm_debugfs_release, 4433 .read = simple_attr_read, 4434 .write = simple_attr_write, 4435 .llseek = no_llseek, 4436 }; 4437 4438 static int vm_stat_get(void *_offset, u64 *val) 4439 { 4440 unsigned offset = (long)_offset; 4441 struct kvm *kvm; 4442 u64 tmp_val; 4443 4444 *val = 0; 4445 mutex_lock(&kvm_lock); 4446 list_for_each_entry(kvm, &vm_list, vm_list) { 4447 kvm_get_stat_per_vm(kvm, offset, &tmp_val); 4448 *val += tmp_val; 4449 } 4450 mutex_unlock(&kvm_lock); 4451 return 0; 4452 } 4453 4454 static int vm_stat_clear(void *_offset, u64 val) 4455 { 4456 unsigned offset = (long)_offset; 4457 struct kvm *kvm; 4458 4459 if (val) 4460 return -EINVAL; 4461 4462 mutex_lock(&kvm_lock); 4463 list_for_each_entry(kvm, &vm_list, vm_list) { 4464 kvm_clear_stat_per_vm(kvm, offset); 4465 } 4466 mutex_unlock(&kvm_lock); 4467 4468 return 0; 4469 } 4470 4471 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n"); 4472 4473 
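/*
 * For reference, DEFINE_SIMPLE_ATTRIBUTE() above expands to roughly the
 * following (a sketch, not the literal macro output): an open helper that
 * validates the format string and hands the get/clear callbacks to
 * simple_attr_open(), plus a file_operations much like stat_fops_per_vm:
 *
 *	static int vm_stat_fops_open(struct inode *inode, struct file *file)
 *	{
 *		__simple_attr_check_format("%llu\n", 0ull);
 *		return simple_attr_open(inode, file, vm_stat_get,
 *					vm_stat_clear, "%llu\n");
 *	}
 *
 *	static const struct file_operations vm_stat_fops = {
 *		.owner	 = THIS_MODULE,
 *		.open	 = vm_stat_fops_open,
 *		.release = simple_attr_release,
 *		.read	 = simple_attr_read,
 *		.write	 = simple_attr_write,
 *		.llseek	 = generic_file_llseek,
 *	};
 *
 * The vcpu_stat_* variants below follow the same pattern, but sum the
 * per-vCPU counters of every vCPU in every VM instead of the per-VM ones.
 */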
static int vcpu_stat_get(void *_offset, u64 *val) 4474 { 4475 unsigned offset = (long)_offset; 4476 struct kvm *kvm; 4477 u64 tmp_val; 4478 4479 *val = 0; 4480 mutex_lock(&kvm_lock); 4481 list_for_each_entry(kvm, &vm_list, vm_list) { 4482 kvm_get_stat_per_vcpu(kvm, offset, &tmp_val); 4483 *val += tmp_val; 4484 } 4485 mutex_unlock(&kvm_lock); 4486 return 0; 4487 } 4488 4489 static int vcpu_stat_clear(void *_offset, u64 val) 4490 { 4491 unsigned offset = (long)_offset; 4492 struct kvm *kvm; 4493 4494 if (val) 4495 return -EINVAL; 4496 4497 mutex_lock(&kvm_lock); 4498 list_for_each_entry(kvm, &vm_list, vm_list) { 4499 kvm_clear_stat_per_vcpu(kvm, offset); 4500 } 4501 mutex_unlock(&kvm_lock); 4502 4503 return 0; 4504 } 4505 4506 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear, 4507 "%llu\n"); 4508 4509 static const struct file_operations *stat_fops[] = { 4510 [KVM_STAT_VCPU] = &vcpu_stat_fops, 4511 [KVM_STAT_VM] = &vm_stat_fops, 4512 }; 4513 4514 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) 4515 { 4516 struct kobj_uevent_env *env; 4517 unsigned long long created, active; 4518 4519 if (!kvm_dev.this_device || !kvm) 4520 return; 4521 4522 mutex_lock(&kvm_lock); 4523 if (type == KVM_EVENT_CREATE_VM) { 4524 kvm_createvm_count++; 4525 kvm_active_vms++; 4526 } else if (type == KVM_EVENT_DESTROY_VM) { 4527 kvm_active_vms--; 4528 } 4529 created = kvm_createvm_count; 4530 active = kvm_active_vms; 4531 mutex_unlock(&kvm_lock); 4532 4533 env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT); 4534 if (!env) 4535 return; 4536 4537 add_uevent_var(env, "CREATED=%llu", created); 4538 add_uevent_var(env, "COUNT=%llu", active); 4539 4540 if (type == KVM_EVENT_CREATE_VM) { 4541 add_uevent_var(env, "EVENT=create"); 4542 kvm->userspace_pid = task_pid_nr(current); 4543 } else if (type == KVM_EVENT_DESTROY_VM) { 4544 add_uevent_var(env, "EVENT=destroy"); 4545 } 4546 add_uevent_var(env, "PID=%d", kvm->userspace_pid); 4547 4548 if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) { 4549 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT); 4550 4551 if (p) { 4552 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX); 4553 if (!IS_ERR(tmp)) 4554 add_uevent_var(env, "STATS_PATH=%s", tmp); 4555 kfree(p); 4556 } 4557 } 4558 /* no need for checks, since we are adding at most only 5 keys */ 4559 env->envp[env->envp_idx++] = NULL; 4560 kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp); 4561 kfree(env); 4562 } 4563 4564 static void kvm_init_debug(void) 4565 { 4566 struct kvm_stats_debugfs_item *p; 4567 4568 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); 4569 4570 kvm_debugfs_num_entries = 0; 4571 for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) { 4572 debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p), 4573 kvm_debugfs_dir, (void *)(long)p->offset, 4574 stat_fops[p->kind]); 4575 } 4576 } 4577 4578 static int kvm_suspend(void) 4579 { 4580 if (kvm_usage_count) 4581 hardware_disable_nolock(NULL); 4582 return 0; 4583 } 4584 4585 static void kvm_resume(void) 4586 { 4587 if (kvm_usage_count) { 4588 #ifdef CONFIG_LOCKDEP 4589 WARN_ON(lockdep_is_held(&kvm_count_lock)); 4590 #endif 4591 hardware_enable_nolock(NULL); 4592 } 4593 } 4594 4595 static struct syscore_ops kvm_syscore_ops = { 4596 .suspend = kvm_suspend, 4597 .resume = kvm_resume, 4598 }; 4599 4600 static inline 4601 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) 4602 { 4603 return container_of(pn, struct kvm_vcpu, preempt_notifier); 4604 } 4605 4606 static void 
kvm_sched_in(struct preempt_notifier *pn, int cpu) 4607 { 4608 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 4609 4610 WRITE_ONCE(vcpu->preempted, false); 4611 WRITE_ONCE(vcpu->ready, false); 4612 4613 __this_cpu_write(kvm_running_vcpu, vcpu); 4614 kvm_arch_sched_in(vcpu, cpu); 4615 kvm_arch_vcpu_load(vcpu, cpu); 4616 } 4617 4618 static void kvm_sched_out(struct preempt_notifier *pn, 4619 struct task_struct *next) 4620 { 4621 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 4622 4623 if (current->state == TASK_RUNNING) { 4624 WRITE_ONCE(vcpu->preempted, true); 4625 WRITE_ONCE(vcpu->ready, true); 4626 } 4627 kvm_arch_vcpu_put(vcpu); 4628 __this_cpu_write(kvm_running_vcpu, NULL); 4629 } 4630 4631 /** 4632 * kvm_get_running_vcpu - get the vcpu running on the current CPU. 4633 * 4634 * We can disable preemption locally around accessing the per-CPU variable, 4635 * and use the resolved vcpu pointer after enabling preemption again, 4636 * because even if the current thread is migrated to another CPU, reading 4637 * the per-CPU value later will give us the same value as we update the 4638 * per-CPU variable in the preempt notifier handlers. 4639 */ 4640 struct kvm_vcpu *kvm_get_running_vcpu(void) 4641 { 4642 struct kvm_vcpu *vcpu; 4643 4644 preempt_disable(); 4645 vcpu = __this_cpu_read(kvm_running_vcpu); 4646 preempt_enable(); 4647 4648 return vcpu; 4649 } 4650 4651 /** 4652 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus. 4653 */ 4654 struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void) 4655 { 4656 return &kvm_running_vcpu; 4657 } 4658 4659 struct kvm_cpu_compat_check { 4660 void *opaque; 4661 int *ret; 4662 }; 4663 4664 static void check_processor_compat(void *data) 4665 { 4666 struct kvm_cpu_compat_check *c = data; 4667 4668 *c->ret = kvm_arch_check_processor_compat(c->opaque); 4669 } 4670 4671 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, 4672 struct module *module) 4673 { 4674 struct kvm_cpu_compat_check c; 4675 int r; 4676 int cpu; 4677 4678 r = kvm_arch_init(opaque); 4679 if (r) 4680 goto out_fail; 4681 4682 /* 4683 * kvm_arch_init makes sure there's at most one caller 4684 * for architectures that support multiple implementations, 4685 * like intel and amd on x86. 4686 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating 4687 * conflicts in case kvm is already setup for another implementation. 4688 */ 4689 r = kvm_irqfd_init(); 4690 if (r) 4691 goto out_irqfd; 4692 4693 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { 4694 r = -ENOMEM; 4695 goto out_free_0; 4696 } 4697 4698 r = kvm_arch_hardware_setup(opaque); 4699 if (r < 0) 4700 goto out_free_1; 4701 4702 c.ret = &r; 4703 c.opaque = opaque; 4704 for_each_online_cpu(cpu) { 4705 smp_call_function_single(cpu, check_processor_compat, &c, 1); 4706 if (r < 0) 4707 goto out_free_2; 4708 } 4709 4710 r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting", 4711 kvm_starting_cpu, kvm_dying_cpu); 4712 if (r) 4713 goto out_free_2; 4714 register_reboot_notifier(&kvm_reboot_notifier); 4715 4716 /* A kmem cache lets us meet the alignment requirements of fx_save. 
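 * The _usercopy variant of the cache additionally whitelists just the
 * arch-specific region of struct kvm_vcpu for copy_to/from_user(), so
 * hardened usercopy (CONFIG_HARDENED_USERCOPY) rejects copies that stray
 * outside of vcpu->arch.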
*/ 4717 if (!vcpu_align) 4718 vcpu_align = __alignof__(struct kvm_vcpu); 4719 kvm_vcpu_cache = 4720 kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align, 4721 SLAB_ACCOUNT, 4722 offsetof(struct kvm_vcpu, arch), 4723 sizeof_field(struct kvm_vcpu, arch), 4724 NULL); 4725 if (!kvm_vcpu_cache) { 4726 r = -ENOMEM; 4727 goto out_free_3; 4728 } 4729 4730 r = kvm_async_pf_init(); 4731 if (r) 4732 goto out_free; 4733 4734 kvm_chardev_ops.owner = module; 4735 kvm_vm_fops.owner = module; 4736 kvm_vcpu_fops.owner = module; 4737 4738 r = misc_register(&kvm_dev); 4739 if (r) { 4740 pr_err("kvm: misc device register failed\n"); 4741 goto out_unreg; 4742 } 4743 4744 register_syscore_ops(&kvm_syscore_ops); 4745 4746 kvm_preempt_ops.sched_in = kvm_sched_in; 4747 kvm_preempt_ops.sched_out = kvm_sched_out; 4748 4749 kvm_init_debug(); 4750 4751 r = kvm_vfio_ops_init(); 4752 WARN_ON(r); 4753 4754 return 0; 4755 4756 out_unreg: 4757 kvm_async_pf_deinit(); 4758 out_free: 4759 kmem_cache_destroy(kvm_vcpu_cache); 4760 out_free_3: 4761 unregister_reboot_notifier(&kvm_reboot_notifier); 4762 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); 4763 out_free_2: 4764 kvm_arch_hardware_unsetup(); 4765 out_free_1: 4766 free_cpumask_var(cpus_hardware_enabled); 4767 out_free_0: 4768 kvm_irqfd_exit(); 4769 out_irqfd: 4770 kvm_arch_exit(); 4771 out_fail: 4772 return r; 4773 } 4774 EXPORT_SYMBOL_GPL(kvm_init); 4775 4776 void kvm_exit(void) 4777 { 4778 debugfs_remove_recursive(kvm_debugfs_dir); 4779 misc_deregister(&kvm_dev); 4780 kmem_cache_destroy(kvm_vcpu_cache); 4781 kvm_async_pf_deinit(); 4782 unregister_syscore_ops(&kvm_syscore_ops); 4783 unregister_reboot_notifier(&kvm_reboot_notifier); 4784 cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING); 4785 on_each_cpu(hardware_disable_nolock, NULL, 1); 4786 kvm_arch_hardware_unsetup(); 4787 kvm_arch_exit(); 4788 kvm_irqfd_exit(); 4789 free_cpumask_var(cpus_hardware_enabled); 4790 kvm_vfio_ops_exit(); 4791 } 4792 EXPORT_SYMBOL_GPL(kvm_exit); 4793 4794 struct kvm_vm_worker_thread_context { 4795 struct kvm *kvm; 4796 struct task_struct *parent; 4797 struct completion init_done; 4798 kvm_vm_thread_fn_t thread_fn; 4799 uintptr_t data; 4800 int err; 4801 }; 4802 4803 static int kvm_vm_worker_thread(void *context) 4804 { 4805 /* 4806 * The init_context is allocated on the stack of the parent thread, so 4807 * we have to locally copy anything that is needed beyond initialization 4808 */ 4809 struct kvm_vm_worker_thread_context *init_context = context; 4810 struct kvm *kvm = init_context->kvm; 4811 kvm_vm_thread_fn_t thread_fn = init_context->thread_fn; 4812 uintptr_t data = init_context->data; 4813 int err; 4814 4815 err = kthread_park(current); 4816 /* kthread_park(current) is never supposed to return an error */ 4817 WARN_ON(err != 0); 4818 if (err) 4819 goto init_complete; 4820 4821 err = cgroup_attach_task_all(init_context->parent, current); 4822 if (err) { 4823 kvm_err("%s: cgroup_attach_task_all failed with err %d\n", 4824 __func__, err); 4825 goto init_complete; 4826 } 4827 4828 set_user_nice(current, task_nice(init_context->parent)); 4829 4830 init_complete: 4831 init_context->err = err; 4832 complete(&init_context->init_done); 4833 init_context = NULL; 4834 4835 if (err) 4836 return err; 4837 4838 /* Wait to be woken up by the spawner before proceeding. 
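 * kthread_parkme() keeps the worker parked here until the caller of
 * kvm_vm_create_worker_thread() unparks it with kthread_unpark() (or
 * stops it outright with kthread_stop()).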
*/ 4839 kthread_parkme(); 4840 4841 if (!kthread_should_stop()) 4842 err = thread_fn(kvm, data); 4843 4844 return err; 4845 } 4846 4847 int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn, 4848 uintptr_t data, const char *name, 4849 struct task_struct **thread_ptr) 4850 { 4851 struct kvm_vm_worker_thread_context init_context = {}; 4852 struct task_struct *thread; 4853 4854 *thread_ptr = NULL; 4855 init_context.kvm = kvm; 4856 init_context.parent = current; 4857 init_context.thread_fn = thread_fn; 4858 init_context.data = data; 4859 init_completion(&init_context.init_done); 4860 4861 thread = kthread_run(kvm_vm_worker_thread, &init_context, 4862 "%s-%d", name, task_pid_nr(current)); 4863 if (IS_ERR(thread)) 4864 return PTR_ERR(thread); 4865 4866 /* kthread_run is never supposed to return NULL */ 4867 WARN_ON(thread == NULL); 4868 4869 wait_for_completion(&init_context.init_done); 4870 4871 if (!init_context.err) 4872 *thread_ptr = thread; 4873 4874 return init_context.err; 4875 } 4876
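
/*
 * Usage sketch for the helper above (hypothetical caller, not part of this
 * file): a component that wants a per-VM background worker creates it at VM
 * setup, unparks it once initialization is done, and stops it at teardown.
 * The names my_worker_fn, my_thread and "kvm-my-worker" are illustrative
 * assumptions, not existing kernel symbols.
 *
 *	static int my_worker_fn(struct kvm *kvm, uintptr_t data)
 *	{
 *		while (!kthread_should_stop()) {
 *			// periodic per-VM work goes here
 *			schedule_timeout_interruptible(HZ);
 *		}
 *		return 0;
 *	}
 *
 *	// VM setup
 *	struct task_struct *my_thread;
 *	int err = kvm_vm_create_worker_thread(kvm, my_worker_fn, 0,
 *					      "kvm-my-worker", &my_thread);
 *	if (!err)
 *		kthread_unpark(my_thread);	// let the parked worker run
 *
 *	// VM teardown
 *	if (my_thread)
 *		kthread_stop(my_thread);
 */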