// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 */

#include <kvm/iodev.h>

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>
#include <linux/io.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>

#include <asm/processor.h>
#include <asm/ioctl.h>
#include <linux/uaccess.h>

#include "coalesced_mmio.h"
#include "async_pf.h"
#include "vfio.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

/* Worst case buffer size needed for holding an integer. */
#define ITOA_MAX_LEN 12

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/* Architectures should define their poll value according to the halt latency */
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns);

/* Default doubles per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);

/* The start value to grow halt_poll_ns from */
unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);

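/*
 * Illustrative summary (an assumption based on KVM's halt-polling grow/shrink
 * helpers, which are not part of this section): a vCPU's polling window
 * starts at halt_poll_ns_grow_start when it first grows and is multiplied by
 * halt_poll_ns_grow on subsequent misses, e.g. 10us -> 20us -> 40us, capped
 * by halt_poll_ns; a halt_poll_ns_shrink of 0 resets the window to 0 instead
 * of dividing it.
 */
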
/* Default resets per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_shrink;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);

/*
 * Ordering of locks:
 *
 *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

DEFINE_MUTEX(kvm_lock);
static DEFINE_RAW_SPINLOCK(kvm_count_lock);
LIST_HEAD(vm_list);

static cpumask_var_t cpus_hardware_enabled;
static int kvm_usage_count;
static atomic_t hardware_enable_failed;

static struct kmem_cache *kvm_vcpu_cache;

static __read_mostly struct preempt_ops kvm_preempt_ops;
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);

struct dentry *kvm_debugfs_dir;
EXPORT_SYMBOL_GPL(kvm_debugfs_dir);

static int kvm_debugfs_num_entries;
static const struct file_operations stat_fops_per_vm;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
#ifdef CONFIG_KVM_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				  unsigned long arg);
#define KVM_COMPAT(c)	.compat_ioctl	= (c)
#else
/*
 * For architectures that don't implement a compat infrastructure,
 * adopt a double line of defense:
 * - Prevent a compat task from opening /dev/kvm
 * - If the open has been done by a 64bit task, and the KVM fd
 *   passed to a compat task, let the ioctls fail.
 */
static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
				unsigned long arg) { return -EINVAL; }

static int kvm_no_compat_open(struct inode *inode, struct file *file)
{
	return is_compat_task() ? -ENODEV : 0;
}
#define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl,	\
			.open		= kvm_no_compat_open
#endif
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

__visible bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);

#define KVM_EVENT_CREATE_VM 0
#define KVM_EVENT_DESTROY_VM 1
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms;

__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
						   unsigned long start, unsigned long end)
{
}

bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
{
	/*
	 * The metadata used by is_zone_device_page() to determine whether or
	 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
	 * the device has been pinned, e.g. by get_user_pages().  WARN if the
	 * page_count() is zero to help detect bad usage of this helper.
	 */
	if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
		return false;

	return is_zone_device_page(pfn_to_page(pfn));
}

bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
{
	/*
	 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
	 * perspective they are "normal" pages, albeit with slightly different
	 * usage rules.
	 */
	if (pfn_valid(pfn))
		return PageReserved(pfn_to_page(pfn)) &&
		       !is_zero_pfn(pfn) &&
		       !kvm_is_zone_device_pfn(pfn);

	return true;
}

bool kvm_is_transparent_hugepage(kvm_pfn_t pfn)
{
	struct page *page = pfn_to_page(pfn);

	if (!PageTransCompoundMap(page))
		return false;

	return is_transparent_hugepage(compound_head(page));
}

/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu = get_cpu();

	__this_cpu_write(kvm_running_vcpu, vcpu);
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
}
EXPORT_SYMBOL_GPL(vcpu_load);

void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	__this_cpu_write(kvm_running_vcpu, NULL);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(vcpu_put);

/* TODO: merge with kvm_arch_vcpu_should_kick */
static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
{
	int mode = kvm_vcpu_exiting_guest_mode(vcpu);

	/*
	 * We need to wait for the VCPU to reenable interrupts and get out of
	 * READING_SHADOW_PAGE_TABLES mode.
	 */
	if (req & KVM_REQUEST_WAIT)
		return mode != OUTSIDE_GUEST_MODE;

	/*
	 * Need to kick a running VCPU, but otherwise there is nothing to do.
	 */
	return mode == IN_GUEST_MODE;
}

static void ack_flush(void *_completed)
{
}

static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait)
{
	if (unlikely(!cpus))
		cpus = cpu_online_mask;

	if (cpumask_empty(cpus))
		return false;

	smp_call_function_many(cpus, ack_flush, NULL, wait);
	return true;
}

bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
				 struct kvm_vcpu *except,
				 unsigned long *vcpu_bitmap, cpumask_var_t tmp)
{
	int i, cpu, me;
	struct kvm_vcpu *vcpu;
	bool called;

	me = get_cpu();

	kvm_for_each_vcpu(i, vcpu, kvm) {
		if ((vcpu_bitmap && !test_bit(i, vcpu_bitmap)) ||
		    vcpu == except)
			continue;

		kvm_make_request(req, vcpu);
		cpu = vcpu->cpu;

		if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
			continue;

		if (tmp != NULL && cpu != -1 && cpu != me &&
		    kvm_request_needs_ipi(vcpu, req))
			__cpumask_set_cpu(cpu, tmp);
	}

	called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT));
	put_cpu();

	return called;
}

bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
				      struct kvm_vcpu *except)
{
	cpumask_var_t cpus;
	bool called;

	zalloc_cpumask_var(&cpus, GFP_ATOMIC);

	called = kvm_make_vcpus_request_mask(kvm, req, except, NULL, cpus);

	free_cpumask_var(cpus);
	return called;
}

bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
	return kvm_make_all_cpus_request_except(kvm, req, NULL);
}

#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	/*
	 * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in
	 * kvm_make_all_cpus_request.
	 */
	long dirty_count = smp_load_acquire(&kvm->tlbs_dirty);

	/*
	 * We want to publish modifications to the page tables before reading
	 * mode.  Pairs with a memory barrier in arch-specific code.
	 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
	 *   and smp_mb in walk_shadow_page_lockless_begin/end.
	 * - powerpc: smp_mb in kvmppc_prepare_to_enter.
	 *
	 * There is already an smp_mb__after_atomic() before
	 * kvm_make_all_cpus_request() reads vcpu->mode.  We reuse that
	 * barrier here.
	 */
	if (!kvm_arch_flush_remote_tlb(kvm)
	    || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		++kvm->stat.remote_tlb_flush;
	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
}
EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
#endif

void kvm_reload_remote_mmus(struct kvm *kvm)
{
	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
}

#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc,
					       gfp_t gfp_flags)
{
	gfp_flags |= mc->gfp_zero;

	if (mc->kmem_cache)
		return kmem_cache_alloc(mc->kmem_cache, gfp_flags);
	else
		return (void *)__get_free_page(gfp_flags);
}

int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min)
{
	void *obj;

	if (mc->nobjs >= min)
		return 0;
	while (mc->nobjs < ARRAY_SIZE(mc->objects)) {
		obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT);
		if (!obj)
			return mc->nobjs >= min ? 0 : -ENOMEM;
		mc->objects[mc->nobjs++] = obj;
	}
	return 0;
}

int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
{
	return mc->nobjs;
}

void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
{
	while (mc->nobjs) {
		if (mc->kmem_cache)
			kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]);
		else
			free_page((unsigned long)mc->objects[--mc->nobjs]);
	}
}

void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
	void *p;

	if (WARN_ON(!mc->nobjs))
		p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT);
	else
		p = mc->objects[--mc->nobjs];
	BUG_ON(!p);
	return p;
}
#endif

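/*
 * Usage sketch for the cache helpers above (illustrative; the field name is
 * an x86 example and is not defined in this file): arch code tops up a cache
 * while it may still sleep, then pulls objects with the no-fail allocator
 * while holding mmu_lock:
 *
 *	r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, 4);
 *	if (r)
 *		return r;
 *	...
 *	sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
 */
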
static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
	vcpu->pid = NULL;
	rcuwait_init(&vcpu->wait);
	kvm_async_pf_vcpu_init(vcpu);

	vcpu->pre_pcpu = -1;
	INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);

	kvm_vcpu_set_in_spin_loop(vcpu, false);
	kvm_vcpu_set_dy_eligible(vcpu, false);
	vcpu->preempted = false;
	vcpu->ready = false;
	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
}

void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_arch_vcpu_destroy(vcpu);

	/*
	 * No need for rcu_read_lock as VCPU_RUN is the only place that changes
	 * the vcpu->pid pointer, and at destruction time all file descriptors
	 * are already gone.
	 */
	put_pid(rcu_dereference_protected(vcpu->pid, 1));

	free_page((unsigned long)vcpu->run);
	kmem_cache_free(kvm_vcpu_cache, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_destroy);

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
	return container_of(mn, struct kvm, mmu_notifier);
}

static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long start, unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
	srcu_read_unlock(&kvm->srcu, idx);
}

static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long address,
					pte_t pte)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	kvm->mmu_notifier_seq++;

	if (kvm_set_spte_hva(kvm, address, pte))
		kvm_flush_remote_tlbs(kvm);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush = 0, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
	kvm->mmu_notifier_count++;
	need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
					     range->flags);
	need_tlb_flush |= kvm->tlbs_dirty;
	/* we have to flush the tlb before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return 0;
}

static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);

	spin_lock(&kvm->mmu_lock);
	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
	kvm->mmu_notifier_seq++;
	smp_wmb();
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease, which is ensured by the smp_wmb above
	 * in conjunction with the smp_rmb in mmu_notifier_retry().
	 */
	kvm->mmu_notifier_count--;
	spin_unlock(&kvm->mmu_lock);

	BUG_ON(kvm->mmu_notifier_count < 0);
}

static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long start,
					      unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);

	young = kvm_age_hva(kvm, start, end);
	if (young)
		kvm_flush_remote_tlbs(kvm);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return young;
}

static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long start,
					unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	/*
	 * Even though we do not flush TLB, this will still adversely
	 * affect performance on pre-Haswell Intel EPT, where there is
	 * no EPT Access Bit to clear so that we have to tear down EPT
	 * tables instead.  If we find this unacceptable, we can always
	 * add a parameter to kvm_age_hva so that it effectively doesn't
	 * do anything on clear_young.
	 *
	 * Also note that currently we never issue secondary TLB flushes
	 * from clear_young, leaving this job up to the regular system
	 * cadence.  If we find this inaccurate, we might come up with a
	 * more sophisticated heuristic later.
	 */
	young = kvm_age_hva(kvm, start, end);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return young;
}

static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	young = kvm_test_age_hva(kvm, address);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return young;
}

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	kvm_arch_flush_shadow_all(kvm);
	srcu_read_unlock(&kvm->srcu, idx);
}

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_range	= kvm_mmu_notifier_invalidate_range,
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
	.clear_young		= kvm_mmu_notifier_clear_young,
	.test_young		= kvm_mmu_notifier_test_young,
	.change_pte		= kvm_mmu_notifier_change_pte,
	.release		= kvm_mmu_notifier_release,
};

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}

#else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	return 0;
}

#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */

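/*
 * Sketch of how the mmu_notifier_seq/mmu_notifier_count bookkeeping above is
 * consumed (an assumed, typical arch page-fault flow, not code from this
 * file): the fault handler samples the sequence before translating the hva
 * and rechecks it under mmu_lock with mmu_notifier_retry() before installing
 * the spte:
 *
 *	mmu_seq = kvm->mmu_notifier_seq;
 *	smp_rmb();
 *	pfn = gfn_to_pfn(kvm, gfn);
 *	spin_lock(&kvm->mmu_lock);
 *	if (mmu_notifier_retry(kvm, mmu_seq))
 *		goto out_unlock;	/* raced with an invalidation */
 *	... install the mapping ...
 */
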
static struct kvm_memslots *kvm_alloc_memslots(void)
{
	int i;
	struct kvm_memslots *slots;

	slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
	if (!slots)
		return NULL;

	for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
		slots->id_to_index[i] = -1;

	return slots;
}

static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	if (!memslot->dirty_bitmap)
		return;

	kvfree(memslot->dirty_bitmap);
	memslot->dirty_bitmap = NULL;
}

static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	kvm_destroy_dirty_bitmap(slot);

	kvm_arch_free_memslot(kvm, slot);

	slot->flags = 0;
	slot->npages = 0;
}

static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
{
	struct kvm_memory_slot *memslot;

	if (!slots)
		return;

	kvm_for_each_memslot(memslot, slots)
		kvm_free_memslot(kvm, memslot);

	kvfree(slots);
}

static void kvm_destroy_vm_debugfs(struct kvm *kvm)
{
	int i;

	if (!kvm->debugfs_dentry)
		return;

	debugfs_remove_recursive(kvm->debugfs_dentry);

	if (kvm->debugfs_stat_data) {
		for (i = 0; i < kvm_debugfs_num_entries; i++)
			kfree(kvm->debugfs_stat_data[i]);
		kfree(kvm->debugfs_stat_data);
	}
}

static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
{
	char dir_name[ITOA_MAX_LEN * 2];
	struct kvm_stat_data *stat_data;
	struct kvm_stats_debugfs_item *p;

	if (!debugfs_initialized())
		return 0;

	snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
	kvm->debugfs_dentry = debugfs_create_dir(dir_name, kvm_debugfs_dir);

	kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
					 sizeof(*kvm->debugfs_stat_data),
					 GFP_KERNEL_ACCOUNT);
	if (!kvm->debugfs_stat_data)
		return -ENOMEM;

	for (p = debugfs_entries; p->name; p++) {
		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
		if (!stat_data)
			return -ENOMEM;

		stat_data->kvm = kvm;
		stat_data->dbgfs_item = p;
		kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
		debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
				    kvm->debugfs_dentry, stat_data,
				    &stat_fops_per_vm);
	}
	return 0;
}

/*
 * Called after the VM is otherwise initialized, but just before adding it to
 * the vm_list.
 */
int __weak kvm_arch_post_init_vm(struct kvm *kvm)
{
	return 0;
}

/*
 * Called just after removing the VM from the vm_list, but before doing any
 * other destruction.
 */
void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
{
}

static struct kvm *kvm_create_vm(unsigned long type)
{
	struct kvm *kvm = kvm_arch_alloc_vm();
	int r = -ENOMEM;
	int i;

	if (!kvm)
		return ERR_PTR(-ENOMEM);

	spin_lock_init(&kvm->mmu_lock);
	mmgrab(current->mm);
	kvm->mm = current->mm;
	kvm_eventfd_init(kvm);
	mutex_init(&kvm->lock);
	mutex_init(&kvm->irq_lock);
	mutex_init(&kvm->slots_lock);
	INIT_LIST_HEAD(&kvm->devices);

	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);

	if (init_srcu_struct(&kvm->srcu))
		goto out_err_no_srcu;
	if (init_srcu_struct(&kvm->irq_srcu))
		goto out_err_no_irq_srcu;

	refcount_set(&kvm->users_count, 1);
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		struct kvm_memslots *slots = kvm_alloc_memslots();

		if (!slots)
			goto out_err_no_arch_destroy_vm;
		/* Generations must be different for each address space. */
		slots->generation = i;
		rcu_assign_pointer(kvm->memslots[i], slots);
	}

	for (i = 0; i < KVM_NR_BUSES; i++) {
		rcu_assign_pointer(kvm->buses[i],
			kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
		if (!kvm->buses[i])
			goto out_err_no_arch_destroy_vm;
	}

	kvm->max_halt_poll_ns = halt_poll_ns;

	r = kvm_arch_init_vm(kvm, type);
	if (r)
		goto out_err_no_arch_destroy_vm;

	r = hardware_enable_all();
	if (r)
		goto out_err_no_disable;

#ifdef CONFIG_HAVE_KVM_IRQFD
	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif

	r = kvm_init_mmu_notifier(kvm);
	if (r)
		goto out_err_no_mmu_notifier;

	r = kvm_arch_post_init_vm(kvm);
	if (r)
		goto out_err;

	mutex_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	mutex_unlock(&kvm_lock);

	preempt_notifier_inc();

	return kvm;

out_err:
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	if (kvm->mmu_notifier.ops)
		mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
#endif
out_err_no_mmu_notifier:
	hardware_disable_all();
out_err_no_disable:
	kvm_arch_destroy_vm(kvm);
out_err_no_arch_destroy_vm:
	WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
	for (i = 0; i < KVM_NR_BUSES; i++)
		kfree(kvm_get_bus(kvm, i));
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
	cleanup_srcu_struct(&kvm->irq_srcu);
out_err_no_irq_srcu:
	cleanup_srcu_struct(&kvm->srcu);
out_err_no_srcu:
	kvm_arch_free_vm(kvm);
	mmdrop(current->mm);
	return ERR_PTR(r);
}

static void kvm_destroy_devices(struct kvm *kvm)
{
	struct kvm_device *dev, *tmp;

	/*
	 * We do not need to take the kvm->lock here, because nobody else
	 * has a reference to the struct kvm at this point and therefore
	 * cannot access the devices list anyhow.
	 */
	list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
		list_del(&dev->vm_node);
		dev->ops->destroy(dev);
	}
}

static void kvm_destroy_vm(struct kvm *kvm)
{
	int i;
	struct mm_struct *mm = kvm->mm;

	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
	kvm_destroy_vm_debugfs(kvm);
	kvm_arch_sync_events(kvm);
	mutex_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	mutex_unlock(&kvm_lock);
	kvm_arch_pre_destroy_vm(kvm);

	kvm_free_irq_routing(kvm);
	for (i = 0; i < KVM_NR_BUSES; i++) {
		struct kvm_io_bus *bus = kvm_get_bus(kvm, i);

		if (bus)
			kvm_io_bus_destroy(bus);
		kvm->buses[i] = NULL;
	}
	kvm_coalesced_mmio_free(kvm);
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
#else
	kvm_arch_flush_shadow_all(kvm);
#endif
	kvm_arch_destroy_vm(kvm);
	kvm_destroy_devices(kvm);
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
	cleanup_srcu_struct(&kvm->irq_srcu);
	cleanup_srcu_struct(&kvm->srcu);
	kvm_arch_free_vm(kvm);
	preempt_notifier_dec();
	hardware_disable_all();
	mmdrop(mm);
}

void kvm_get_kvm(struct kvm *kvm)
{
	refcount_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);

void kvm_put_kvm(struct kvm *kvm)
{
	if (refcount_dec_and_test(&kvm->users_count))
		kvm_destroy_vm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);

/*
 * Used to put a reference that was taken on behalf of an object associated
 * with a user-visible file descriptor, e.g. a vcpu or device, if installation
 * of the new file descriptor fails and the reference cannot be transferred to
 * its final owner.  In such cases, the caller is still actively using @kvm and
 * will fail miserably if the refcount unexpectedly hits zero.
 */
void kvm_put_kvm_no_destroy(struct kvm *kvm)
{
	WARN_ON(refcount_dec_and_test(&kvm->users_count));
}
EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);

static int kvm_vm_release(struct inode *inode, struct file *filp)
{
	struct kvm *kvm = filp->private_data;

	kvm_irqfd_release(kvm);

	kvm_put_kvm(kvm);
	return 0;
}

/*
 * Allocation size is twice as large as the actual dirty bitmap size.
 * See kvm_vm_ioctl_get_dirty_log() for why this is needed.
 */
static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);

	memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT);
	if (!memslot->dirty_bitmap)
		return -ENOMEM;

	return 0;
}

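/*
 * Worked example for the memslot array helpers below (illustrative only):
 * with three used slots whose base_gfn values are 0x100000, 0x1000 and 0x0,
 * the array is kept sorted by GFN in descending order,
 *
 *	memslots[0].base_gfn == 0x100000
 *	memslots[1].base_gfn == 0x1000
 *	memslots[2].base_gfn == 0x0
 *
 * with id_to_index[] mapping each slot id to its current position.  Deleting
 * the middle slot shifts the last entry up to index 1 and parks the deleted
 * slot at the end with its id_to_index entry set to -1.
 */
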
/*
 * Delete a memslot by decrementing the number of used slots and shifting all
 * other entries in the array forward one spot.
 */
static inline void kvm_memslot_delete(struct kvm_memslots *slots,
				      struct kvm_memory_slot *memslot)
{
	struct kvm_memory_slot *mslots = slots->memslots;
	int i;

	if (WARN_ON(slots->id_to_index[memslot->id] == -1))
		return;

	slots->used_slots--;

	if (atomic_read(&slots->lru_slot) >= slots->used_slots)
		atomic_set(&slots->lru_slot, 0);

	for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) {
		mslots[i] = mslots[i + 1];
		slots->id_to_index[mslots[i].id] = i;
	}
	mslots[i] = *memslot;
	slots->id_to_index[memslot->id] = -1;
}

/*
 * "Insert" a new memslot by incrementing the number of used slots.  Returns
 * the new slot's initial index into the memslots array.
 */
static inline int kvm_memslot_insert_back(struct kvm_memslots *slots)
{
	return slots->used_slots++;
}

/*
 * Move a changed memslot backwards in the array by shifting existing slots
 * with a higher GFN toward the front of the array.  Note, the changed memslot
 * itself is not preserved in the array, i.e. not swapped at this time, only
 * its new index into the array is tracked.  Returns the changed memslot's
 * current index into the memslots array.
 */
static inline int kvm_memslot_move_backward(struct kvm_memslots *slots,
					    struct kvm_memory_slot *memslot)
{
	struct kvm_memory_slot *mslots = slots->memslots;
	int i;

	if (WARN_ON_ONCE(slots->id_to_index[memslot->id] == -1) ||
	    WARN_ON_ONCE(!slots->used_slots))
		return -1;

	/*
	 * Move the target memslot backward in the array by shifting existing
	 * memslots with a higher GFN (than the target memslot) towards the
	 * front of the array.
	 */
	for (i = slots->id_to_index[memslot->id]; i < slots->used_slots - 1; i++) {
		if (memslot->base_gfn > mslots[i + 1].base_gfn)
			break;

		WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn);

		/* Shift the next memslot forward one and update its index. */
		mslots[i] = mslots[i + 1];
		slots->id_to_index[mslots[i].id] = i;
	}
	return i;
}

/*
 * Move a changed memslot forwards in the array by shifting existing slots with
 * a lower GFN toward the back of the array.  Note, the changed memslot itself
 * is not preserved in the array, i.e. not swapped at this time, only its new
 * index into the array is tracked.  Returns the changed memslot's final index
 * into the memslots array.
 */
static inline int kvm_memslot_move_forward(struct kvm_memslots *slots,
					   struct kvm_memory_slot *memslot,
					   int start)
{
	struct kvm_memory_slot *mslots = slots->memslots;
	int i;

	for (i = start; i > 0; i--) {
		if (memslot->base_gfn < mslots[i - 1].base_gfn)
			break;

		WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn);

		/* Shift the next memslot back one and update its index. */
		mslots[i] = mslots[i - 1];
		slots->id_to_index[mslots[i].id] = i;
	}
	return i;
}

/*
 * Re-sort memslots based on their GFN to account for an added, deleted, or
 * moved memslot.  Sorting memslots by GFN allows using a binary search during
 * memslot lookup.
 *
 * IMPORTANT: Slots are sorted from highest GFN to lowest GFN!  I.e. the entry
 * at memslots[0] has the highest GFN.
 *
 * The sorting algorithm takes advantage of having initially sorted memslots
 * and knowing the position of the changed memslot.  Sorting is also optimized
 * by not swapping the updated memslot and instead only shifting other memslots
 * and tracking the new index for the updated memslot.  Only once its final
 * index is known is the updated memslot copied into its position in the array.
 *
 *  - When deleting a memslot, the deleted memslot simply needs to be moved to
 *    the end of the array.
 *
 *  - When creating a memslot, the algorithm "inserts" the new memslot at the
 *    end of the array and then moves it forward to its correct location.
 *
 *  - When moving a memslot, the algorithm first moves the updated memslot
 *    backward to handle the scenario where the memslot's GFN was changed to a
 *    lower value.  update_memslots() then falls through and runs the same flow
 *    as creating a memslot to move the memslot forward to handle the scenario
 *    where its GFN was changed to a higher value.
 *
 * Note, slots are sorted from highest->lowest instead of lowest->highest for
 * historical reasons.  Originally, invalid memslots were denoted by having
 * GFN=0, thus sorting from highest->lowest naturally sorted invalid memslots
 * to the end of the array.  The current algorithm uses dedicated logic to
 * delete a memslot and thus does not rely on invalid memslots having GFN=0.
 *
 * The other historical motivation for highest->lowest was to improve the
 * performance of memslot lookup.  KVM originally used a linear search starting
 * at memslots[0].  On x86, the largest memslot usually has one of the highest,
 * if not *the* highest, GFN, as the bulk of the guest's RAM is located in a
 * single memslot above the 4gb boundary.  As the largest memslot is also the
 * most likely to be referenced, sorting it to the front of the array was
 * advantageous.  The current binary search starts from the middle of the array
 * and uses an LRU pointer to improve performance for all memslots and GFNs.
 */
static void update_memslots(struct kvm_memslots *slots,
			    struct kvm_memory_slot *memslot,
			    enum kvm_mr_change change)
{
	int i;

	if (change == KVM_MR_DELETE) {
		kvm_memslot_delete(slots, memslot);
	} else {
		if (change == KVM_MR_CREATE)
			i = kvm_memslot_insert_back(slots);
		else
			i = kvm_memslot_move_backward(slots, memslot);
		i = kvm_memslot_move_forward(slots, memslot, i);

		/*
		 * Copy the memslot to its new position in memslots and update
		 * its index accordingly.
		 */
		slots->memslots[i] = *memslot;
		slots->id_to_index[memslot->id] = i;
	}
}

static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
{
	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;

#ifdef __KVM_HAVE_READONLY_MEM
	valid_flags |= KVM_MEM_READONLY;
#endif

	if (mem->flags & ~valid_flags)
		return -EINVAL;

	return 0;
}

static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
		int as_id, struct kvm_memslots *slots)
{
	struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
	u64 gen = old_memslots->generation;

	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
	slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

	rcu_assign_pointer(kvm->memslots[as_id], slots);
	synchronize_srcu_expedited(&kvm->srcu);

	/*
	 * Increment the new memslot generation a second time, dropping the
	 * update in-progress flag and incrementing the generation based on
	 * the number of address spaces.  This provides a unique and easily
	 * identifiable generation number while the memslots are in flux.
	 */
	gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

	/*
	 * Generations must be unique even across address spaces.  We do not need
	 * a global counter for that, instead the generation space is evenly split
	 * across address spaces.  For example, with two address spaces, address
	 * space 0 will use generations 0, 2, 4, ... while address space 1 will
	 * use generations 1, 3, 5, ...
	 */
	gen += KVM_ADDRESS_SPACE_NUM;

	kvm_arch_memslots_updated(kvm, gen);

	slots->generation = gen;

	return old_memslots;
}

/*
 * Note, at a minimum, the current number of used slots must be allocated, even
 * when deleting a memslot, as we need a complete duplicate of the memslots for
 * use when invalidating a memslot prior to deleting/moving the memslot.
 */
static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old,
					     enum kvm_mr_change change)
{
	struct kvm_memslots *slots;
	size_t old_size, new_size;

	old_size = sizeof(struct kvm_memslots) +
		   (sizeof(struct kvm_memory_slot) * old->used_slots);

	if (change == KVM_MR_CREATE)
		new_size = old_size + sizeof(struct kvm_memory_slot);
	else
		new_size = old_size;

	slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT);
	if (likely(slots))
		memcpy(slots, old, old_size);

	return slots;
}

static int kvm_set_memslot(struct kvm *kvm,
			   const struct kvm_userspace_memory_region *mem,
			   struct kvm_memory_slot *old,
			   struct kvm_memory_slot *new, int as_id,
			   enum kvm_mr_change change)
{
	struct kvm_memory_slot *slot;
	struct kvm_memslots *slots;
	int r;

	slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change);
	if (!slots)
		return -ENOMEM;

	if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
		/*
		 * Note, the INVALID flag needs to be in the appropriate entry
		 * in the freshly allocated memslots, not in @old or @new.
		 */
		slot = id_to_memslot(slots, old->id);
		slot->flags |= KVM_MEMSLOT_INVALID;

		/*
		 * We can re-use the old memslots, the only difference from the
		 * newly installed memslots is the invalid flag, which will get
		 * dropped by update_memslots anyway.
		 * We'll also revert to the old memslots if preparing the new
		 * memory region fails.
		 */
		slots = install_new_memslots(kvm, as_id, slots);

		/* From this point no new shadow pages pointing to a deleted,
		 * or moved, memslot will be created.
		 *
		 * validation of sp->gfn happens in:
		 *	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)
		 *	- kvm_is_visible_gfn (mmu_check_root)
		 */
		kvm_arch_flush_shadow_memslot(kvm, slot);
	}

	r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
	if (r)
		goto out_slots;

	update_memslots(slots, new, change);
	slots = install_new_memslots(kvm, as_id, slots);

	kvm_arch_commit_memory_region(kvm, mem, old, new, change);

	kvfree(slots);
	return 0;

out_slots:
	if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
		slots = install_new_memslots(kvm, as_id, slots);
	kvfree(slots);
	return r;
}

static int kvm_delete_memslot(struct kvm *kvm,
			      const struct kvm_userspace_memory_region *mem,
			      struct kvm_memory_slot *old, int as_id)
{
	struct kvm_memory_slot new;
	int r;

	if (!old->npages)
		return -EINVAL;

	memset(&new, 0, sizeof(new));
	new.id = old->id;
	/*
	 * This is only for debugging purpose; it should never be referenced
	 * for a removed memslot.
	 */
	new.as_id = as_id;

	r = kvm_set_memslot(kvm, mem, old, &new, as_id, KVM_MR_DELETE);
	if (r)
		return r;

	kvm_free_memslot(kvm, old);
	return 0;
}

/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
 *
 * Must be called holding kvm->slots_lock for write.
 */
int __kvm_set_memory_region(struct kvm *kvm,
			    const struct kvm_userspace_memory_region *mem)
{
	struct kvm_memory_slot old, new;
	struct kvm_memory_slot *tmp;
	enum kvm_mr_change change;
	int as_id, id;
	int r;

	r = check_memory_region_flags(mem);
	if (r)
		return r;

	as_id = mem->slot >> 16;
	id = (u16)mem->slot;

	/* General sanity checks */
	if (mem->memory_size & (PAGE_SIZE - 1))
		return -EINVAL;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		return -EINVAL;
	/* We can read the guest memory with __xxx_user() later on. */
	if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
	     !access_ok((void __user *)(unsigned long)mem->userspace_addr,
			mem->memory_size))
		return -EINVAL;
	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
		return -EINVAL;
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
		return -EINVAL;

	/*
	 * Make a full copy of the old memslot, the pointer will become stale
	 * when the memslots are re-sorted by update_memslots(), and the old
	 * memslot needs to be referenced after calling update_memslots(), e.g.
	 * to free its resources and for arch specific behavior.
	 */
	tmp = id_to_memslot(__kvm_memslots(kvm, as_id), id);
	if (tmp) {
		old = *tmp;
		tmp = NULL;
	} else {
		memset(&old, 0, sizeof(old));
		old.id = id;
	}

	if (!mem->memory_size)
		return kvm_delete_memslot(kvm, mem, &old, as_id);

	new.as_id = as_id;
	new.id = id;
	new.base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
	new.npages = mem->memory_size >> PAGE_SHIFT;
	new.flags = mem->flags;
	new.userspace_addr = mem->userspace_addr;

	if (new.npages > KVM_MEM_MAX_NR_PAGES)
		return -EINVAL;

	if (!old.npages) {
		change = KVM_MR_CREATE;
		new.dirty_bitmap = NULL;
		memset(&new.arch, 0, sizeof(new.arch));
	} else { /* Modify an existing slot. */
		if ((new.userspace_addr != old.userspace_addr) ||
		    (new.npages != old.npages) ||
		    ((new.flags ^ old.flags) & KVM_MEM_READONLY))
			return -EINVAL;

		if (new.base_gfn != old.base_gfn)
			change = KVM_MR_MOVE;
		else if (new.flags != old.flags)
			change = KVM_MR_FLAGS_ONLY;
		else /* Nothing to change. */
			return 0;

		/* Copy dirty_bitmap and arch from the current memslot. */
		new.dirty_bitmap = old.dirty_bitmap;
		memcpy(&new.arch, &old.arch, sizeof(new.arch));
	}

	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
		/* Check for overlaps */
		kvm_for_each_memslot(tmp, __kvm_memslots(kvm, as_id)) {
			if (tmp->id == id)
				continue;
			if (!((new.base_gfn + new.npages <= tmp->base_gfn) ||
			      (new.base_gfn >= tmp->base_gfn + tmp->npages)))
				return -EEXIST;
		}
	}

	/* Allocate/free page dirty bitmap as needed */
	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
		new.dirty_bitmap = NULL;
	else if (!new.dirty_bitmap) {
		r = kvm_alloc_dirty_bitmap(&new);
		if (r)
			return r;

		if (kvm_dirty_log_manual_protect_and_init_set(kvm))
			bitmap_set(new.dirty_bitmap, 0, new.npages);
	}

	r = kvm_set_memslot(kvm, mem, &old, &new, as_id, change);
	if (r)
		goto out_bitmap;

	if (old.dirty_bitmap && !new.dirty_bitmap)
		kvm_destroy_dirty_bitmap(&old);
	return 0;

out_bitmap:
	if (new.dirty_bitmap && !old.dirty_bitmap)
		kvm_destroy_dirty_bitmap(&new);
	return r;
}
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);

int kvm_set_memory_region(struct kvm *kvm,
			  const struct kvm_userspace_memory_region *mem)
{
	int r;

	mutex_lock(&kvm->slots_lock);
	r = __kvm_set_memory_region(kvm, mem);
	mutex_unlock(&kvm->slots_lock);
	return r;
}
EXPORT_SYMBOL_GPL(kvm_set_memory_region);

static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
					  struct kvm_userspace_memory_region *mem)
{
	if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
		return -EINVAL;

	return kvm_set_memory_region(kvm, mem);
}

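/*
 * Illustrative userspace view (an example, not kernel code): the slot field
 * of struct kvm_userspace_memory_region packs the address space id in bits
 * 31:16 and the slot id in bits 15:0, matching the "mem->slot >> 16" and
 * "(u16)mem->slot" decoding above.  On x86, registering slot 3 of the SMM
 * address space would look like:
 *
 *	region.slot = (1 << 16) | 3;
 *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
 */
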
#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
/**
 * kvm_get_dirty_log - get a snapshot of dirty pages
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address to which we copy the log
 * @is_dirty:	set to '1' if any dirty pages were found
 * @memslot:	set to the associated memslot, always valid on success
 */
int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
		      int *is_dirty, struct kvm_memory_slot **memslot)
{
	struct kvm_memslots *slots;
	int i, as_id, id;
	unsigned long n;
	unsigned long any = 0;

	*memslot = NULL;
	*is_dirty = 0;

	as_id = log->slot >> 16;
	id = (u16)log->slot;
	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
		return -EINVAL;

	slots = __kvm_memslots(kvm, as_id);
	*memslot = id_to_memslot(slots, id);
	if (!(*memslot) || !(*memslot)->dirty_bitmap)
		return -ENOENT;

	kvm_arch_sync_dirty_log(kvm, *memslot);

	n = kvm_dirty_bitmap_bytes(*memslot);

	for (i = 0; !any && i < n/sizeof(long); ++i)
		any = (*memslot)->dirty_bitmap[i];

	if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
		return -EFAULT;

	if (any)
		*is_dirty = 1;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_dirty_log);

#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
/**
 * kvm_get_dirty_log_protect - get a snapshot of dirty pages
 *	and reenable dirty page tracking for the corresponding pages.
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address to which we copy the log
 *
 * We need to keep it in mind that VCPU threads can write to the bitmap
 * concurrently.  So, to avoid losing track of dirty pages we keep the
 * following order:
 *
 *    1. Take a snapshot of the bit and clear it if needed.
 *    2. Write protect the corresponding page.
 *    3. Copy the snapshot to the userspace.
 *    4. Upon return caller flushes TLB's if needed.
 *
 * Between 2 and 4, the guest may write to the page using the remaining TLB
 * entry.  This is not a problem because the page is reported dirty using
 * the snapshot taken before and step 4 ensures that writes done after
 * exiting to userspace will be logged for the next call.
 *
 */
static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int i, as_id, id;
	unsigned long n;
	unsigned long *dirty_bitmap;
	unsigned long *dirty_bitmap_buffer;
	bool flush;

	as_id = log->slot >> 16;
	id = (u16)log->slot;
	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
		return -EINVAL;

	slots = __kvm_memslots(kvm, as_id);
	memslot = id_to_memslot(slots, id);
	if (!memslot || !memslot->dirty_bitmap)
		return -ENOENT;

	dirty_bitmap = memslot->dirty_bitmap;

	kvm_arch_sync_dirty_log(kvm, memslot);

	n = kvm_dirty_bitmap_bytes(memslot);
	flush = false;
	if (kvm->manual_dirty_log_protect) {
		/*
		 * Unlike kvm_get_dirty_log, we always return false in *flush,
		 * because no flush is needed until KVM_CLEAR_DIRTY_LOG.  There
		 * is some code duplication between this function and
		 * kvm_get_dirty_log, but hopefully all architectures
		 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
		 * can be eliminated.
		 */
		dirty_bitmap_buffer = dirty_bitmap;
	} else {
		dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
		memset(dirty_bitmap_buffer, 0, n);

		spin_lock(&kvm->mmu_lock);
		for (i = 0; i < n / sizeof(long); i++) {
			unsigned long mask;
			gfn_t offset;

			if (!dirty_bitmap[i])
				continue;

			flush = true;
			mask = xchg(&dirty_bitmap[i], 0);
			dirty_bitmap_buffer[i] = mask;

			offset = i * BITS_PER_LONG;
			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
								offset, mask);
		}
		spin_unlock(&kvm->mmu_lock);
	}

	if (flush)
		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);

	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
		return -EFAULT;
	return 0;
}

/**
 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
 * @kvm:	kvm instance
 * @log:	slot id and address to which we copy the log
 *
 * Steps 1-4 below provide general overview of dirty page logging.  See
 * kvm_get_dirty_log_protect() function description for additional details.
 *
 * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
 * always flush the TLB (step 4) even if previous step failed and the dirty
 * bitmap may be corrupt.  Regardless of previous outcome the KVM logging API
 * does not preclude user space subsequent dirty log read.  Flushing TLB ensures
 * writes will be marked dirty for next log read.
 *
 *   1. Take a snapshot of the bit and clear it if needed.
 *   2. Write protect the corresponding page.
 *   3. Copy the snapshot to the userspace.
 *   4. Flush TLB's if needed.
 */
static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
				      struct kvm_dirty_log *log)
{
	int r;

	mutex_lock(&kvm->slots_lock);

	r = kvm_get_dirty_log_protect(kvm, log);

	mutex_unlock(&kvm->slots_lock);
	return r;
}

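/*
 * Illustrative userspace sequence (an example, not kernel code): with
 * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 enabled, harvesting becomes a two-step
 * loop that matches the manual_dirty_log_protect path above and the clear
 * path below:
 *
 *	ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);		// snapshot only
 *	... process the bitmap ...
 *	ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear);	// re-protect handled pages
 */
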
/**
 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
 *	and reenable dirty page tracking for the corresponding pages.
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address from which to fetch the bitmap of dirty pages
 */
static int kvm_clear_dirty_log_protect(struct kvm *kvm,
				       struct kvm_clear_dirty_log *log)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int as_id, id;
	gfn_t offset;
	unsigned long i, n;
	unsigned long *dirty_bitmap;
	unsigned long *dirty_bitmap_buffer;
	bool flush;

	as_id = log->slot >> 16;
	id = (u16)log->slot;
	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
		return -EINVAL;

	if (log->first_page & 63)
		return -EINVAL;

	slots = __kvm_memslots(kvm, as_id);
	memslot = id_to_memslot(slots, id);
	if (!memslot || !memslot->dirty_bitmap)
		return -ENOENT;

	dirty_bitmap = memslot->dirty_bitmap;

	n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;

	if (log->first_page > memslot->npages ||
	    log->num_pages > memslot->npages - log->first_page ||
	    (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
		return -EINVAL;

	kvm_arch_sync_dirty_log(kvm, memslot);

	flush = false;
	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
	if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
		return -EFAULT;

	spin_lock(&kvm->mmu_lock);
	for (offset = log->first_page, i = offset / BITS_PER_LONG,
		 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
	     i++, offset += BITS_PER_LONG) {
		unsigned long mask = *dirty_bitmap_buffer++;
		atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
		if (!mask)
			continue;

		mask &= atomic_long_fetch_andnot(mask, p);

		/*
		 * mask contains the bits that really have been cleared.  This
		 * never includes any bits beyond the length of the memslot (if
		 * the length is not aligned to 64 pages), therefore it is not
		 * a problem if userspace sets them in log->dirty_bitmap.
		 */
		if (mask) {
			flush = true;
			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
								offset, mask);
		}
	}
	spin_unlock(&kvm->mmu_lock);

	if (flush)
		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);

	return 0;
}

static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
					struct kvm_clear_dirty_log *log)
{
	int r;

	mutex_lock(&kvm->slots_lock);

	r = kvm_clear_dirty_log_protect(kvm, log);

	mutex_unlock(&kvm->slots_lock);
	return r;
}
#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */

struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
}
EXPORT_SYMBOL_GPL(gfn_to_memslot);

struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);

bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);

	return kvm_is_visible_memslot(memslot);
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);

bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);

	return kvm_is_visible_memslot(memslot);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);

unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	struct vm_area_struct *vma;
	unsigned long addr, size;

	size = PAGE_SIZE;

	addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
	if (kvm_is_error_hva(addr))
		return PAGE_SIZE;

	mmap_read_lock(current->mm);
	vma = find_vma(current->mm, addr);
	if (!vma)
		goto out;

	size = vma_kernel_pagesize(vma);

out:
	mmap_read_unlock(current->mm);

	return size;
}

static bool memslot_is_readonly(struct kvm_memory_slot *slot)
{
	return slot->flags & KVM_MEM_READONLY;
}

static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
				       gfn_t *nr_pages, bool write)
{
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return KVM_HVA_ERR_BAD;

	if (memslot_is_readonly(slot) && write)
		return KVM_HVA_ERR_RO_BAD;

	if (nr_pages)
		*nr_pages = slot->npages - (gfn - slot->base_gfn);

	return __gfn_to_hva_memslot(slot, gfn);
}

static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
				     gfn_t *nr_pages)
{
	return __gfn_to_hva_many(slot, gfn, nr_pages, true);
}

unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
					gfn_t gfn)
{
	return gfn_to_hva_many(slot, gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);

unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);

unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);

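/*
 * For reference: __gfn_to_hva_memslot(), used by the helpers above, is a
 * simple linear translation into the slot's userspace mapping (the helper
 * itself lives in kvm_host.h):
 *
 *	hva = slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
 */
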
/*
 * Return the hva of a @gfn and the R/W attribute if possible.
 *
 * @slot: the kvm_memory_slot which contains @gfn
 * @gfn: the gfn to be translated
 * @writable: used to return the read/write attribute of the @slot if the hva
 * is valid and @writable is not NULL
 */
unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
				      gfn_t gfn, bool *writable)
{
	unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);

	if (!kvm_is_error_hva(hva) && writable)
		*writable = !memslot_is_readonly(slot);

	return hva;
}

unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
{
	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);

	return gfn_to_hva_memslot_prot(slot, gfn, writable);
}

unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
{
	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);

	return gfn_to_hva_memslot_prot(slot, gfn, writable);
}

static inline int check_user_page_hwpoison(unsigned long addr)
{
	int rc, flags = FOLL_HWPOISON | FOLL_WRITE;

	rc = get_user_pages(addr, 1, flags, NULL, NULL);
	return rc == -EHWPOISON;
}

/*
 * The fast path to get the writable pfn which will be stored in @pfn,
 * true indicates success, otherwise false is returned.  It's also the
 * only part that runs if we are in atomic context.
 */
static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
			    bool *writable, kvm_pfn_t *pfn)
{
	struct page *page[1];

	/*
	 * Fast pin a writable pfn only if it is a write fault request
	 * or the caller allows to map a writable pfn for a read fault
	 * request.
	 */
	if (!(write_fault || writable))
		return false;

	if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
		*pfn = page_to_pfn(page[0]);

		if (writable)
			*writable = true;
		return true;
	}

	return false;
}

/*
 * The slow path to get the pfn of the specified host virtual address,
 * 1 indicates success, -errno is returned if error is detected.
 */
static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
			   bool *writable, kvm_pfn_t *pfn)
{
	unsigned int flags = FOLL_HWPOISON;
	struct page *page;
	int npages = 0;

	might_sleep();

	if (writable)
		*writable = write_fault;

	if (write_fault)
		flags |= FOLL_WRITE;
	if (async)
		flags |= FOLL_NOWAIT;

	npages = get_user_pages_unlocked(addr, 1, &page, flags);
	if (npages != 1)
		return npages;

	/* map read fault as writable if possible */
	if (unlikely(!write_fault) && writable) {
		struct page *wpage;

		if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
			*writable = true;
			put_page(page);
			page = wpage;
		}
	}
	*pfn = page_to_pfn(page);
	return npages;
}

static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
{
	if (unlikely(!(vma->vm_flags & VM_READ)))
		return false;

	if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
		return false;

	return true;
}

static int hva_to_pfn_remapped(struct vm_area_struct *vma,
			       unsigned long addr, bool *async,
			       bool write_fault, bool *writable,
			       kvm_pfn_t *p_pfn)
{
	unsigned long pfn;
	int r;

	r = follow_pfn(vma, addr, &pfn);
	if (r) {
		/*
		 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
		 * not call the fault handler, so do it here.
		 */
		bool unlocked = false;
		r = fixup_user_fault(current->mm, addr,
				     (write_fault ? FAULT_FLAG_WRITE : 0),
				     &unlocked);
		if (unlocked)
			return -EAGAIN;
		if (r)
			return r;

		r = follow_pfn(vma, addr, &pfn);
		if (r)
			return r;

	}

	if (writable)
		*writable = true;

	/*
	 * Get a reference here because callers of *hva_to_pfn* and
	 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
	 * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
	 * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
	 * simply do nothing for reserved pfns.
	 *
	 * Whoever called remap_pfn_range is also going to call e.g.
	 * unmap_mapping_range before the underlying pages are freed,
	 * causing a call to our MMU notifier.
	 */
	kvm_get_pfn(pfn);

	*p_pfn = pfn;
	return 0;
}

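/*
 * Summary of the lookup strategy implemented below: hva_to_pfn() first tries
 * hva_to_pfn_fast(), falls back to hva_to_pfn_slow() once it is allowed to
 * sleep, and finally handles VM_IO/VM_PFNMAP mappings via
 * hva_to_pfn_remapped(); atomic callers stop after the fast path and get
 * KVM_PFN_ERR_FAULT instead of sleeping.
 */
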
1948 */ 1949 static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, 1950 bool write_fault, bool *writable) 1951 { 1952 struct vm_area_struct *vma; 1953 kvm_pfn_t pfn = 0; 1954 int npages, r; 1955 1956 /* we can do it either atomically or asynchronously, not both */ 1957 BUG_ON(atomic && async); 1958 1959 if (hva_to_pfn_fast(addr, write_fault, writable, &pfn)) 1960 return pfn; 1961 1962 if (atomic) 1963 return KVM_PFN_ERR_FAULT; 1964 1965 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn); 1966 if (npages == 1) 1967 return pfn; 1968 1969 mmap_read_lock(current->mm); 1970 if (npages == -EHWPOISON || 1971 (!async && check_user_page_hwpoison(addr))) { 1972 pfn = KVM_PFN_ERR_HWPOISON; 1973 goto exit; 1974 } 1975 1976 retry: 1977 vma = find_vma_intersection(current->mm, addr, addr + 1); 1978 1979 if (vma == NULL) 1980 pfn = KVM_PFN_ERR_FAULT; 1981 else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { 1982 r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn); 1983 if (r == -EAGAIN) 1984 goto retry; 1985 if (r < 0) 1986 pfn = KVM_PFN_ERR_FAULT; 1987 } else { 1988 if (async && vma_is_valid(vma, write_fault)) 1989 *async = true; 1990 pfn = KVM_PFN_ERR_FAULT; 1991 } 1992 exit: 1993 mmap_read_unlock(current->mm); 1994 return pfn; 1995 } 1996 1997 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, 1998 bool atomic, bool *async, bool write_fault, 1999 bool *writable) 2000 { 2001 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); 2002 2003 if (addr == KVM_HVA_ERR_RO_BAD) { 2004 if (writable) 2005 *writable = false; 2006 return KVM_PFN_ERR_RO_FAULT; 2007 } 2008 2009 if (kvm_is_error_hva(addr)) { 2010 if (writable) 2011 *writable = false; 2012 return KVM_PFN_NOSLOT; 2013 } 2014 2015 /* Do not map writable pfn in the readonly memslot. 
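 * *writable is reported as false and the pointer is then cleared so that
 * hva_to_pfn() does not try to obtain a writable mapping.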
*/ 2016 if (writable && memslot_is_readonly(slot)) { 2017 *writable = false; 2018 writable = NULL; 2019 } 2020 2021 return hva_to_pfn(addr, atomic, async, write_fault, 2022 writable); 2023 } 2024 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); 2025 2026 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, 2027 bool *writable) 2028 { 2029 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, 2030 write_fault, writable); 2031 } 2032 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 2033 2034 kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) 2035 { 2036 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); 2037 } 2038 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); 2039 2040 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) 2041 { 2042 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); 2043 } 2044 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); 2045 2046 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn) 2047 { 2048 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 2049 } 2050 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic); 2051 2052 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 2053 { 2054 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); 2055 } 2056 EXPORT_SYMBOL_GPL(gfn_to_pfn); 2057 2058 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) 2059 { 2060 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 2061 } 2062 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn); 2063 2064 int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 2065 struct page **pages, int nr_pages) 2066 { 2067 unsigned long addr; 2068 gfn_t entry = 0; 2069 2070 addr = gfn_to_hva_many(slot, gfn, &entry); 2071 if (kvm_is_error_hva(addr)) 2072 return -1; 2073 2074 if (entry < nr_pages) 2075 return 0; 2076 2077 return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages); 2078 } 2079 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 2080 2081 static struct page *kvm_pfn_to_page(kvm_pfn_t pfn) 2082 { 2083 if (is_error_noslot_pfn(pfn)) 2084 return KVM_ERR_PTR_BAD_PAGE; 2085 2086 if (kvm_is_reserved_pfn(pfn)) { 2087 WARN_ON(1); 2088 return KVM_ERR_PTR_BAD_PAGE; 2089 } 2090 2091 return pfn_to_page(pfn); 2092 } 2093 2094 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 2095 { 2096 kvm_pfn_t pfn; 2097 2098 pfn = gfn_to_pfn(kvm, gfn); 2099 2100 return kvm_pfn_to_page(pfn); 2101 } 2102 EXPORT_SYMBOL_GPL(gfn_to_page); 2103 2104 void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache) 2105 { 2106 if (pfn == 0) 2107 return; 2108 2109 if (cache) 2110 cache->pfn = cache->gfn = 0; 2111 2112 if (dirty) 2113 kvm_release_pfn_dirty(pfn); 2114 else 2115 kvm_release_pfn_clean(pfn); 2116 } 2117 2118 static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn, 2119 struct gfn_to_pfn_cache *cache, u64 gen) 2120 { 2121 kvm_release_pfn(cache->pfn, cache->dirty, cache); 2122 2123 cache->pfn = gfn_to_pfn_memslot(slot, gfn); 2124 cache->gfn = gfn; 2125 cache->dirty = false; 2126 cache->generation = gen; 2127 } 2128 2129 static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn, 2130 struct kvm_host_map *map, 2131 struct gfn_to_pfn_cache *cache, 2132 bool atomic) 2133 { 2134 kvm_pfn_t pfn; 2135 void *hva = NULL; 2136 struct page *page = KVM_UNMAPPED_PAGE; 2137 struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn); 2138 u64 gen = slots->generation; 2139 2140 if (!map) 2141 return -EINVAL; 2142 2143 if (cache) { 2144 if (!cache->pfn || cache->gfn 
!= gfn || 2145 cache->generation != gen) { 2146 if (atomic) 2147 return -EAGAIN; 2148 kvm_cache_gfn_to_pfn(slot, gfn, cache, gen); 2149 } 2150 pfn = cache->pfn; 2151 } else { 2152 if (atomic) 2153 return -EAGAIN; 2154 pfn = gfn_to_pfn_memslot(slot, gfn); 2155 } 2156 if (is_error_noslot_pfn(pfn)) 2157 return -EINVAL; 2158 2159 if (pfn_valid(pfn)) { 2160 page = pfn_to_page(pfn); 2161 if (atomic) 2162 hva = kmap_atomic(page); 2163 else 2164 hva = kmap(page); 2165 #ifdef CONFIG_HAS_IOMEM 2166 } else if (!atomic) { 2167 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB); 2168 } else { 2169 return -EINVAL; 2170 #endif 2171 } 2172 2173 if (!hva) 2174 return -EFAULT; 2175 2176 map->page = page; 2177 map->hva = hva; 2178 map->pfn = pfn; 2179 map->gfn = gfn; 2180 2181 return 0; 2182 } 2183 2184 int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, 2185 struct gfn_to_pfn_cache *cache, bool atomic) 2186 { 2187 return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map, 2188 cache, atomic); 2189 } 2190 EXPORT_SYMBOL_GPL(kvm_map_gfn); 2191 2192 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) 2193 { 2194 return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map, 2195 NULL, false); 2196 } 2197 EXPORT_SYMBOL_GPL(kvm_vcpu_map); 2198 2199 static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot, 2200 struct kvm_host_map *map, 2201 struct gfn_to_pfn_cache *cache, 2202 bool dirty, bool atomic) 2203 { 2204 if (!map) 2205 return; 2206 2207 if (!map->hva) 2208 return; 2209 2210 if (map->page != KVM_UNMAPPED_PAGE) { 2211 if (atomic) 2212 kunmap_atomic(map->hva); 2213 else 2214 kunmap(map->page); 2215 } 2216 #ifdef CONFIG_HAS_IOMEM 2217 else if (!atomic) 2218 memunmap(map->hva); 2219 else 2220 WARN_ONCE(1, "Unexpected unmapping in atomic context"); 2221 #endif 2222 2223 if (dirty) 2224 mark_page_dirty_in_slot(memslot, map->gfn); 2225 2226 if (cache) 2227 cache->dirty |= dirty; 2228 else 2229 kvm_release_pfn(map->pfn, dirty, NULL); 2230 2231 map->hva = NULL; 2232 map->page = NULL; 2233 } 2234 2235 int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, 2236 struct gfn_to_pfn_cache *cache, bool dirty, bool atomic) 2237 { 2238 __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map, 2239 cache, dirty, atomic); 2240 return 0; 2241 } 2242 EXPORT_SYMBOL_GPL(kvm_unmap_gfn); 2243 2244 void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) 2245 { 2246 __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, NULL, 2247 dirty, false); 2248 } 2249 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); 2250 2251 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn) 2252 { 2253 kvm_pfn_t pfn; 2254 2255 pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn); 2256 2257 return kvm_pfn_to_page(pfn); 2258 } 2259 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page); 2260 2261 void kvm_release_page_clean(struct page *page) 2262 { 2263 WARN_ON(is_error_page(page)); 2264 2265 kvm_release_pfn_clean(page_to_pfn(page)); 2266 } 2267 EXPORT_SYMBOL_GPL(kvm_release_page_clean); 2268 2269 void kvm_release_pfn_clean(kvm_pfn_t pfn) 2270 { 2271 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) 2272 put_page(pfn_to_page(pfn)); 2273 } 2274 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 2275 2276 void kvm_release_page_dirty(struct page *page) 2277 { 2278 WARN_ON(is_error_page(page)); 2279 2280 kvm_release_pfn_dirty(page_to_pfn(page)); 2281 } 2282 EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 2283 2284 void kvm_release_pfn_dirty(kvm_pfn_t pfn) 2285 { 2286 kvm_set_pfn_dirty(pfn); 2287 
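/* The pfn was marked dirty above; kvm_release_pfn_clean() below drops the reference (a no-op for reserved and error pfns). */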
kvm_release_pfn_clean(pfn); 2288 } 2289 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); 2290 2291 void kvm_set_pfn_dirty(kvm_pfn_t pfn) 2292 { 2293 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) 2294 SetPageDirty(pfn_to_page(pfn)); 2295 } 2296 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 2297 2298 void kvm_set_pfn_accessed(kvm_pfn_t pfn) 2299 { 2300 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) 2301 mark_page_accessed(pfn_to_page(pfn)); 2302 } 2303 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 2304 2305 void kvm_get_pfn(kvm_pfn_t pfn) 2306 { 2307 if (!kvm_is_reserved_pfn(pfn)) 2308 get_page(pfn_to_page(pfn)); 2309 } 2310 EXPORT_SYMBOL_GPL(kvm_get_pfn); 2311 2312 static int next_segment(unsigned long len, int offset) 2313 { 2314 if (len > PAGE_SIZE - offset) 2315 return PAGE_SIZE - offset; 2316 else 2317 return len; 2318 } 2319 2320 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, 2321 void *data, int offset, int len) 2322 { 2323 int r; 2324 unsigned long addr; 2325 2326 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 2327 if (kvm_is_error_hva(addr)) 2328 return -EFAULT; 2329 r = __copy_from_user(data, (void __user *)addr + offset, len); 2330 if (r) 2331 return -EFAULT; 2332 return 0; 2333 } 2334 2335 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 2336 int len) 2337 { 2338 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 2339 2340 return __kvm_read_guest_page(slot, gfn, data, offset, len); 2341 } 2342 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 2343 2344 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, 2345 int offset, int len) 2346 { 2347 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2348 2349 return __kvm_read_guest_page(slot, gfn, data, offset, len); 2350 } 2351 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page); 2352 2353 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) 2354 { 2355 gfn_t gfn = gpa >> PAGE_SHIFT; 2356 int seg; 2357 int offset = offset_in_page(gpa); 2358 int ret; 2359 2360 while ((seg = next_segment(len, offset)) != 0) { 2361 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 2362 if (ret < 0) 2363 return ret; 2364 offset = 0; 2365 len -= seg; 2366 data += seg; 2367 ++gfn; 2368 } 2369 return 0; 2370 } 2371 EXPORT_SYMBOL_GPL(kvm_read_guest); 2372 2373 int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) 2374 { 2375 gfn_t gfn = gpa >> PAGE_SHIFT; 2376 int seg; 2377 int offset = offset_in_page(gpa); 2378 int ret; 2379 2380 while ((seg = next_segment(len, offset)) != 0) { 2381 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg); 2382 if (ret < 0) 2383 return ret; 2384 offset = 0; 2385 len -= seg; 2386 data += seg; 2387 ++gfn; 2388 } 2389 return 0; 2390 } 2391 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest); 2392 2393 static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 2394 void *data, int offset, unsigned long len) 2395 { 2396 int r; 2397 unsigned long addr; 2398 2399 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 2400 if (kvm_is_error_hva(addr)) 2401 return -EFAULT; 2402 pagefault_disable(); 2403 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 2404 pagefault_enable(); 2405 if (r) 2406 return -EFAULT; 2407 return 0; 2408 } 2409 2410 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, 2411 void *data, unsigned long len) 2412 { 2413 gfn_t gfn = gpa >> PAGE_SHIFT; 2414 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 
2415 int offset = offset_in_page(gpa); 2416 2417 return __kvm_read_guest_atomic(slot, gfn, data, offset, len); 2418 } 2419 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); 2420 2421 static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn, 2422 const void *data, int offset, int len) 2423 { 2424 int r; 2425 unsigned long addr; 2426 2427 addr = gfn_to_hva_memslot(memslot, gfn); 2428 if (kvm_is_error_hva(addr)) 2429 return -EFAULT; 2430 r = __copy_to_user((void __user *)addr + offset, data, len); 2431 if (r) 2432 return -EFAULT; 2433 mark_page_dirty_in_slot(memslot, gfn); 2434 return 0; 2435 } 2436 2437 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, 2438 const void *data, int offset, int len) 2439 { 2440 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 2441 2442 return __kvm_write_guest_page(slot, gfn, data, offset, len); 2443 } 2444 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 2445 2446 int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, 2447 const void *data, int offset, int len) 2448 { 2449 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2450 2451 return __kvm_write_guest_page(slot, gfn, data, offset, len); 2452 } 2453 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page); 2454 2455 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 2456 unsigned long len) 2457 { 2458 gfn_t gfn = gpa >> PAGE_SHIFT; 2459 int seg; 2460 int offset = offset_in_page(gpa); 2461 int ret; 2462 2463 while ((seg = next_segment(len, offset)) != 0) { 2464 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 2465 if (ret < 0) 2466 return ret; 2467 offset = 0; 2468 len -= seg; 2469 data += seg; 2470 ++gfn; 2471 } 2472 return 0; 2473 } 2474 EXPORT_SYMBOL_GPL(kvm_write_guest); 2475 2476 int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, 2477 unsigned long len) 2478 { 2479 gfn_t gfn = gpa >> PAGE_SHIFT; 2480 int seg; 2481 int offset = offset_in_page(gpa); 2482 int ret; 2483 2484 while ((seg = next_segment(len, offset)) != 0) { 2485 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg); 2486 if (ret < 0) 2487 return ret; 2488 offset = 0; 2489 len -= seg; 2490 data += seg; 2491 ++gfn; 2492 } 2493 return 0; 2494 } 2495 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); 2496 2497 static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, 2498 struct gfn_to_hva_cache *ghc, 2499 gpa_t gpa, unsigned long len) 2500 { 2501 int offset = offset_in_page(gpa); 2502 gfn_t start_gfn = gpa >> PAGE_SHIFT; 2503 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; 2504 gfn_t nr_pages_needed = end_gfn - start_gfn + 1; 2505 gfn_t nr_pages_avail; 2506 2507 /* Update ghc->generation before performing any error checks. */ 2508 ghc->generation = slots->generation; 2509 2510 if (start_gfn > end_gfn) { 2511 ghc->hva = KVM_HVA_ERR_BAD; 2512 return -EINVAL; 2513 } 2514 2515 /* 2516 * If the requested region crosses two memslots, we still 2517 * verify that the entire region is valid here. 2518 */ 2519 for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) { 2520 ghc->memslot = __gfn_to_memslot(slots, start_gfn); 2521 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, 2522 &nr_pages_avail); 2523 if (kvm_is_error_hva(ghc->hva)) 2524 return -EFAULT; 2525 } 2526 2527 /* Use the slow path for cross page reads and writes. 
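 * This is done by leaving ghc->memslot NULL, which makes
 * kvm_read_guest_offset_cached() and kvm_write_guest_offset_cached() fall
 * back to the uncached kvm_read_guest()/kvm_write_guest() helpers.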
*/ 2528 if (nr_pages_needed == 1) 2529 ghc->hva += offset; 2530 else 2531 ghc->memslot = NULL; 2532 2533 ghc->gpa = gpa; 2534 ghc->len = len; 2535 return 0; 2536 } 2537 2538 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2539 gpa_t gpa, unsigned long len) 2540 { 2541 struct kvm_memslots *slots = kvm_memslots(kvm); 2542 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); 2543 } 2544 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); 2545 2546 int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2547 void *data, unsigned int offset, 2548 unsigned long len) 2549 { 2550 struct kvm_memslots *slots = kvm_memslots(kvm); 2551 int r; 2552 gpa_t gpa = ghc->gpa + offset; 2553 2554 BUG_ON(len + offset > ghc->len); 2555 2556 if (slots->generation != ghc->generation) { 2557 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) 2558 return -EFAULT; 2559 } 2560 2561 if (kvm_is_error_hva(ghc->hva)) 2562 return -EFAULT; 2563 2564 if (unlikely(!ghc->memslot)) 2565 return kvm_write_guest(kvm, gpa, data, len); 2566 2567 r = __copy_to_user((void __user *)ghc->hva + offset, data, len); 2568 if (r) 2569 return -EFAULT; 2570 mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT); 2571 2572 return 0; 2573 } 2574 EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); 2575 2576 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2577 void *data, unsigned long len) 2578 { 2579 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); 2580 } 2581 EXPORT_SYMBOL_GPL(kvm_write_guest_cached); 2582 2583 int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2584 void *data, unsigned int offset, 2585 unsigned long len) 2586 { 2587 struct kvm_memslots *slots = kvm_memslots(kvm); 2588 int r; 2589 gpa_t gpa = ghc->gpa + offset; 2590 2591 BUG_ON(len + offset > ghc->len); 2592 2593 if (slots->generation != ghc->generation) { 2594 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) 2595 return -EFAULT; 2596 } 2597 2598 if (kvm_is_error_hva(ghc->hva)) 2599 return -EFAULT; 2600 2601 if (unlikely(!ghc->memslot)) 2602 return kvm_read_guest(kvm, gpa, data, len); 2603 2604 r = __copy_from_user(data, (void __user *)ghc->hva + offset, len); 2605 if (r) 2606 return -EFAULT; 2607 2608 return 0; 2609 } 2610 EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached); 2611 2612 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2613 void *data, unsigned long len) 2614 { 2615 return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len); 2616 } 2617 EXPORT_SYMBOL_GPL(kvm_read_guest_cached); 2618 2619 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 2620 { 2621 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 2622 2623 return kvm_write_guest_page(kvm, gfn, zero_page, offset, len); 2624 } 2625 EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 2626 2627 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 2628 { 2629 gfn_t gfn = gpa >> PAGE_SHIFT; 2630 int seg; 2631 int offset = offset_in_page(gpa); 2632 int ret; 2633 2634 while ((seg = next_segment(len, offset)) != 0) { 2635 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 2636 if (ret < 0) 2637 return ret; 2638 offset = 0; 2639 len -= seg; 2640 ++gfn; 2641 } 2642 return 0; 2643 } 2644 EXPORT_SYMBOL_GPL(kvm_clear_guest); 2645 2646 void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn) 2647 { 2648 if (memslot && memslot->dirty_bitmap) { 2649 unsigned long rel_gfn = gfn - 
memslot->base_gfn; 2650 2651 set_bit_le(rel_gfn, memslot->dirty_bitmap); 2652 } 2653 } 2654 EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot); 2655 2656 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 2657 { 2658 struct kvm_memory_slot *memslot; 2659 2660 memslot = gfn_to_memslot(kvm, gfn); 2661 mark_page_dirty_in_slot(memslot, gfn); 2662 } 2663 EXPORT_SYMBOL_GPL(mark_page_dirty); 2664 2665 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) 2666 { 2667 struct kvm_memory_slot *memslot; 2668 2669 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2670 mark_page_dirty_in_slot(memslot, gfn); 2671 } 2672 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); 2673 2674 void kvm_sigset_activate(struct kvm_vcpu *vcpu) 2675 { 2676 if (!vcpu->sigset_active) 2677 return; 2678 2679 /* 2680 * This does a lockless modification of ->real_blocked, which is fine 2681 * because only current can change ->real_blocked and all readers of 2682 * ->real_blocked don't care as long as ->real_blocked is always a subset 2683 * of ->blocked. 2684 */ 2685 sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked); 2686 } 2687 2688 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu) 2689 { 2690 if (!vcpu->sigset_active) 2691 return; 2692 2693 sigprocmask(SIG_SETMASK, &current->real_blocked, NULL); 2694 sigemptyset(&current->real_blocked); 2695 } 2696 2697 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) 2698 { 2699 unsigned int old, val, grow, grow_start; 2700 2701 old = val = vcpu->halt_poll_ns; 2702 grow_start = READ_ONCE(halt_poll_ns_grow_start); 2703 grow = READ_ONCE(halt_poll_ns_grow); 2704 if (!grow) 2705 goto out; 2706 2707 val *= grow; 2708 if (val < grow_start) 2709 val = grow_start; 2710 2711 if (val > halt_poll_ns) 2712 val = halt_poll_ns; 2713 2714 vcpu->halt_poll_ns = val; 2715 out: 2716 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old); 2717 } 2718 2719 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu) 2720 { 2721 unsigned int old, val, shrink; 2722 2723 old = val = vcpu->halt_poll_ns; 2724 shrink = READ_ONCE(halt_poll_ns_shrink); 2725 if (shrink == 0) 2726 val = 0; 2727 else 2728 val /= shrink; 2729 2730 vcpu->halt_poll_ns = val; 2731 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old); 2732 } 2733 2734 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) 2735 { 2736 int ret = -EINTR; 2737 int idx = srcu_read_lock(&vcpu->kvm->srcu); 2738 2739 if (kvm_arch_vcpu_runnable(vcpu)) { 2740 kvm_make_request(KVM_REQ_UNHALT, vcpu); 2741 goto out; 2742 } 2743 if (kvm_cpu_has_pending_timer(vcpu)) 2744 goto out; 2745 if (signal_pending(current)) 2746 goto out; 2747 2748 ret = 0; 2749 out: 2750 srcu_read_unlock(&vcpu->kvm->srcu, idx); 2751 return ret; 2752 } 2753 2754 static inline void 2755 update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited) 2756 { 2757 if (waited) 2758 vcpu->stat.halt_poll_fail_ns += poll_ns; 2759 else 2760 vcpu->stat.halt_poll_success_ns += poll_ns; 2761 } 2762 2763 /* 2764 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 2765 */ 2766 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 2767 { 2768 ktime_t start, cur, poll_end; 2769 bool waited = false; 2770 u64 block_ns; 2771 2772 kvm_arch_vcpu_blocking(vcpu); 2773 2774 start = cur = poll_end = ktime_get(); 2775 if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) { 2776 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns); 2777 2778 ++vcpu->stat.halt_attempted_poll; 2779 do { 2780 /* 2781 * This sets KVM_REQ_UNHALT if an interrupt 2782 * arrives.
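 * Polling stops as soon as another task becomes
 * runnable on this CPU or the vcpu->halt_poll_ns
 * deadline expires.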
2783 */ 2784 if (kvm_vcpu_check_block(vcpu) < 0) { 2785 ++vcpu->stat.halt_successful_poll; 2786 if (!vcpu_valid_wakeup(vcpu)) 2787 ++vcpu->stat.halt_poll_invalid; 2788 goto out; 2789 } 2790 poll_end = cur = ktime_get(); 2791 } while (single_task_running() && ktime_before(cur, stop)); 2792 } 2793 2794 prepare_to_rcuwait(&vcpu->wait); 2795 for (;;) { 2796 set_current_state(TASK_INTERRUPTIBLE); 2797 2798 if (kvm_vcpu_check_block(vcpu) < 0) 2799 break; 2800 2801 waited = true; 2802 schedule(); 2803 } 2804 finish_rcuwait(&vcpu->wait); 2805 cur = ktime_get(); 2806 out: 2807 kvm_arch_vcpu_unblocking(vcpu); 2808 block_ns = ktime_to_ns(cur) - ktime_to_ns(start); 2809 2810 update_halt_poll_stats( 2811 vcpu, ktime_to_ns(ktime_sub(poll_end, start)), waited); 2812 2813 if (!kvm_arch_no_poll(vcpu)) { 2814 if (!vcpu_valid_wakeup(vcpu)) { 2815 shrink_halt_poll_ns(vcpu); 2816 } else if (vcpu->kvm->max_halt_poll_ns) { 2817 if (block_ns <= vcpu->halt_poll_ns) 2818 ; 2819 /* we had a long block, shrink polling */ 2820 else if (vcpu->halt_poll_ns && 2821 block_ns > vcpu->kvm->max_halt_poll_ns) 2822 shrink_halt_poll_ns(vcpu); 2823 /* we had a short halt and our poll time is too small */ 2824 else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns && 2825 block_ns < vcpu->kvm->max_halt_poll_ns) 2826 grow_halt_poll_ns(vcpu); 2827 } else { 2828 vcpu->halt_poll_ns = 0; 2829 } 2830 } 2831 2832 trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu)); 2833 kvm_arch_vcpu_block_finish(vcpu); 2834 } 2835 EXPORT_SYMBOL_GPL(kvm_vcpu_block); 2836 2837 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) 2838 { 2839 struct rcuwait *waitp; 2840 2841 waitp = kvm_arch_vcpu_get_wait(vcpu); 2842 if (rcuwait_wake_up(waitp)) { 2843 WRITE_ONCE(vcpu->ready, true); 2844 ++vcpu->stat.halt_wakeup; 2845 return true; 2846 } 2847 2848 return false; 2849 } 2850 EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); 2851 2852 #ifndef CONFIG_S390 2853 /* 2854 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. 2855 */ 2856 void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 2857 { 2858 int me; 2859 int cpu = vcpu->cpu; 2860 2861 if (kvm_vcpu_wake_up(vcpu)) 2862 return; 2863 2864 me = get_cpu(); 2865 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 2866 if (kvm_arch_vcpu_should_kick(vcpu)) 2867 smp_send_reschedule(cpu); 2868 put_cpu(); 2869 } 2870 EXPORT_SYMBOL_GPL(kvm_vcpu_kick); 2871 #endif /* !CONFIG_S390 */ 2872 2873 int kvm_vcpu_yield_to(struct kvm_vcpu *target) 2874 { 2875 struct pid *pid; 2876 struct task_struct *task = NULL; 2877 int ret = 0; 2878 2879 rcu_read_lock(); 2880 pid = rcu_dereference(target->pid); 2881 if (pid) 2882 task = get_pid_task(pid, PIDTYPE_PID); 2883 rcu_read_unlock(); 2884 if (!task) 2885 return ret; 2886 ret = yield_to(task, 1); 2887 put_task_struct(task); 2888 2889 return ret; 2890 } 2891 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); 2892 2893 /* 2894 * Helper that checks whether a VCPU is eligible for directed yield. 2895 * Most eligible candidate to yield is decided by following heuristics: 2896 * 2897 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently 2898 * (preempted lock holder), indicated by @in_spin_loop. 2899 * Set at the beginning and cleared at the end of interception/PLE handler. 2900 * 2901 * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get 2902 * chance last time (mostly it has become eligible now since we have probably 2903 * yielded to lockholder in last iteration. 
This is done by toggling 2904 * @dy_eligible each time a VCPU checked for eligibility.) 2905 * 2906 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding 2907 * to preempted lock-holder could result in wrong VCPU selection and CPU 2908 * burning. Giving priority for a potential lock-holder increases lock 2909 * progress. 2910 * 2911 * Since algorithm is based on heuristics, accessing another VCPU data without 2912 * locking does not harm. It may result in trying to yield to same VCPU, fail 2913 * and continue with next VCPU and so on. 2914 */ 2915 static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) 2916 { 2917 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT 2918 bool eligible; 2919 2920 eligible = !vcpu->spin_loop.in_spin_loop || 2921 vcpu->spin_loop.dy_eligible; 2922 2923 if (vcpu->spin_loop.in_spin_loop) 2924 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); 2925 2926 return eligible; 2927 #else 2928 return true; 2929 #endif 2930 } 2931 2932 /* 2933 * Unlike kvm_arch_vcpu_runnable, this function is called outside 2934 * a vcpu_load/vcpu_put pair. However, for most architectures 2935 * kvm_arch_vcpu_runnable does not require vcpu_load. 2936 */ 2937 bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) 2938 { 2939 return kvm_arch_vcpu_runnable(vcpu); 2940 } 2941 2942 static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu) 2943 { 2944 if (kvm_arch_dy_runnable(vcpu)) 2945 return true; 2946 2947 #ifdef CONFIG_KVM_ASYNC_PF 2948 if (!list_empty_careful(&vcpu->async_pf.done)) 2949 return true; 2950 #endif 2951 2952 return false; 2953 } 2954 2955 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) 2956 { 2957 struct kvm *kvm = me->kvm; 2958 struct kvm_vcpu *vcpu; 2959 int last_boosted_vcpu = me->kvm->last_boosted_vcpu; 2960 int yielded = 0; 2961 int try = 3; 2962 int pass; 2963 int i; 2964 2965 kvm_vcpu_set_in_spin_loop(me, true); 2966 /* 2967 * We boost the priority of a VCPU that is runnable but not 2968 * currently running, because it got preempted by something 2969 * else and called schedule in __vcpu_run. Hopefully that 2970 * VCPU is holding the lock that we need and will release it. 2971 * We approximate round-robin by starting at the last boosted VCPU. 
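 * The first pass only looks at VCPUs after the last boosted one; the second
 * pass wraps around so that every other VCPU is eventually considered. At
 * most three failed yield attempts abort the search.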
2972 */ 2973 for (pass = 0; pass < 2 && !yielded && try; pass++) { 2974 kvm_for_each_vcpu(i, vcpu, kvm) { 2975 if (!pass && i <= last_boosted_vcpu) { 2976 i = last_boosted_vcpu; 2977 continue; 2978 } else if (pass && i > last_boosted_vcpu) 2979 break; 2980 if (!READ_ONCE(vcpu->ready)) 2981 continue; 2982 if (vcpu == me) 2983 continue; 2984 if (rcuwait_active(&vcpu->wait) && 2985 !vcpu_dy_runnable(vcpu)) 2986 continue; 2987 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && 2988 !kvm_arch_vcpu_in_kernel(vcpu)) 2989 continue; 2990 if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) 2991 continue; 2992 2993 yielded = kvm_vcpu_yield_to(vcpu); 2994 if (yielded > 0) { 2995 kvm->last_boosted_vcpu = i; 2996 break; 2997 } else if (yielded < 0) { 2998 try--; 2999 if (!try) 3000 break; 3001 } 3002 } 3003 } 3004 kvm_vcpu_set_in_spin_loop(me, false); 3005 3006 /* Ensure vcpu is not eligible during next spinloop */ 3007 kvm_vcpu_set_dy_eligible(me, false); 3008 } 3009 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); 3010 3011 static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf) 3012 { 3013 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data; 3014 struct page *page; 3015 3016 if (vmf->pgoff == 0) 3017 page = virt_to_page(vcpu->run); 3018 #ifdef CONFIG_X86 3019 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 3020 page = virt_to_page(vcpu->arch.pio_data); 3021 #endif 3022 #ifdef CONFIG_KVM_MMIO 3023 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 3024 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 3025 #endif 3026 else 3027 return kvm_arch_vcpu_fault(vcpu, vmf); 3028 get_page(page); 3029 vmf->page = page; 3030 return 0; 3031 } 3032 3033 static const struct vm_operations_struct kvm_vcpu_vm_ops = { 3034 .fault = kvm_vcpu_fault, 3035 }; 3036 3037 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 3038 { 3039 vma->vm_ops = &kvm_vcpu_vm_ops; 3040 return 0; 3041 } 3042 3043 static int kvm_vcpu_release(struct inode *inode, struct file *filp) 3044 { 3045 struct kvm_vcpu *vcpu = filp->private_data; 3046 3047 kvm_put_kvm(vcpu->kvm); 3048 return 0; 3049 } 3050 3051 static struct file_operations kvm_vcpu_fops = { 3052 .release = kvm_vcpu_release, 3053 .unlocked_ioctl = kvm_vcpu_ioctl, 3054 .mmap = kvm_vcpu_mmap, 3055 .llseek = noop_llseek, 3056 KVM_COMPAT(kvm_vcpu_compat_ioctl), 3057 }; 3058 3059 /* 3060 * Allocates an inode for the vcpu. 3061 */ 3062 static int create_vcpu_fd(struct kvm_vcpu *vcpu) 3063 { 3064 char name[8 + 1 + ITOA_MAX_LEN + 1]; 3065 3066 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id); 3067 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); 3068 } 3069 3070 static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) 3071 { 3072 #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS 3073 struct dentry *debugfs_dentry; 3074 char dir_name[ITOA_MAX_LEN * 2]; 3075 3076 if (!debugfs_initialized()) 3077 return; 3078 3079 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id); 3080 debugfs_dentry = debugfs_create_dir(dir_name, 3081 vcpu->kvm->debugfs_dentry); 3082 3083 kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry); 3084 #endif 3085 } 3086 3087 /* 3088 * Creates some virtual cpus. Good luck creating more than one. 
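 * The vcpu id must be below KVM_MAX_VCPU_ID, at most KVM_MAX_VCPUS vcpus can
 * be created per VM, and a duplicate id is rejected with -EEXIST.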
3089 */ 3090 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) 3091 { 3092 int r; 3093 struct kvm_vcpu *vcpu; 3094 struct page *page; 3095 3096 if (id >= KVM_MAX_VCPU_ID) 3097 return -EINVAL; 3098 3099 mutex_lock(&kvm->lock); 3100 if (kvm->created_vcpus == KVM_MAX_VCPUS) { 3101 mutex_unlock(&kvm->lock); 3102 return -EINVAL; 3103 } 3104 3105 kvm->created_vcpus++; 3106 mutex_unlock(&kvm->lock); 3107 3108 r = kvm_arch_vcpu_precreate(kvm, id); 3109 if (r) 3110 goto vcpu_decrement; 3111 3112 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 3113 if (!vcpu) { 3114 r = -ENOMEM; 3115 goto vcpu_decrement; 3116 } 3117 3118 BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE); 3119 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 3120 if (!page) { 3121 r = -ENOMEM; 3122 goto vcpu_free; 3123 } 3124 vcpu->run = page_address(page); 3125 3126 kvm_vcpu_init(vcpu, kvm, id); 3127 3128 r = kvm_arch_vcpu_create(vcpu); 3129 if (r) 3130 goto vcpu_free_run_page; 3131 3132 mutex_lock(&kvm->lock); 3133 if (kvm_get_vcpu_by_id(kvm, id)) { 3134 r = -EEXIST; 3135 goto unlock_vcpu_destroy; 3136 } 3137 3138 vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus); 3139 BUG_ON(kvm->vcpus[vcpu->vcpu_idx]); 3140 3141 /* Now it's all set up, let userspace reach it */ 3142 kvm_get_kvm(kvm); 3143 r = create_vcpu_fd(vcpu); 3144 if (r < 0) { 3145 kvm_put_kvm_no_destroy(kvm); 3146 goto unlock_vcpu_destroy; 3147 } 3148 3149 kvm->vcpus[vcpu->vcpu_idx] = vcpu; 3150 3151 /* 3152 * Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus 3153 * before kvm->online_vcpu's incremented value. 3154 */ 3155 smp_wmb(); 3156 atomic_inc(&kvm->online_vcpus); 3157 3158 mutex_unlock(&kvm->lock); 3159 kvm_arch_vcpu_postcreate(vcpu); 3160 kvm_create_vcpu_debugfs(vcpu); 3161 return r; 3162 3163 unlock_vcpu_destroy: 3164 mutex_unlock(&kvm->lock); 3165 kvm_arch_vcpu_destroy(vcpu); 3166 vcpu_free_run_page: 3167 free_page((unsigned long)vcpu->run); 3168 vcpu_free: 3169 kmem_cache_free(kvm_vcpu_cache, vcpu); 3170 vcpu_decrement: 3171 mutex_lock(&kvm->lock); 3172 kvm->created_vcpus--; 3173 mutex_unlock(&kvm->lock); 3174 return r; 3175 } 3176 3177 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 3178 { 3179 if (sigset) { 3180 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 3181 vcpu->sigset_active = 1; 3182 vcpu->sigset = *sigset; 3183 } else 3184 vcpu->sigset_active = 0; 3185 return 0; 3186 } 3187 3188 static long kvm_vcpu_ioctl(struct file *filp, 3189 unsigned int ioctl, unsigned long arg) 3190 { 3191 struct kvm_vcpu *vcpu = filp->private_data; 3192 void __user *argp = (void __user *)arg; 3193 int r; 3194 struct kvm_fpu *fpu = NULL; 3195 struct kvm_sregs *kvm_sregs = NULL; 3196 3197 if (vcpu->kvm->mm != current->mm) 3198 return -EIO; 3199 3200 if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) 3201 return -EINVAL; 3202 3203 /* 3204 * Some architectures have vcpu ioctls that are asynchronous to vcpu 3205 * execution; mutex_lock() would break them. 3206 */ 3207 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg); 3208 if (r != -ENOIOCTLCMD) 3209 return r; 3210 3211 if (mutex_lock_killable(&vcpu->mutex)) 3212 return -EINTR; 3213 switch (ioctl) { 3214 case KVM_RUN: { 3215 struct pid *oldpid; 3216 r = -EINVAL; 3217 if (arg) 3218 goto out; 3219 oldpid = rcu_access_pointer(vcpu->pid); 3220 if (unlikely(oldpid != task_pid(current))) { 3221 /* The thread running this VCPU changed. 
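 * vcpu->pid is updated to the current task below; the old pid is only put
 * after synchronize_rcu(), because readers such as kvm_vcpu_yield_to()
 * dereference it under rcu_read_lock().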
*/ 3222 struct pid *newpid; 3223 3224 r = kvm_arch_vcpu_run_pid_change(vcpu); 3225 if (r) 3226 break; 3227 3228 newpid = get_task_pid(current, PIDTYPE_PID); 3229 rcu_assign_pointer(vcpu->pid, newpid); 3230 if (oldpid) 3231 synchronize_rcu(); 3232 put_pid(oldpid); 3233 } 3234 r = kvm_arch_vcpu_ioctl_run(vcpu); 3235 trace_kvm_userspace_exit(vcpu->run->exit_reason, r); 3236 break; 3237 } 3238 case KVM_GET_REGS: { 3239 struct kvm_regs *kvm_regs; 3240 3241 r = -ENOMEM; 3242 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT); 3243 if (!kvm_regs) 3244 goto out; 3245 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 3246 if (r) 3247 goto out_free1; 3248 r = -EFAULT; 3249 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 3250 goto out_free1; 3251 r = 0; 3252 out_free1: 3253 kfree(kvm_regs); 3254 break; 3255 } 3256 case KVM_SET_REGS: { 3257 struct kvm_regs *kvm_regs; 3258 3259 kvm_regs = memdup_user(argp, sizeof(*kvm_regs)); 3260 if (IS_ERR(kvm_regs)) { 3261 r = PTR_ERR(kvm_regs); 3262 goto out; 3263 } 3264 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 3265 kfree(kvm_regs); 3266 break; 3267 } 3268 case KVM_GET_SREGS: { 3269 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), 3270 GFP_KERNEL_ACCOUNT); 3271 r = -ENOMEM; 3272 if (!kvm_sregs) 3273 goto out; 3274 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 3275 if (r) 3276 goto out; 3277 r = -EFAULT; 3278 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 3279 goto out; 3280 r = 0; 3281 break; 3282 } 3283 case KVM_SET_SREGS: { 3284 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); 3285 if (IS_ERR(kvm_sregs)) { 3286 r = PTR_ERR(kvm_sregs); 3287 kvm_sregs = NULL; 3288 goto out; 3289 } 3290 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 3291 break; 3292 } 3293 case KVM_GET_MP_STATE: { 3294 struct kvm_mp_state mp_state; 3295 3296 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 3297 if (r) 3298 goto out; 3299 r = -EFAULT; 3300 if (copy_to_user(argp, &mp_state, sizeof(mp_state))) 3301 goto out; 3302 r = 0; 3303 break; 3304 } 3305 case KVM_SET_MP_STATE: { 3306 struct kvm_mp_state mp_state; 3307 3308 r = -EFAULT; 3309 if (copy_from_user(&mp_state, argp, sizeof(mp_state))) 3310 goto out; 3311 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 3312 break; 3313 } 3314 case KVM_TRANSLATE: { 3315 struct kvm_translation tr; 3316 3317 r = -EFAULT; 3318 if (copy_from_user(&tr, argp, sizeof(tr))) 3319 goto out; 3320 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 3321 if (r) 3322 goto out; 3323 r = -EFAULT; 3324 if (copy_to_user(argp, &tr, sizeof(tr))) 3325 goto out; 3326 r = 0; 3327 break; 3328 } 3329 case KVM_SET_GUEST_DEBUG: { 3330 struct kvm_guest_debug dbg; 3331 3332 r = -EFAULT; 3333 if (copy_from_user(&dbg, argp, sizeof(dbg))) 3334 goto out; 3335 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 3336 break; 3337 } 3338 case KVM_SET_SIGNAL_MASK: { 3339 struct kvm_signal_mask __user *sigmask_arg = argp; 3340 struct kvm_signal_mask kvm_sigmask; 3341 sigset_t sigset, *p; 3342 3343 p = NULL; 3344 if (argp) { 3345 r = -EFAULT; 3346 if (copy_from_user(&kvm_sigmask, argp, 3347 sizeof(kvm_sigmask))) 3348 goto out; 3349 r = -EINVAL; 3350 if (kvm_sigmask.len != sizeof(sigset)) 3351 goto out; 3352 r = -EFAULT; 3353 if (copy_from_user(&sigset, sigmask_arg->sigset, 3354 sizeof(sigset))) 3355 goto out; 3356 p = &sigset; 3357 } 3358 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); 3359 break; 3360 } 3361 case KVM_GET_FPU: { 3362 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT); 3363 r = -ENOMEM; 3364 if (!fpu) 3365 goto out; 
3366 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); 3367 if (r) 3368 goto out; 3369 r = -EFAULT; 3370 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) 3371 goto out; 3372 r = 0; 3373 break; 3374 } 3375 case KVM_SET_FPU: { 3376 fpu = memdup_user(argp, sizeof(*fpu)); 3377 if (IS_ERR(fpu)) { 3378 r = PTR_ERR(fpu); 3379 fpu = NULL; 3380 goto out; 3381 } 3382 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 3383 break; 3384 } 3385 default: 3386 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 3387 } 3388 out: 3389 mutex_unlock(&vcpu->mutex); 3390 kfree(fpu); 3391 kfree(kvm_sregs); 3392 return r; 3393 } 3394 3395 #ifdef CONFIG_KVM_COMPAT 3396 static long kvm_vcpu_compat_ioctl(struct file *filp, 3397 unsigned int ioctl, unsigned long arg) 3398 { 3399 struct kvm_vcpu *vcpu = filp->private_data; 3400 void __user *argp = compat_ptr(arg); 3401 int r; 3402 3403 if (vcpu->kvm->mm != current->mm) 3404 return -EIO; 3405 3406 switch (ioctl) { 3407 case KVM_SET_SIGNAL_MASK: { 3408 struct kvm_signal_mask __user *sigmask_arg = argp; 3409 struct kvm_signal_mask kvm_sigmask; 3410 sigset_t sigset; 3411 3412 if (argp) { 3413 r = -EFAULT; 3414 if (copy_from_user(&kvm_sigmask, argp, 3415 sizeof(kvm_sigmask))) 3416 goto out; 3417 r = -EINVAL; 3418 if (kvm_sigmask.len != sizeof(compat_sigset_t)) 3419 goto out; 3420 r = -EFAULT; 3421 if (get_compat_sigset(&sigset, 3422 (compat_sigset_t __user *)sigmask_arg->sigset)) 3423 goto out; 3424 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 3425 } else 3426 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL); 3427 break; 3428 } 3429 default: 3430 r = kvm_vcpu_ioctl(filp, ioctl, arg); 3431 } 3432 3433 out: 3434 return r; 3435 } 3436 #endif 3437 3438 static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma) 3439 { 3440 struct kvm_device *dev = filp->private_data; 3441 3442 if (dev->ops->mmap) 3443 return dev->ops->mmap(dev, vma); 3444 3445 return -ENODEV; 3446 } 3447 3448 static int kvm_device_ioctl_attr(struct kvm_device *dev, 3449 int (*accessor)(struct kvm_device *dev, 3450 struct kvm_device_attr *attr), 3451 unsigned long arg) 3452 { 3453 struct kvm_device_attr attr; 3454 3455 if (!accessor) 3456 return -EPERM; 3457 3458 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) 3459 return -EFAULT; 3460 3461 return accessor(dev, &attr); 3462 } 3463 3464 static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, 3465 unsigned long arg) 3466 { 3467 struct kvm_device *dev = filp->private_data; 3468 3469 if (dev->kvm->mm != current->mm) 3470 return -EIO; 3471 3472 switch (ioctl) { 3473 case KVM_SET_DEVICE_ATTR: 3474 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg); 3475 case KVM_GET_DEVICE_ATTR: 3476 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg); 3477 case KVM_HAS_DEVICE_ATTR: 3478 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg); 3479 default: 3480 if (dev->ops->ioctl) 3481 return dev->ops->ioctl(dev, ioctl, arg); 3482 3483 return -ENOTTY; 3484 } 3485 } 3486 3487 static int kvm_device_release(struct inode *inode, struct file *filp) 3488 { 3489 struct kvm_device *dev = filp->private_data; 3490 struct kvm *kvm = dev->kvm; 3491 3492 if (dev->ops->release) { 3493 mutex_lock(&kvm->lock); 3494 list_del(&dev->vm_node); 3495 dev->ops->release(dev); 3496 mutex_unlock(&kvm->lock); 3497 } 3498 3499 kvm_put_kvm(kvm); 3500 return 0; 3501 } 3502 3503 static const struct file_operations kvm_device_fops = { 3504 .unlocked_ioctl = kvm_device_ioctl, 3505 .release = kvm_device_release, 3506 KVM_COMPAT(kvm_device_ioctl), 3507 .mmap = kvm_device_mmap, 3508 
}; 3509 3510 struct kvm_device *kvm_device_from_filp(struct file *filp) 3511 { 3512 if (filp->f_op != &kvm_device_fops) 3513 return NULL; 3514 3515 return filp->private_data; 3516 } 3517 3518 static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { 3519 #ifdef CONFIG_KVM_MPIC 3520 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, 3521 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, 3522 #endif 3523 }; 3524 3525 int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type) 3526 { 3527 if (type >= ARRAY_SIZE(kvm_device_ops_table)) 3528 return -ENOSPC; 3529 3530 if (kvm_device_ops_table[type] != NULL) 3531 return -EEXIST; 3532 3533 kvm_device_ops_table[type] = ops; 3534 return 0; 3535 } 3536 3537 void kvm_unregister_device_ops(u32 type) 3538 { 3539 if (kvm_device_ops_table[type] != NULL) 3540 kvm_device_ops_table[type] = NULL; 3541 } 3542 3543 static int kvm_ioctl_create_device(struct kvm *kvm, 3544 struct kvm_create_device *cd) 3545 { 3546 const struct kvm_device_ops *ops = NULL; 3547 struct kvm_device *dev; 3548 bool test = cd->flags & KVM_CREATE_DEVICE_TEST; 3549 int type; 3550 int ret; 3551 3552 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) 3553 return -ENODEV; 3554 3555 type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table)); 3556 ops = kvm_device_ops_table[type]; 3557 if (ops == NULL) 3558 return -ENODEV; 3559 3560 if (test) 3561 return 0; 3562 3563 dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT); 3564 if (!dev) 3565 return -ENOMEM; 3566 3567 dev->ops = ops; 3568 dev->kvm = kvm; 3569 3570 mutex_lock(&kvm->lock); 3571 ret = ops->create(dev, type); 3572 if (ret < 0) { 3573 mutex_unlock(&kvm->lock); 3574 kfree(dev); 3575 return ret; 3576 } 3577 list_add(&dev->vm_node, &kvm->devices); 3578 mutex_unlock(&kvm->lock); 3579 3580 if (ops->init) 3581 ops->init(dev); 3582 3583 kvm_get_kvm(kvm); 3584 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); 3585 if (ret < 0) { 3586 kvm_put_kvm_no_destroy(kvm); 3587 mutex_lock(&kvm->lock); 3588 list_del(&dev->vm_node); 3589 mutex_unlock(&kvm->lock); 3590 ops->destroy(dev); 3591 return ret; 3592 } 3593 3594 cd->fd = ret; 3595 return 0; 3596 } 3597 3598 static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) 3599 { 3600 switch (arg) { 3601 case KVM_CAP_USER_MEMORY: 3602 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 3603 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 3604 case KVM_CAP_INTERNAL_ERROR_DATA: 3605 #ifdef CONFIG_HAVE_KVM_MSI 3606 case KVM_CAP_SIGNAL_MSI: 3607 #endif 3608 #ifdef CONFIG_HAVE_KVM_IRQFD 3609 case KVM_CAP_IRQFD: 3610 case KVM_CAP_IRQFD_RESAMPLE: 3611 #endif 3612 case KVM_CAP_IOEVENTFD_ANY_LENGTH: 3613 case KVM_CAP_CHECK_EXTENSION_VM: 3614 case KVM_CAP_ENABLE_CAP_VM: 3615 case KVM_CAP_HALT_POLL: 3616 return 1; 3617 #ifdef CONFIG_KVM_MMIO 3618 case KVM_CAP_COALESCED_MMIO: 3619 return KVM_COALESCED_MMIO_PAGE_OFFSET; 3620 case KVM_CAP_COALESCED_PIO: 3621 return 1; 3622 #endif 3623 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 3624 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: 3625 return KVM_DIRTY_LOG_MANUAL_CAPS; 3626 #endif 3627 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 3628 case KVM_CAP_IRQ_ROUTING: 3629 return KVM_MAX_IRQ_ROUTES; 3630 #endif 3631 #if KVM_ADDRESS_SPACE_NUM > 1 3632 case KVM_CAP_MULTI_ADDRESS_SPACE: 3633 return KVM_ADDRESS_SPACE_NUM; 3634 #endif 3635 case KVM_CAP_NR_MEMSLOTS: 3636 return KVM_USER_MEM_SLOTS; 3637 default: 3638 break; 3639 } 3640 return kvm_vm_ioctl_check_extension(kvm, arg); 3641 } 3642 3643 int __attribute__((weak)) 
kvm_vm_ioctl_enable_cap(struct kvm *kvm, 3644 struct kvm_enable_cap *cap) 3645 { 3646 return -EINVAL; 3647 } 3648 3649 static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, 3650 struct kvm_enable_cap *cap) 3651 { 3652 switch (cap->cap) { 3653 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 3654 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: { 3655 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE; 3656 3657 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE) 3658 allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS; 3659 3660 if (cap->flags || (cap->args[0] & ~allowed_options)) 3661 return -EINVAL; 3662 kvm->manual_dirty_log_protect = cap->args[0]; 3663 return 0; 3664 } 3665 #endif 3666 case KVM_CAP_HALT_POLL: { 3667 if (cap->flags || cap->args[0] != (unsigned int)cap->args[0]) 3668 return -EINVAL; 3669 3670 kvm->max_halt_poll_ns = cap->args[0]; 3671 return 0; 3672 } 3673 default: 3674 return kvm_vm_ioctl_enable_cap(kvm, cap); 3675 } 3676 } 3677 3678 static long kvm_vm_ioctl(struct file *filp, 3679 unsigned int ioctl, unsigned long arg) 3680 { 3681 struct kvm *kvm = filp->private_data; 3682 void __user *argp = (void __user *)arg; 3683 int r; 3684 3685 if (kvm->mm != current->mm) 3686 return -EIO; 3687 switch (ioctl) { 3688 case KVM_CREATE_VCPU: 3689 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 3690 break; 3691 case KVM_ENABLE_CAP: { 3692 struct kvm_enable_cap cap; 3693 3694 r = -EFAULT; 3695 if (copy_from_user(&cap, argp, sizeof(cap))) 3696 goto out; 3697 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap); 3698 break; 3699 } 3700 case KVM_SET_USER_MEMORY_REGION: { 3701 struct kvm_userspace_memory_region kvm_userspace_mem; 3702 3703 r = -EFAULT; 3704 if (copy_from_user(&kvm_userspace_mem, argp, 3705 sizeof(kvm_userspace_mem))) 3706 goto out; 3707 3708 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); 3709 break; 3710 } 3711 case KVM_GET_DIRTY_LOG: { 3712 struct kvm_dirty_log log; 3713 3714 r = -EFAULT; 3715 if (copy_from_user(&log, argp, sizeof(log))) 3716 goto out; 3717 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 3718 break; 3719 } 3720 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 3721 case KVM_CLEAR_DIRTY_LOG: { 3722 struct kvm_clear_dirty_log log; 3723 3724 r = -EFAULT; 3725 if (copy_from_user(&log, argp, sizeof(log))) 3726 goto out; 3727 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log); 3728 break; 3729 } 3730 #endif 3731 #ifdef CONFIG_KVM_MMIO 3732 case KVM_REGISTER_COALESCED_MMIO: { 3733 struct kvm_coalesced_mmio_zone zone; 3734 3735 r = -EFAULT; 3736 if (copy_from_user(&zone, argp, sizeof(zone))) 3737 goto out; 3738 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 3739 break; 3740 } 3741 case KVM_UNREGISTER_COALESCED_MMIO: { 3742 struct kvm_coalesced_mmio_zone zone; 3743 3744 r = -EFAULT; 3745 if (copy_from_user(&zone, argp, sizeof(zone))) 3746 goto out; 3747 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 3748 break; 3749 } 3750 #endif 3751 case KVM_IRQFD: { 3752 struct kvm_irqfd data; 3753 3754 r = -EFAULT; 3755 if (copy_from_user(&data, argp, sizeof(data))) 3756 goto out; 3757 r = kvm_irqfd(kvm, &data); 3758 break; 3759 } 3760 case KVM_IOEVENTFD: { 3761 struct kvm_ioeventfd data; 3762 3763 r = -EFAULT; 3764 if (copy_from_user(&data, argp, sizeof(data))) 3765 goto out; 3766 r = kvm_ioeventfd(kvm, &data); 3767 break; 3768 } 3769 #ifdef CONFIG_HAVE_KVM_MSI 3770 case KVM_SIGNAL_MSI: { 3771 struct kvm_msi msi; 3772 3773 r = -EFAULT; 3774 if (copy_from_user(&msi, argp, sizeof(msi))) 3775 goto out; 3776 r = kvm_send_userspace_msi(kvm, &msi); 3777 break; 3778 } 3779 #endif 
3780 #ifdef __KVM_HAVE_IRQ_LINE 3781 case KVM_IRQ_LINE_STATUS: 3782 case KVM_IRQ_LINE: { 3783 struct kvm_irq_level irq_event; 3784 3785 r = -EFAULT; 3786 if (copy_from_user(&irq_event, argp, sizeof(irq_event))) 3787 goto out; 3788 3789 r = kvm_vm_ioctl_irq_line(kvm, &irq_event, 3790 ioctl == KVM_IRQ_LINE_STATUS); 3791 if (r) 3792 goto out; 3793 3794 r = -EFAULT; 3795 if (ioctl == KVM_IRQ_LINE_STATUS) { 3796 if (copy_to_user(argp, &irq_event, sizeof(irq_event))) 3797 goto out; 3798 } 3799 3800 r = 0; 3801 break; 3802 } 3803 #endif 3804 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 3805 case KVM_SET_GSI_ROUTING: { 3806 struct kvm_irq_routing routing; 3807 struct kvm_irq_routing __user *urouting; 3808 struct kvm_irq_routing_entry *entries = NULL; 3809 3810 r = -EFAULT; 3811 if (copy_from_user(&routing, argp, sizeof(routing))) 3812 goto out; 3813 r = -EINVAL; 3814 if (!kvm_arch_can_set_irq_routing(kvm)) 3815 goto out; 3816 if (routing.nr > KVM_MAX_IRQ_ROUTES) 3817 goto out; 3818 if (routing.flags) 3819 goto out; 3820 if (routing.nr) { 3821 urouting = argp; 3822 entries = vmemdup_user(urouting->entries, 3823 array_size(sizeof(*entries), 3824 routing.nr)); 3825 if (IS_ERR(entries)) { 3826 r = PTR_ERR(entries); 3827 goto out; 3828 } 3829 } 3830 r = kvm_set_irq_routing(kvm, entries, routing.nr, 3831 routing.flags); 3832 kvfree(entries); 3833 break; 3834 } 3835 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ 3836 case KVM_CREATE_DEVICE: { 3837 struct kvm_create_device cd; 3838 3839 r = -EFAULT; 3840 if (copy_from_user(&cd, argp, sizeof(cd))) 3841 goto out; 3842 3843 r = kvm_ioctl_create_device(kvm, &cd); 3844 if (r) 3845 goto out; 3846 3847 r = -EFAULT; 3848 if (copy_to_user(argp, &cd, sizeof(cd))) 3849 goto out; 3850 3851 r = 0; 3852 break; 3853 } 3854 case KVM_CHECK_EXTENSION: 3855 r = kvm_vm_ioctl_check_extension_generic(kvm, arg); 3856 break; 3857 default: 3858 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 3859 } 3860 out: 3861 return r; 3862 } 3863 3864 #ifdef CONFIG_KVM_COMPAT 3865 struct compat_kvm_dirty_log { 3866 __u32 slot; 3867 __u32 padding1; 3868 union { 3869 compat_uptr_t dirty_bitmap; /* one bit per page */ 3870 __u64 padding2; 3871 }; 3872 }; 3873 3874 static long kvm_vm_compat_ioctl(struct file *filp, 3875 unsigned int ioctl, unsigned long arg) 3876 { 3877 struct kvm *kvm = filp->private_data; 3878 int r; 3879 3880 if (kvm->mm != current->mm) 3881 return -EIO; 3882 switch (ioctl) { 3883 case KVM_GET_DIRTY_LOG: { 3884 struct compat_kvm_dirty_log compat_log; 3885 struct kvm_dirty_log log; 3886 3887 if (copy_from_user(&compat_log, (void __user *)arg, 3888 sizeof(compat_log))) 3889 return -EFAULT; 3890 log.slot = compat_log.slot; 3891 log.padding1 = compat_log.padding1; 3892 log.padding2 = compat_log.padding2; 3893 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); 3894 3895 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 3896 break; 3897 } 3898 default: 3899 r = kvm_vm_ioctl(filp, ioctl, arg); 3900 } 3901 return r; 3902 } 3903 #endif 3904 3905 static struct file_operations kvm_vm_fops = { 3906 .release = kvm_vm_release, 3907 .unlocked_ioctl = kvm_vm_ioctl, 3908 .llseek = noop_llseek, 3909 KVM_COMPAT(kvm_vm_compat_ioctl), 3910 }; 3911 3912 static int kvm_dev_ioctl_create_vm(unsigned long type) 3913 { 3914 int r; 3915 struct kvm *kvm; 3916 struct file *file; 3917 3918 kvm = kvm_create_vm(type); 3919 if (IS_ERR(kvm)) 3920 return PTR_ERR(kvm); 3921 #ifdef CONFIG_KVM_MMIO 3922 r = kvm_coalesced_mmio_init(kvm); 3923 if (r < 0) 3924 goto put_kvm; 3925 #endif 3926 r = get_unused_fd_flags(O_CLOEXEC); 3927 if (r < 0) 
3928 goto put_kvm; 3929 3930 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); 3931 if (IS_ERR(file)) { 3932 put_unused_fd(r); 3933 r = PTR_ERR(file); 3934 goto put_kvm; 3935 } 3936 3937 /* 3938 * Don't call kvm_put_kvm anymore at this point; file->f_op is 3939 * already set, with ->release() being kvm_vm_release(). In error 3940 * cases it will be called by the final fput(file) and will take 3941 * care of doing kvm_put_kvm(kvm). 3942 */ 3943 if (kvm_create_vm_debugfs(kvm, r) < 0) { 3944 put_unused_fd(r); 3945 fput(file); 3946 return -ENOMEM; 3947 } 3948 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm); 3949 3950 fd_install(r, file); 3951 return r; 3952 3953 put_kvm: 3954 kvm_put_kvm(kvm); 3955 return r; 3956 } 3957 3958 static long kvm_dev_ioctl(struct file *filp, 3959 unsigned int ioctl, unsigned long arg) 3960 { 3961 long r = -EINVAL; 3962 3963 switch (ioctl) { 3964 case KVM_GET_API_VERSION: 3965 if (arg) 3966 goto out; 3967 r = KVM_API_VERSION; 3968 break; 3969 case KVM_CREATE_VM: 3970 r = kvm_dev_ioctl_create_vm(arg); 3971 break; 3972 case KVM_CHECK_EXTENSION: 3973 r = kvm_vm_ioctl_check_extension_generic(NULL, arg); 3974 break; 3975 case KVM_GET_VCPU_MMAP_SIZE: 3976 if (arg) 3977 goto out; 3978 r = PAGE_SIZE; /* struct kvm_run */ 3979 #ifdef CONFIG_X86 3980 r += PAGE_SIZE; /* pio data page */ 3981 #endif 3982 #ifdef CONFIG_KVM_MMIO 3983 r += PAGE_SIZE; /* coalesced mmio ring page */ 3984 #endif 3985 break; 3986 case KVM_TRACE_ENABLE: 3987 case KVM_TRACE_PAUSE: 3988 case KVM_TRACE_DISABLE: 3989 r = -EOPNOTSUPP; 3990 break; 3991 default: 3992 return kvm_arch_dev_ioctl(filp, ioctl, arg); 3993 } 3994 out: 3995 return r; 3996 } 3997 3998 static struct file_operations kvm_chardev_ops = { 3999 .unlocked_ioctl = kvm_dev_ioctl, 4000 .llseek = noop_llseek, 4001 KVM_COMPAT(kvm_dev_ioctl), 4002 }; 4003 4004 static struct miscdevice kvm_dev = { 4005 KVM_MINOR, 4006 "kvm", 4007 &kvm_chardev_ops, 4008 }; 4009 4010 static void hardware_enable_nolock(void *junk) 4011 { 4012 int cpu = raw_smp_processor_id(); 4013 int r; 4014 4015 if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) 4016 return; 4017 4018 cpumask_set_cpu(cpu, cpus_hardware_enabled); 4019 4020 r = kvm_arch_hardware_enable(); 4021 4022 if (r) { 4023 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 4024 atomic_inc(&hardware_enable_failed); 4025 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu); 4026 } 4027 } 4028 4029 static int kvm_starting_cpu(unsigned int cpu) 4030 { 4031 raw_spin_lock(&kvm_count_lock); 4032 if (kvm_usage_count) 4033 hardware_enable_nolock(NULL); 4034 raw_spin_unlock(&kvm_count_lock); 4035 return 0; 4036 } 4037 4038 static void hardware_disable_nolock(void *junk) 4039 { 4040 int cpu = raw_smp_processor_id(); 4041 4042 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) 4043 return; 4044 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 4045 kvm_arch_hardware_disable(); 4046 } 4047 4048 static int kvm_dying_cpu(unsigned int cpu) 4049 { 4050 raw_spin_lock(&kvm_count_lock); 4051 if (kvm_usage_count) 4052 hardware_disable_nolock(NULL); 4053 raw_spin_unlock(&kvm_count_lock); 4054 return 0; 4055 } 4056 4057 static void hardware_disable_all_nolock(void) 4058 { 4059 BUG_ON(!kvm_usage_count); 4060 4061 kvm_usage_count--; 4062 if (!kvm_usage_count) 4063 on_each_cpu(hardware_disable_nolock, NULL, 1); 4064 } 4065 4066 static void hardware_disable_all(void) 4067 { 4068 raw_spin_lock(&kvm_count_lock); 4069 hardware_disable_all_nolock(); 4070 raw_spin_unlock(&kvm_count_lock); 4071 } 4072 4073 static int 
hardware_enable_all(void) 4074 { 4075 int r = 0; 4076 4077 raw_spin_lock(&kvm_count_lock); 4078 4079 kvm_usage_count++; 4080 if (kvm_usage_count == 1) { 4081 atomic_set(&hardware_enable_failed, 0); 4082 on_each_cpu(hardware_enable_nolock, NULL, 1); 4083 4084 if (atomic_read(&hardware_enable_failed)) { 4085 hardware_disable_all_nolock(); 4086 r = -EBUSY; 4087 } 4088 } 4089 4090 raw_spin_unlock(&kvm_count_lock); 4091 4092 return r; 4093 } 4094 4095 static int kvm_reboot(struct notifier_block *notifier, unsigned long val, 4096 void *v) 4097 { 4098 /* 4099 * Some (well, at least mine) BIOSes hang on reboot if 4100 * in vmx root mode. 4101 * 4102 * And Intel TXT required VMX off for all cpu when system shutdown. 4103 */ 4104 pr_info("kvm: exiting hardware virtualization\n"); 4105 kvm_rebooting = true; 4106 on_each_cpu(hardware_disable_nolock, NULL, 1); 4107 return NOTIFY_OK; 4108 } 4109 4110 static struct notifier_block kvm_reboot_notifier = { 4111 .notifier_call = kvm_reboot, 4112 .priority = 0, 4113 }; 4114 4115 static void kvm_io_bus_destroy(struct kvm_io_bus *bus) 4116 { 4117 int i; 4118 4119 for (i = 0; i < bus->dev_count; i++) { 4120 struct kvm_io_device *pos = bus->range[i].dev; 4121 4122 kvm_iodevice_destructor(pos); 4123 } 4124 kfree(bus); 4125 } 4126 4127 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1, 4128 const struct kvm_io_range *r2) 4129 { 4130 gpa_t addr1 = r1->addr; 4131 gpa_t addr2 = r2->addr; 4132 4133 if (addr1 < addr2) 4134 return -1; 4135 4136 /* If r2->len == 0, match the exact address. If r2->len != 0, 4137 * accept any overlapping write. Any order is acceptable for 4138 * overlapping ranges, because kvm_io_bus_get_first_dev ensures 4139 * we process all of them. 4140 */ 4141 if (r2->len) { 4142 addr1 += r1->len; 4143 addr2 += r2->len; 4144 } 4145 4146 if (addr1 > addr2) 4147 return 1; 4148 4149 return 0; 4150 } 4151 4152 static int kvm_io_bus_sort_cmp(const void *p1, const void *p2) 4153 { 4154 return kvm_io_bus_cmp(p1, p2); 4155 } 4156 4157 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, 4158 gpa_t addr, int len) 4159 { 4160 struct kvm_io_range *range, key; 4161 int off; 4162 4163 key = (struct kvm_io_range) { 4164 .addr = addr, 4165 .len = len, 4166 }; 4167 4168 range = bsearch(&key, bus->range, bus->dev_count, 4169 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp); 4170 if (range == NULL) 4171 return -ENOENT; 4172 4173 off = range - bus->range; 4174 4175 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0) 4176 off--; 4177 4178 return off; 4179 } 4180 4181 static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 4182 struct kvm_io_range *range, const void *val) 4183 { 4184 int idx; 4185 4186 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 4187 if (idx < 0) 4188 return -EOPNOTSUPP; 4189 4190 while (idx < bus->dev_count && 4191 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 4192 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr, 4193 range->len, val)) 4194 return idx; 4195 idx++; 4196 } 4197 4198 return -EOPNOTSUPP; 4199 } 4200 4201 /* kvm_io_bus_write - called under kvm->slots_lock */ 4202 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 4203 int len, const void *val) 4204 { 4205 struct kvm_io_bus *bus; 4206 struct kvm_io_range range; 4207 int r; 4208 4209 range = (struct kvm_io_range) { 4210 .addr = addr, 4211 .len = len, 4212 }; 4213 4214 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 4215 if (!bus) 4216 return -ENOMEM; 4217 r = 
/* kvm_io_bus_write_cookie - called under kvm->slots_lock */
int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
			    gpa_t addr, int len, const void *val, long cookie)
{
	struct kvm_io_bus *bus;
	struct kvm_io_range range;

	range = (struct kvm_io_range) {
		.addr = addr,
		.len = len,
	};

	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
	if (!bus)
		return -ENOMEM;

	/* First try the device referenced by cookie. */
	if ((cookie >= 0) && (cookie < bus->dev_count) &&
	    (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
		if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
					val))
			return cookie;

	/*
	 * cookie contained garbage; fall back to search and return the
	 * correct cookie value.
	 */
	return __kvm_io_bus_write(vcpu, bus, &range, val);
}

static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
			     struct kvm_io_range *range, void *val)
{
	int idx;

	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
	if (idx < 0)
		return -EOPNOTSUPP;

	while (idx < bus->dev_count &&
		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
		if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
				       range->len, val))
			return idx;
		idx++;
	}

	return -EOPNOTSUPP;
}

/* kvm_io_bus_read - called under kvm->slots_lock */
int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
		    int len, void *val)
{
	struct kvm_io_bus *bus;
	struct kvm_io_range range;
	int r;

	range = (struct kvm_io_range) {
		.addr = addr,
		.len = len,
	};

	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
	if (!bus)
		return -ENOMEM;
	r = __kvm_io_bus_read(vcpu, bus, &range, val);
	return r < 0 ? r : 0;
}

/* Caller must hold slots_lock. */
int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
			    int len, struct kvm_io_device *dev)
{
	int i;
	struct kvm_io_bus *new_bus, *bus;
	struct kvm_io_range range;

	bus = kvm_get_bus(kvm, bus_idx);
	if (!bus)
		return -ENOMEM;

	/* exclude ioeventfd which is limited by maximum fd */
	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
		return -ENOSPC;

	new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
			  GFP_KERNEL_ACCOUNT);
	if (!new_bus)
		return -ENOMEM;

	range = (struct kvm_io_range) {
		.addr = addr,
		.len = len,
		.dev = dev,
	};

	for (i = 0; i < bus->dev_count; i++)
		if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
			break;

	memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
	new_bus->dev_count++;
	new_bus->range[i] = range;
	memcpy(new_bus->range + i + 1, bus->range + i,
		(bus->dev_count - i) * sizeof(struct kvm_io_range));
	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
	synchronize_srcu_expedited(&kvm->srcu);
	kfree(bus);

	return 0;
}
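/*
 * Illustrative sketch (not part of the driver): registration above never
 * modifies the published bus in place.  A condensed view of the
 * copy-then-publish sequence that SRCU readers rely on, with hypothetical
 * local names:
 *
 *	old = kvm_get_bus(kvm, bus_idx);	// writers hold slots_lock
 *	new = kmalloc(...);			// copy of old + one extra slot
 *	...insert the new range in sorted order into 'new'...
 *	rcu_assign_pointer(kvm->buses[bus_idx], new);	// publish to readers
 *	synchronize_srcu_expedited(&kvm->srcu);		// wait out readers of 'old'
 *	kfree(old);				// no reader can still see it
 */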
/* Caller must hold slots_lock. */
void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
			       struct kvm_io_device *dev)
{
	int i, j;
	struct kvm_io_bus *new_bus, *bus;

	bus = kvm_get_bus(kvm, bus_idx);
	if (!bus)
		return;

	for (i = 0; i < bus->dev_count; i++)
		if (bus->range[i].dev == dev)
			break;

	if (i == bus->dev_count)
		return;

	new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
			  GFP_KERNEL_ACCOUNT);
	if (new_bus) {
		memcpy(new_bus, bus, struct_size(bus, range, i));
		new_bus->dev_count--;
		memcpy(new_bus->range + i, bus->range + i + 1,
		       flex_array_size(new_bus, range, new_bus->dev_count - i));
	} else {
		pr_err("kvm: failed to shrink bus, removing it completely\n");
		for (j = 0; j < bus->dev_count; j++) {
			if (j == i)
				continue;
			kvm_iodevice_destructor(bus->range[j].dev);
		}
	}

	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
	synchronize_srcu_expedited(&kvm->srcu);
	kfree(bus);
}

struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
					 gpa_t addr)
{
	struct kvm_io_bus *bus;
	int dev_idx, srcu_idx;
	struct kvm_io_device *iodev = NULL;

	srcu_idx = srcu_read_lock(&kvm->srcu);

	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
	if (!bus)
		goto out_unlock;

	dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
	if (dev_idx < 0)
		goto out_unlock;

	iodev = bus->range[dev_idx].dev;

out_unlock:
	srcu_read_unlock(&kvm->srcu, srcu_idx);

	return iodev;
}
EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);

static int kvm_debugfs_open(struct inode *inode, struct file *file,
			    int (*get)(void *, u64 *), int (*set)(void *, u64),
			    const char *fmt)
{
	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
					  inode->i_private;

	/*
	 * The debugfs files hold a reference to the kvm struct, which
	 * is still valid when kvm_destroy_vm is called.  To avoid the
	 * race between open and the removal of the debugfs directory
	 * we test against the users count.
	 */
	if (!refcount_inc_not_zero(&stat_data->kvm->users_count))
		return -ENOENT;

	if (simple_attr_open(inode, file, get,
			     KVM_DBGFS_GET_MODE(stat_data->dbgfs_item) & 0222
			     ? set : NULL,
			     fmt)) {
		kvm_put_kvm(stat_data->kvm);
		return -ENOMEM;
	}

	return 0;
}
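/*
 * Illustrative sketch (not part of the driver): the open/destroy race the
 * refcount check above closes.  The layout is schematic and the column
 * labels are made up; the point is that the opener proceeds only if it can
 * still take a reference, i.e. users_count has not already hit zero on the
 * teardown path:
 *
 *	opener:					destroyer:
 *	  if (refcount_inc_not_zero(&cnt))	  if (refcount_dec_and_test(&cnt))
 *		... use kvm, put later ...		free kvm and its debugfs data
 *	  else
 *		return -ENOENT;	// VM is already going away
 */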
static int kvm_debugfs_release(struct inode *inode, struct file *file)
{
	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
					  inode->i_private;

	simple_attr_release(inode, file);
	kvm_put_kvm(stat_data->kvm);

	return 0;
}

static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
{
	*val = *(ulong *)((void *)kvm + offset);

	return 0;
}

static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
{
	*(ulong *)((void *)kvm + offset) = 0;

	return 0;
}

static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
{
	int i;
	struct kvm_vcpu *vcpu;

	*val = 0;

	kvm_for_each_vcpu(i, vcpu, kvm)
		*val += *(u64 *)((void *)vcpu + offset);

	return 0;
}

static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
{
	int i;
	struct kvm_vcpu *vcpu;

	kvm_for_each_vcpu(i, vcpu, kvm)
		*(u64 *)((void *)vcpu + offset) = 0;

	return 0;
}

static int kvm_stat_data_get(void *data, u64 *val)
{
	int r = -EFAULT;
	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;

	switch (stat_data->dbgfs_item->kind) {
	case KVM_STAT_VM:
		r = kvm_get_stat_per_vm(stat_data->kvm,
					stat_data->dbgfs_item->offset, val);
		break;
	case KVM_STAT_VCPU:
		r = kvm_get_stat_per_vcpu(stat_data->kvm,
					  stat_data->dbgfs_item->offset, val);
		break;
	}

	return r;
}

static int kvm_stat_data_clear(void *data, u64 val)
{
	int r = -EFAULT;
	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;

	if (val)
		return -EINVAL;

	switch (stat_data->dbgfs_item->kind) {
	case KVM_STAT_VM:
		r = kvm_clear_stat_per_vm(stat_data->kvm,
					  stat_data->dbgfs_item->offset);
		break;
	case KVM_STAT_VCPU:
		r = kvm_clear_stat_per_vcpu(stat_data->kvm,
					    stat_data->dbgfs_item->offset);
		break;
	}

	return r;
}

static int kvm_stat_data_open(struct inode *inode, struct file *file)
{
	__simple_attr_check_format("%llu\n", 0ull);
	return kvm_debugfs_open(inode, file, kvm_stat_data_get,
				kvm_stat_data_clear, "%llu\n");
}

static const struct file_operations stat_fops_per_vm = {
	.owner = THIS_MODULE,
	.open = kvm_stat_data_open,
	.release = kvm_debugfs_release,
	.read = simple_attr_read,
	.write = simple_attr_write,
	.llseek = no_llseek,
};

static int vm_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;
	u64 tmp_val;

	*val = 0;
	mutex_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list) {
		kvm_get_stat_per_vm(kvm, offset, &tmp_val);
		*val += tmp_val;
	}
	mutex_unlock(&kvm_lock);
	return 0;
}

static int vm_stat_clear(void *_offset, u64 val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;

	if (val)
		return -EINVAL;

	mutex_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list) {
		kvm_clear_stat_per_vm(kvm, offset);
	}
	mutex_unlock(&kvm_lock);

	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
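/*
 * Illustrative sketch (not part of the driver): the per-VM/per-vCPU getters
 * above operate purely on a byte offset into the containing structure, so a
 * debugfs entry only needs to record that offset.  The field path below is
 * shown for illustration and assumes the stat layout of this kernel
 * generation:
 *
 *	size_t off = offsetof(struct kvm, stat.remote_tlb_flush);
 *	u64 val;
 *
 *	kvm_get_stat_per_vm(kvm, off, &val);
 *	// equivalent to: val = *(ulong *)((void *)kvm + off);
 */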
static int vcpu_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;
	u64 tmp_val;

	*val = 0;
	mutex_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list) {
		kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
		*val += tmp_val;
	}
	mutex_unlock(&kvm_lock);
	return 0;
}

static int vcpu_stat_clear(void *_offset, u64 val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;

	if (val)
		return -EINVAL;

	mutex_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list) {
		kvm_clear_stat_per_vcpu(kvm, offset);
	}
	mutex_unlock(&kvm_lock);

	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
			"%llu\n");

static const struct file_operations *stat_fops[] = {
	[KVM_STAT_VCPU] = &vcpu_stat_fops,
	[KVM_STAT_VM]   = &vm_stat_fops,
};

static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
{
	struct kobj_uevent_env *env;
	unsigned long long created, active;

	if (!kvm_dev.this_device || !kvm)
		return;

	mutex_lock(&kvm_lock);
	if (type == KVM_EVENT_CREATE_VM) {
		kvm_createvm_count++;
		kvm_active_vms++;
	} else if (type == KVM_EVENT_DESTROY_VM) {
		kvm_active_vms--;
	}
	created = kvm_createvm_count;
	active = kvm_active_vms;
	mutex_unlock(&kvm_lock);

	env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
	if (!env)
		return;

	add_uevent_var(env, "CREATED=%llu", created);
	add_uevent_var(env, "COUNT=%llu", active);

	if (type == KVM_EVENT_CREATE_VM) {
		add_uevent_var(env, "EVENT=create");
		kvm->userspace_pid = task_pid_nr(current);
	} else if (type == KVM_EVENT_DESTROY_VM) {
		add_uevent_var(env, "EVENT=destroy");
	}
	add_uevent_var(env, "PID=%d", kvm->userspace_pid);

	if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) {
		char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);

		if (p) {
			tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
			if (!IS_ERR(tmp))
				add_uevent_var(env, "STATS_PATH=%s", tmp);
			kfree(p);
		}
	}
	/* no need for checks, since we are adding at most 5 keys */
	env->envp[env->envp_idx++] = NULL;
	kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
	kfree(env);
}

static void kvm_init_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);

	kvm_debugfs_num_entries = 0;
	for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
		debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
				    kvm_debugfs_dir, (void *)(long)p->offset,
				    stat_fops[p->kind]);
	}
}

static int kvm_suspend(void)
{
	if (kvm_usage_count)
		hardware_disable_nolock(NULL);
	return 0;
}

static void kvm_resume(void)
{
	if (kvm_usage_count) {
#ifdef CONFIG_LOCKDEP
		WARN_ON(lockdep_is_held(&kvm_count_lock));
#endif
		hardware_enable_nolock(NULL);
	}
}

static struct syscore_ops kvm_syscore_ops = {
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
	return container_of(pn, struct kvm_vcpu, preempt_notifier);
}
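/*
 * Illustrative sketch (not part of the driver): container_of() above simply
 * undoes the embedding of preempt_notifier inside struct kvm_vcpu, so the
 * scheduler hand-off callbacks below can recover the vcpu from the notifier
 * pointer they are handed.  With hypothetical values:
 *
 *	struct kvm_vcpu *vcpu = ...;
 *	struct preempt_notifier *pn = &vcpu->preempt_notifier;
 *
 *	preempt_notifier_to_vcpu(pn) == vcpu;	// pointer arithmetic only
 */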
static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	WRITE_ONCE(vcpu->preempted, false);
	WRITE_ONCE(vcpu->ready, false);

	__this_cpu_write(kvm_running_vcpu, vcpu);
	kvm_arch_sched_in(vcpu, cpu);
	kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
			  struct task_struct *next)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	if (current->state == TASK_RUNNING) {
		WRITE_ONCE(vcpu->preempted, true);
		WRITE_ONCE(vcpu->ready, true);
	}
	kvm_arch_vcpu_put(vcpu);
	__this_cpu_write(kvm_running_vcpu, NULL);
}

/**
 * kvm_get_running_vcpu - get the vcpu running on the current CPU.
 *
 * We can disable preemption locally around accessing the per-CPU variable,
 * and use the resolved vcpu pointer after enabling preemption again,
 * because even if the current thread is migrated to another CPU, reading
 * the per-CPU value later will give us the same value as we update the
 * per-CPU variable in the preempt notifier handlers.
 */
struct kvm_vcpu *kvm_get_running_vcpu(void)
{
	struct kvm_vcpu *vcpu;

	preempt_disable();
	vcpu = __this_cpu_read(kvm_running_vcpu);
	preempt_enable();

	return vcpu;
}
EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);

/**
 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
 */
struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
{
	return &kvm_running_vcpu;
}

struct kvm_cpu_compat_check {
	void *opaque;
	int *ret;
};

static void check_processor_compat(void *data)
{
	struct kvm_cpu_compat_check *c = data;

	*c->ret = kvm_arch_check_processor_compat(c->opaque);
}

int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
	     struct module *module)
{
	struct kvm_cpu_compat_check c;
	int r;
	int cpu;

	r = kvm_arch_init(opaque);
	if (r)
		goto out_fail;

	/*
	 * kvm_arch_init makes sure there's at most one caller
	 * for architectures that support multiple implementations,
	 * like Intel and AMD on x86.
	 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
	 * conflicts in case kvm is already setup for another implementation.
	 */
	r = kvm_irqfd_init();
	if (r)
		goto out_irqfd;

	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
		r = -ENOMEM;
		goto out_free_0;
	}

	r = kvm_arch_hardware_setup(opaque);
	if (r < 0)
		goto out_free_1;

	c.ret = &r;
	c.opaque = opaque;
	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu, check_processor_compat, &c, 1);
		if (r < 0)
			goto out_free_2;
	}

	r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
				      kvm_starting_cpu, kvm_dying_cpu);
	if (r)
		goto out_free_2;
	register_reboot_notifier(&kvm_reboot_notifier);

	/* A kmem cache lets us meet the alignment requirements of fx_save. */
	if (!vcpu_align)
		vcpu_align = __alignof__(struct kvm_vcpu);
	kvm_vcpu_cache =
		kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
					   SLAB_ACCOUNT,
					   offsetof(struct kvm_vcpu, arch),
					   sizeof_field(struct kvm_vcpu, arch),
					   NULL);
	if (!kvm_vcpu_cache) {
		r = -ENOMEM;
		goto out_free_3;
	}

	r = kvm_async_pf_init();
	if (r)
		goto out_free;

	kvm_chardev_ops.owner = module;
	kvm_vm_fops.owner = module;
	kvm_vcpu_fops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
		pr_err("kvm: misc device register failed\n");
		goto out_unreg;
	}

	register_syscore_ops(&kvm_syscore_ops);

	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

	kvm_init_debug();

	r = kvm_vfio_ops_init();
	WARN_ON(r);

	return 0;

out_unreg:
	kvm_async_pf_deinit();
out_free:
	kmem_cache_destroy(kvm_vcpu_cache);
out_free_3:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
out_free_2:
	kvm_arch_hardware_unsetup();
out_free_1:
	free_cpumask_var(cpus_hardware_enabled);
out_free_0:
	kvm_irqfd_exit();
out_irqfd:
	kvm_arch_exit();
out_fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_init);
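/*
 * Illustrative sketch (not part of the driver): an architecture module is
 * expected to hand kvm_init() its opaque init data plus the size/alignment
 * of its vcpu container, and to tear everything down with kvm_exit() on
 * module unload.  The names below (my_arch_init_ops, struct my_arch_vcpu,
 * the module init/exit functions) are hypothetical:
 *
 *	static int __init my_arch_module_init(void)
 *	{
 *		return kvm_init(&my_arch_init_ops, sizeof(struct my_arch_vcpu),
 *				__alignof__(struct my_arch_vcpu), THIS_MODULE);
 *	}
 *
 *	static void __exit my_arch_module_exit(void)
 *	{
 *		kvm_exit();
 *	}
 */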
void kvm_exit(void)
{
	debugfs_remove_recursive(kvm_debugfs_dir);
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	kvm_async_pf_deinit();
	unregister_syscore_ops(&kvm_syscore_ops);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
	on_each_cpu(hardware_disable_nolock, NULL, 1);
	kvm_arch_hardware_unsetup();
	kvm_arch_exit();
	kvm_irqfd_exit();
	free_cpumask_var(cpus_hardware_enabled);
	kvm_vfio_ops_exit();
}
EXPORT_SYMBOL_GPL(kvm_exit);

struct kvm_vm_worker_thread_context {
	struct kvm *kvm;
	struct task_struct *parent;
	struct completion init_done;
	kvm_vm_thread_fn_t thread_fn;
	uintptr_t data;
	int err;
};

static int kvm_vm_worker_thread(void *context)
{
	/*
	 * The init_context is allocated on the stack of the parent thread, so
	 * we have to locally copy anything that is needed beyond
	 * initialization.
	 */
	struct kvm_vm_worker_thread_context *init_context = context;
	struct kvm *kvm = init_context->kvm;
	kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
	uintptr_t data = init_context->data;
	int err;

	err = kthread_park(current);
	/* kthread_park(current) is never supposed to return an error */
	WARN_ON(err != 0);
	if (err)
		goto init_complete;

	err = cgroup_attach_task_all(init_context->parent, current);
	if (err) {
		kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
			__func__, err);
		goto init_complete;
	}

	set_user_nice(current, task_nice(init_context->parent));

init_complete:
	init_context->err = err;
	complete(&init_context->init_done);
	init_context = NULL;

	if (err)
		return err;

	/* Wait to be woken up by the spawner before proceeding. */
	kthread_parkme();

	if (!kthread_should_stop())
		err = thread_fn(kvm, data);

	return err;
}
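/*
 * Illustrative sketch (not part of the driver): why the locals above are
 * copied out of init_context.  The context lives on the spawner's stack and
 * is only guaranteed to stay valid until complete() is called, so anything
 * the worker needs afterwards must already sit in its own variables.  The
 * two-column layout below is schematic:
 *
 *	spawner:				worker:
 *	  struct ..._context ctx = { ... };	  kvm = init_context->kvm;  // copy
 *	  kthread_run(worker, &ctx, ...);	  ...
 *	  wait_for_completion(&ctx.init_done);	  complete(&init_context->init_done);
 *	  // ctx may now go out of scope	  // must not touch init_context again
 */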
int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
				uintptr_t data, const char *name,
				struct task_struct **thread_ptr)
{
	struct kvm_vm_worker_thread_context init_context = {};
	struct task_struct *thread;

	*thread_ptr = NULL;
	init_context.kvm = kvm;
	init_context.parent = current;
	init_context.thread_fn = thread_fn;
	init_context.data = data;
	init_completion(&init_context.init_done);

	thread = kthread_run(kvm_vm_worker_thread, &init_context,
			     "%s-%d", name, task_pid_nr(current));
	if (IS_ERR(thread))
		return PTR_ERR(thread);

	/* kthread_run is never supposed to return NULL */
	WARN_ON(thread == NULL);

	wait_for_completion(&init_context.init_done);

	if (!init_context.err)
		*thread_ptr = thread;

	return init_context.err;
}
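/*
 * Illustrative sketch (not part of the driver): a hypothetical caller that
 * spawns a per-VM worker with the helper above and later stops it.  The
 * names my_vm_worker_fn and "my-kvm-worker" are made up; the worker is
 * created parked, so the caller unparks it to let thread_fn run:
 *
 *	static int my_vm_worker_fn(struct kvm *kvm, uintptr_t data)
 *	{
 *		while (!kthread_should_stop())
 *			schedule_timeout_interruptible(HZ);
 *		return 0;
 *	}
 *
 *	struct task_struct *worker;
 *	int r = kvm_vm_create_worker_thread(kvm, my_vm_worker_fn, 0,
 *					    "my-kvm-worker", &worker);
 *	if (!r) {
 *		kthread_unpark(worker);	// let the parked thread enter thread_fn
 *		...
 *		kthread_stop(worker);	// ask it to stop and wait for it
 *	}
 */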