1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Kernel-based Virtual Machine driver for Linux 4 * 5 * This module enables machines with Intel VT-x extensions to run virtual 6 * machines without emulation or binary translation. 7 * 8 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 10 * 11 * Authors: 12 * Avi Kivity <avi@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com> 14 */ 15 16 #include <kvm/iodev.h> 17 18 #include <linux/kvm_host.h> 19 #include <linux/kvm.h> 20 #include <linux/module.h> 21 #include <linux/errno.h> 22 #include <linux/percpu.h> 23 #include <linux/mm.h> 24 #include <linux/miscdevice.h> 25 #include <linux/vmalloc.h> 26 #include <linux/reboot.h> 27 #include <linux/debugfs.h> 28 #include <linux/highmem.h> 29 #include <linux/file.h> 30 #include <linux/syscore_ops.h> 31 #include <linux/cpu.h> 32 #include <linux/sched/signal.h> 33 #include <linux/sched/mm.h> 34 #include <linux/sched/stat.h> 35 #include <linux/cpumask.h> 36 #include <linux/smp.h> 37 #include <linux/anon_inodes.h> 38 #include <linux/profile.h> 39 #include <linux/kvm_para.h> 40 #include <linux/pagemap.h> 41 #include <linux/mman.h> 42 #include <linux/swap.h> 43 #include <linux/bitops.h> 44 #include <linux/spinlock.h> 45 #include <linux/compat.h> 46 #include <linux/srcu.h> 47 #include <linux/hugetlb.h> 48 #include <linux/slab.h> 49 #include <linux/sort.h> 50 #include <linux/bsearch.h> 51 #include <linux/io.h> 52 #include <linux/lockdep.h> 53 #include <linux/kthread.h> 54 55 #include <asm/processor.h> 56 #include <asm/ioctl.h> 57 #include <linux/uaccess.h> 58 59 #include "coalesced_mmio.h" 60 #include "async_pf.h" 61 #include "vfio.h" 62 63 #define CREATE_TRACE_POINTS 64 #include <trace/events/kvm.h> 65 66 #include <linux/kvm_dirty_ring.h> 67 68 /* Worst case buffer size needed for holding an integer. 
*/
#define ITOA_MAX_LEN 12

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/*
 * Halt-polling tunables.  Each one is a module parameter with mode 0644
 * and is exported with EXPORT_SYMBOL_GPL.
 */

/* Architectures should define their poll value according to the halt latency */
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns);

/* Default doubles per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);

/* The start value to grow halt_poll_ns from */
unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);

/* Default resets per-vcpu halt_poll_ns (no initializer, so it starts at 0). */
unsigned int halt_poll_ns_shrink;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);

/*
 * Ordering of locks:
 *
 *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

/*
 * vm_list links every VM; kvm_create_vm() and kvm_destroy_vm() add and
 * remove entries with kvm_lock held.
 */
DEFINE_MUTEX(kvm_lock);
static DEFINE_RAW_SPINLOCK(kvm_count_lock);
LIST_HEAD(vm_list);

static cpumask_var_t cpus_hardware_enabled;
static int kvm_usage_count;
static atomic_t hardware_enable_failed;

/* Slab cache that kvm_vcpu_destroy() returns vCPU objects to. */
static struct kmem_cache *kvm_vcpu_cache;

static __read_mostly struct preempt_ops kvm_preempt_ops;
/* Per-CPU pointer written by vcpu_load() and reset to NULL by vcpu_put(). */
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);

struct dentry *kvm_debugfs_dir;
EXPORT_SYMBOL_GPL(kvm_debugfs_dir);

/* Element count of each VM's debugfs_stat_data array. */
static int kvm_debugfs_num_entries;
static const struct file_operations stat_fops_per_vm;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
#ifdef CONFIG_KVM_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				  unsigned long arg);
/* Expands to the .compat_ioctl designated initializer of a file_operations. */
#define KVM_COMPAT(c)	.compat_ioctl	= (c)
#else
/*
 * For architectures that don't implement a compat infrastructure,
 * adopt a double line of defense:
 * - Prevent a compat
task from opening /dev/kvm 130 * - If the open has been done by a 64bit task, and the KVM fd 131 * passed to a compat task, let the ioctls fail. 132 */ 133 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl, 134 unsigned long arg) { return -EINVAL; } 135 136 static int kvm_no_compat_open(struct inode *inode, struct file *file) 137 { 138 return is_compat_task() ? -ENODEV : 0; 139 } 140 #define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \ 141 .open = kvm_no_compat_open 142 #endif 143 static int hardware_enable_all(void); 144 static void hardware_disable_all(void); 145 146 static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 147 148 __visible bool kvm_rebooting; 149 EXPORT_SYMBOL_GPL(kvm_rebooting); 150 151 #define KVM_EVENT_CREATE_VM 0 152 #define KVM_EVENT_DESTROY_VM 1 153 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); 154 static unsigned long long kvm_createvm_count; 155 static unsigned long long kvm_active_vms; 156 157 __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, 158 unsigned long start, unsigned long end) 159 { 160 } 161 162 bool kvm_is_zone_device_pfn(kvm_pfn_t pfn) 163 { 164 /* 165 * The metadata used by is_zone_device_page() to determine whether or 166 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if 167 * the device has been pinned, e.g. by get_user_pages(). WARN if the 168 * page_count() is zero to help detect bad usage of this helper. 169 */ 170 if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn)))) 171 return false; 172 173 return is_zone_device_page(pfn_to_page(pfn)); 174 } 175 176 bool kvm_is_reserved_pfn(kvm_pfn_t pfn) 177 { 178 /* 179 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting 180 * perspective they are "normal" pages, albeit with slightly different 181 * usage rules. 
182 */ 183 if (pfn_valid(pfn)) 184 return PageReserved(pfn_to_page(pfn)) && 185 !is_zero_pfn(pfn) && 186 !kvm_is_zone_device_pfn(pfn); 187 188 return true; 189 } 190 191 bool kvm_is_transparent_hugepage(kvm_pfn_t pfn) 192 { 193 struct page *page = pfn_to_page(pfn); 194 195 if (!PageTransCompoundMap(page)) 196 return false; 197 198 return is_transparent_hugepage(compound_head(page)); 199 } 200 201 /* 202 * Switches to specified vcpu, until a matching vcpu_put() 203 */ 204 void vcpu_load(struct kvm_vcpu *vcpu) 205 { 206 int cpu = get_cpu(); 207 208 __this_cpu_write(kvm_running_vcpu, vcpu); 209 preempt_notifier_register(&vcpu->preempt_notifier); 210 kvm_arch_vcpu_load(vcpu, cpu); 211 put_cpu(); 212 } 213 EXPORT_SYMBOL_GPL(vcpu_load); 214 215 void vcpu_put(struct kvm_vcpu *vcpu) 216 { 217 preempt_disable(); 218 kvm_arch_vcpu_put(vcpu); 219 preempt_notifier_unregister(&vcpu->preempt_notifier); 220 __this_cpu_write(kvm_running_vcpu, NULL); 221 preempt_enable(); 222 } 223 EXPORT_SYMBOL_GPL(vcpu_put); 224 225 /* TODO: merge with kvm_arch_vcpu_should_kick */ 226 static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req) 227 { 228 int mode = kvm_vcpu_exiting_guest_mode(vcpu); 229 230 /* 231 * We need to wait for the VCPU to reenable interrupts and get out of 232 * READING_SHADOW_PAGE_TABLES mode. 233 */ 234 if (req & KVM_REQUEST_WAIT) 235 return mode != OUTSIDE_GUEST_MODE; 236 237 /* 238 * Need to kick a running VCPU, but otherwise there is nothing to do. 
239 */ 240 return mode == IN_GUEST_MODE; 241 } 242 243 static void ack_flush(void *_completed) 244 { 245 } 246 247 static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait) 248 { 249 if (unlikely(!cpus)) 250 cpus = cpu_online_mask; 251 252 if (cpumask_empty(cpus)) 253 return false; 254 255 smp_call_function_many(cpus, ack_flush, NULL, wait); 256 return true; 257 } 258 259 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, 260 struct kvm_vcpu *except, 261 unsigned long *vcpu_bitmap, cpumask_var_t tmp) 262 { 263 int i, cpu, me; 264 struct kvm_vcpu *vcpu; 265 bool called; 266 267 me = get_cpu(); 268 269 kvm_for_each_vcpu(i, vcpu, kvm) { 270 if ((vcpu_bitmap && !test_bit(i, vcpu_bitmap)) || 271 vcpu == except) 272 continue; 273 274 kvm_make_request(req, vcpu); 275 cpu = vcpu->cpu; 276 277 if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu)) 278 continue; 279 280 if (tmp != NULL && cpu != -1 && cpu != me && 281 kvm_request_needs_ipi(vcpu, req)) 282 __cpumask_set_cpu(cpu, tmp); 283 } 284 285 called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT)); 286 put_cpu(); 287 288 return called; 289 } 290 291 bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, 292 struct kvm_vcpu *except) 293 { 294 cpumask_var_t cpus; 295 bool called; 296 297 zalloc_cpumask_var(&cpus, GFP_ATOMIC); 298 299 called = kvm_make_vcpus_request_mask(kvm, req, except, NULL, cpus); 300 301 free_cpumask_var(cpus); 302 return called; 303 } 304 305 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) 306 { 307 return kvm_make_all_cpus_request_except(kvm, req, NULL); 308 } 309 310 #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL 311 void kvm_flush_remote_tlbs(struct kvm *kvm) 312 { 313 /* 314 * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in 315 * kvm_make_all_cpus_request. 
316 */ 317 long dirty_count = smp_load_acquire(&kvm->tlbs_dirty); 318 319 /* 320 * We want to publish modifications to the page tables before reading 321 * mode. Pairs with a memory barrier in arch-specific code. 322 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest 323 * and smp_mb in walk_shadow_page_lockless_begin/end. 324 * - powerpc: smp_mb in kvmppc_prepare_to_enter. 325 * 326 * There is already an smp_mb__after_atomic() before 327 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that 328 * barrier here. 329 */ 330 if (!kvm_arch_flush_remote_tlb(kvm) 331 || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 332 ++kvm->stat.remote_tlb_flush; 333 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); 334 } 335 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); 336 #endif 337 338 void kvm_reload_remote_mmus(struct kvm *kvm) 339 { 340 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 341 } 342 343 #ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 344 static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc, 345 gfp_t gfp_flags) 346 { 347 gfp_flags |= mc->gfp_zero; 348 349 if (mc->kmem_cache) 350 return kmem_cache_alloc(mc->kmem_cache, gfp_flags); 351 else 352 return (void *)__get_free_page(gfp_flags); 353 } 354 355 int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min) 356 { 357 void *obj; 358 359 if (mc->nobjs >= min) 360 return 0; 361 while (mc->nobjs < ARRAY_SIZE(mc->objects)) { 362 obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT); 363 if (!obj) 364 return mc->nobjs >= min ? 
0 : -ENOMEM; 365 mc->objects[mc->nobjs++] = obj; 366 } 367 return 0; 368 } 369 370 int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc) 371 { 372 return mc->nobjs; 373 } 374 375 void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) 376 { 377 while (mc->nobjs) { 378 if (mc->kmem_cache) 379 kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]); 380 else 381 free_page((unsigned long)mc->objects[--mc->nobjs]); 382 } 383 } 384 385 void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) 386 { 387 void *p; 388 389 if (WARN_ON(!mc->nobjs)) 390 p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT); 391 else 392 p = mc->objects[--mc->nobjs]; 393 BUG_ON(!p); 394 return p; 395 } 396 #endif 397 398 static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 399 { 400 mutex_init(&vcpu->mutex); 401 vcpu->cpu = -1; 402 vcpu->kvm = kvm; 403 vcpu->vcpu_id = id; 404 vcpu->pid = NULL; 405 rcuwait_init(&vcpu->wait); 406 kvm_async_pf_vcpu_init(vcpu); 407 408 vcpu->pre_pcpu = -1; 409 INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); 410 411 kvm_vcpu_set_in_spin_loop(vcpu, false); 412 kvm_vcpu_set_dy_eligible(vcpu, false); 413 vcpu->preempted = false; 414 vcpu->ready = false; 415 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 416 } 417 418 void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) 419 { 420 kvm_dirty_ring_free(&vcpu->dirty_ring); 421 kvm_arch_vcpu_destroy(vcpu); 422 423 /* 424 * No need for rcu_read_lock as VCPU_RUN is the only place that changes 425 * the vcpu->pid pointer, and at destruction time all file descriptors 426 * are already gone. 
*/
	put_pid(rcu_dereference_protected(vcpu->pid, 1));

	/* Free the run page first, then return the vCPU to its slab cache. */
	free_page((unsigned long)vcpu->run);
	kmem_cache_free(kvm_vcpu_cache, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_destroy);

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
/* Map an embedded mmu_notifier back to the struct kvm that contains it. */
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
	return container_of(mn, struct kvm, mmu_notifier);
}

/*
 * .invalidate_range callback: forwards [start, end) to the arch hook inside
 * an SRCU read-side section on kvm->srcu.  mmu_lock is not taken here.
 */
static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long start, unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
	srcu_read_unlock(&kvm->srcu, idx);
}

/*
 * .change_pte callback: bumps mmu_notifier_seq and hands the new PTE to
 * kvm_set_spte_hva() under mmu_lock, flushing remote TLBs if it returns
 * non-zero.
 */
static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long address,
					pte_t pte)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	/* Lock order: SRCU read lock outside, mmu_lock inside. */
	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	kvm->mmu_notifier_seq++;

	if (kvm_set_spte_hva(kvm, address, pte))
		kvm_flush_remote_tlbs(kvm);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

/*
 * .invalidate_range_start callback: raises mmu_notifier_count and unmaps the
 * range under mmu_lock.  Always returns 0.
 */
static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush = 0, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
484 */ 485 kvm->mmu_notifier_count++; 486 need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end, 487 range->flags); 488 need_tlb_flush |= kvm->tlbs_dirty; 489 /* we've to flush the tlb before the pages can be freed */ 490 if (need_tlb_flush) 491 kvm_flush_remote_tlbs(kvm); 492 493 spin_unlock(&kvm->mmu_lock); 494 srcu_read_unlock(&kvm->srcu, idx); 495 496 return 0; 497 } 498 499 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 500 const struct mmu_notifier_range *range) 501 { 502 struct kvm *kvm = mmu_notifier_to_kvm(mn); 503 504 spin_lock(&kvm->mmu_lock); 505 /* 506 * This sequence increase will notify the kvm page fault that 507 * the page that is going to be mapped in the spte could have 508 * been freed. 509 */ 510 kvm->mmu_notifier_seq++; 511 smp_wmb(); 512 /* 513 * The above sequence increase must be visible before the 514 * below count decrease, which is ensured by the smp_wmb above 515 * in conjunction with the smp_rmb in mmu_notifier_retry(). 
*/
	kvm->mmu_notifier_count--;
	spin_unlock(&kvm->mmu_lock);

	/* An unbalanced end (count below zero) is treated as fatal. */
	BUG_ON(kvm->mmu_notifier_count < 0);
}

/*
 * .clear_flush_young callback: ages [start, end) via kvm_age_hva() under
 * mmu_lock and flushes remote TLBs when it reports a non-zero result.
 *
 * Returns the value produced by kvm_age_hva().
 */
static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long start,
					      unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	/* Lock order: SRCU read lock outside, mmu_lock inside. */
	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);

	young = kvm_age_hva(kvm, start, end);
	if (young)
		kvm_flush_remote_tlbs(kvm);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return young;
}

/*
 * .clear_young callback: same aging as clear_flush_young, but without the
 * remote TLB flush.
 */
static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long start,
					unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	/*
	 * Even though we do not flush TLB, this will still adversely
	 * affect performance on pre-Haswell Intel EPT, where there is
	 * no EPT Access Bit to clear so that we have to tear down EPT
	 * tables instead. If we find this unacceptable, we can always
	 * add a parameter to kvm_age_hva so that it effectively doesn't
	 * do anything on clear_young.
	 *
	 * Also note that currently we never issue secondary TLB flushes
	 * from clear_young, leaving this job up to the regular system
	 * cadence. If we find this inaccurate, we might come up with a
	 * more sophisticated heuristic later.
566 */ 567 young = kvm_age_hva(kvm, start, end); 568 spin_unlock(&kvm->mmu_lock); 569 srcu_read_unlock(&kvm->srcu, idx); 570 571 return young; 572 } 573 574 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, 575 struct mm_struct *mm, 576 unsigned long address) 577 { 578 struct kvm *kvm = mmu_notifier_to_kvm(mn); 579 int young, idx; 580 581 idx = srcu_read_lock(&kvm->srcu); 582 spin_lock(&kvm->mmu_lock); 583 young = kvm_test_age_hva(kvm, address); 584 spin_unlock(&kvm->mmu_lock); 585 srcu_read_unlock(&kvm->srcu, idx); 586 587 return young; 588 } 589 590 static void kvm_mmu_notifier_release(struct mmu_notifier *mn, 591 struct mm_struct *mm) 592 { 593 struct kvm *kvm = mmu_notifier_to_kvm(mn); 594 int idx; 595 596 idx = srcu_read_lock(&kvm->srcu); 597 kvm_arch_flush_shadow_all(kvm); 598 srcu_read_unlock(&kvm->srcu, idx); 599 } 600 601 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { 602 .invalidate_range = kvm_mmu_notifier_invalidate_range, 603 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 604 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 605 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 606 .clear_young = kvm_mmu_notifier_clear_young, 607 .test_young = kvm_mmu_notifier_test_young, 608 .change_pte = kvm_mmu_notifier_change_pte, 609 .release = kvm_mmu_notifier_release, 610 }; 611 612 static int kvm_init_mmu_notifier(struct kvm *kvm) 613 { 614 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; 615 return mmu_notifier_register(&kvm->mmu_notifier, current->mm); 616 } 617 618 #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ 619 620 static int kvm_init_mmu_notifier(struct kvm *kvm) 621 { 622 return 0; 623 } 624 625 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ 626 627 static struct kvm_memslots *kvm_alloc_memslots(void) 628 { 629 int i; 630 struct kvm_memslots *slots; 631 632 slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT); 633 if (!slots) 634 return 
NULL; 635 636 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) 637 slots->id_to_index[i] = -1; 638 639 return slots; 640 } 641 642 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) 643 { 644 if (!memslot->dirty_bitmap) 645 return; 646 647 kvfree(memslot->dirty_bitmap); 648 memslot->dirty_bitmap = NULL; 649 } 650 651 static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) 652 { 653 kvm_destroy_dirty_bitmap(slot); 654 655 kvm_arch_free_memslot(kvm, slot); 656 657 slot->flags = 0; 658 slot->npages = 0; 659 } 660 661 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots) 662 { 663 struct kvm_memory_slot *memslot; 664 665 if (!slots) 666 return; 667 668 kvm_for_each_memslot(memslot, slots) 669 kvm_free_memslot(kvm, memslot); 670 671 kvfree(slots); 672 } 673 674 static void kvm_destroy_vm_debugfs(struct kvm *kvm) 675 { 676 int i; 677 678 if (!kvm->debugfs_dentry) 679 return; 680 681 debugfs_remove_recursive(kvm->debugfs_dentry); 682 683 if (kvm->debugfs_stat_data) { 684 for (i = 0; i < kvm_debugfs_num_entries; i++) 685 kfree(kvm->debugfs_stat_data[i]); 686 kfree(kvm->debugfs_stat_data); 687 } 688 } 689 690 static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) 691 { 692 char dir_name[ITOA_MAX_LEN * 2]; 693 struct kvm_stat_data *stat_data; 694 struct kvm_stats_debugfs_item *p; 695 696 if (!debugfs_initialized()) 697 return 0; 698 699 snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd); 700 kvm->debugfs_dentry = debugfs_create_dir(dir_name, kvm_debugfs_dir); 701 702 kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries, 703 sizeof(*kvm->debugfs_stat_data), 704 GFP_KERNEL_ACCOUNT); 705 if (!kvm->debugfs_stat_data) 706 return -ENOMEM; 707 708 for (p = debugfs_entries; p->name; p++) { 709 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT); 710 if (!stat_data) 711 return -ENOMEM; 712 713 stat_data->kvm = kvm; 714 stat_data->dbgfs_item = p; 715 kvm->debugfs_stat_data[p - debugfs_entries] = 
stat_data; 716 debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p), 717 kvm->debugfs_dentry, stat_data, 718 &stat_fops_per_vm); 719 } 720 return 0; 721 } 722 723 /* 724 * Called after the VM is otherwise initialized, but just before adding it to 725 * the vm_list. 726 */ 727 int __weak kvm_arch_post_init_vm(struct kvm *kvm) 728 { 729 return 0; 730 } 731 732 /* 733 * Called just after removing the VM from the vm_list, but before doing any 734 * other destruction. 735 */ 736 void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm) 737 { 738 } 739 740 static struct kvm *kvm_create_vm(unsigned long type) 741 { 742 struct kvm *kvm = kvm_arch_alloc_vm(); 743 int r = -ENOMEM; 744 int i; 745 746 if (!kvm) 747 return ERR_PTR(-ENOMEM); 748 749 spin_lock_init(&kvm->mmu_lock); 750 mmgrab(current->mm); 751 kvm->mm = current->mm; 752 kvm_eventfd_init(kvm); 753 mutex_init(&kvm->lock); 754 mutex_init(&kvm->irq_lock); 755 mutex_init(&kvm->slots_lock); 756 INIT_LIST_HEAD(&kvm->devices); 757 758 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); 759 760 if (init_srcu_struct(&kvm->srcu)) 761 goto out_err_no_srcu; 762 if (init_srcu_struct(&kvm->irq_srcu)) 763 goto out_err_no_irq_srcu; 764 765 refcount_set(&kvm->users_count, 1); 766 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 767 struct kvm_memslots *slots = kvm_alloc_memslots(); 768 769 if (!slots) 770 goto out_err_no_arch_destroy_vm; 771 /* Generations must be different for each address space. 
*/
		slots->generation = i;
		rcu_assign_pointer(kvm->memslots[i], slots);
	}

	/* One zeroed I/O bus per bus index. */
	for (i = 0; i < KVM_NR_BUSES; i++) {
		rcu_assign_pointer(kvm->buses[i],
			kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
		if (!kvm->buses[i])
			goto out_err_no_arch_destroy_vm;
	}

	/* Snapshot the module parameter at creation time. */
	kvm->max_halt_poll_ns = halt_poll_ns;

	r = kvm_arch_init_vm(kvm, type);
	if (r)
		goto out_err_no_arch_destroy_vm;

	r = hardware_enable_all();
	if (r)
		goto out_err_no_disable;

#ifdef CONFIG_HAVE_KVM_IRQFD
	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif

	r = kvm_init_mmu_notifier(kvm);
	if (r)
		goto out_err_no_mmu_notifier;

	r = kvm_arch_post_init_vm(kvm);
	if (r)
		goto out_err;

	/* The VM becomes visible on vm_list only once fully set up. */
	mutex_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	mutex_unlock(&kvm_lock);

	preempt_notifier_inc();

	return kvm;

	/*
	 * Error unwinding: each label undoes the step that precedes the
	 * corresponding goto, in reverse order of setup.
	 */
out_err:
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	if (kvm->mmu_notifier.ops)
		mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
#endif
out_err_no_mmu_notifier:
	hardware_disable_all();
out_err_no_disable:
	kvm_arch_destroy_vm(kvm);
out_err_no_arch_destroy_vm:
	/* Only the initial reference may exist here; warn if it is not the last. */
	WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
	for (i = 0; i < KVM_NR_BUSES; i++)
		kfree(kvm_get_bus(kvm, i));
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
	cleanup_srcu_struct(&kvm->irq_srcu);
out_err_no_irq_srcu:
	cleanup_srcu_struct(&kvm->srcu);
out_err_no_srcu:
	kvm_arch_free_vm(kvm);
	/* Pairs with the mmgrab() at the top of the function. */
	mmdrop(current->mm);
	return ERR_PTR(r);
}

/* Unlink every device on kvm->devices and call its ->destroy() handler. */
static void kvm_destroy_devices(struct kvm *kvm)
{
	struct kvm_device *dev, *tmp;

	/*
	 * We do not need to take the kvm->lock here, because nobody else
	 * has a reference to the struct kvm at this point and therefore
	 * cannot access the devices list anyhow.
*/
	/* The _safe iterator is needed because entries are unlinked while walking. */
	list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
		list_del(&dev->vm_node);
		dev->ops->destroy(dev);
	}
}

/*
 * Tear down @kvm completely.  Reached from kvm_put_kvm() when the last
 * reference is dropped.
 */
static void kvm_destroy_vm(struct kvm *kvm)
{
	int i;
	/* Saved up front: @kvm is freed before the final mmdrop(). */
	struct mm_struct *mm = kvm->mm;

	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
	kvm_destroy_vm_debugfs(kvm);
	kvm_arch_sync_events(kvm);
	mutex_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	mutex_unlock(&kvm_lock);
	kvm_arch_pre_destroy_vm(kvm);

	kvm_free_irq_routing(kvm);
	for (i = 0; i < KVM_NR_BUSES; i++) {
		struct kvm_io_bus *bus = kvm_get_bus(kvm, i);

		if (bus)
			kvm_io_bus_destroy(bus);
		kvm->buses[i] = NULL;
	}
	kvm_coalesced_mmio_free(kvm);
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
#else
	kvm_arch_flush_shadow_all(kvm);
#endif
	kvm_arch_destroy_vm(kvm);
	kvm_destroy_devices(kvm);
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
	cleanup_srcu_struct(&kvm->irq_srcu);
	cleanup_srcu_struct(&kvm->srcu);
	kvm_arch_free_vm(kvm);
	/* These three undo the corresponding steps of kvm_create_vm(). */
	preempt_notifier_dec();
	hardware_disable_all();
	mmdrop(mm);
}

/* Take an additional reference on @kvm. */
void kvm_get_kvm(struct kvm *kvm)
{
	refcount_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);

/* Drop a reference on @kvm; the VM is destroyed when the count reaches zero. */
void kvm_put_kvm(struct kvm *kvm)
{
	if (refcount_dec_and_test(&kvm->users_count))
		kvm_destroy_vm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);

/*
 * Used to put a reference that was taken on behalf of an object associated
 * with a user-visible file descriptor, e.g. a vcpu or device, if installation
 * of the new file descriptor fails and the reference cannot be transferred to
 * its final owner.  In such cases, the caller is still actively using @kvm and
 * will fail miserably if the refcount unexpectedly hits zero.
910 */ 911 void kvm_put_kvm_no_destroy(struct kvm *kvm) 912 { 913 WARN_ON(refcount_dec_and_test(&kvm->users_count)); 914 } 915 EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy); 916 917 static int kvm_vm_release(struct inode *inode, struct file *filp) 918 { 919 struct kvm *kvm = filp->private_data; 920 921 kvm_irqfd_release(kvm); 922 923 kvm_put_kvm(kvm); 924 return 0; 925 } 926 927 /* 928 * Allocation size is twice as large as the actual dirty bitmap size. 929 * See kvm_vm_ioctl_get_dirty_log() why this is needed. 930 */ 931 static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot) 932 { 933 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); 934 935 memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT); 936 if (!memslot->dirty_bitmap) 937 return -ENOMEM; 938 939 return 0; 940 } 941 942 /* 943 * Delete a memslot by decrementing the number of used slots and shifting all 944 * other entries in the array forward one spot. 945 */ 946 static inline void kvm_memslot_delete(struct kvm_memslots *slots, 947 struct kvm_memory_slot *memslot) 948 { 949 struct kvm_memory_slot *mslots = slots->memslots; 950 int i; 951 952 if (WARN_ON(slots->id_to_index[memslot->id] == -1)) 953 return; 954 955 slots->used_slots--; 956 957 if (atomic_read(&slots->lru_slot) >= slots->used_slots) 958 atomic_set(&slots->lru_slot, 0); 959 960 for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) { 961 mslots[i] = mslots[i + 1]; 962 slots->id_to_index[mslots[i].id] = i; 963 } 964 mslots[i] = *memslot; 965 slots->id_to_index[memslot->id] = -1; 966 } 967 968 /* 969 * "Insert" a new memslot by incrementing the number of used slots. Returns 970 * the new slot's initial index into the memslots array. 971 */ 972 static inline int kvm_memslot_insert_back(struct kvm_memslots *slots) 973 { 974 return slots->used_slots++; 975 } 976 977 /* 978 * Move a changed memslot backwards in the array by shifting existing slots 979 * with a higher GFN toward the front of the array. 
Note, the changed memslot 980 * itself is not preserved in the array, i.e. not swapped at this time, only 981 * its new index into the array is tracked. Returns the changed memslot's 982 * current index into the memslots array. 983 */ 984 static inline int kvm_memslot_move_backward(struct kvm_memslots *slots, 985 struct kvm_memory_slot *memslot) 986 { 987 struct kvm_memory_slot *mslots = slots->memslots; 988 int i; 989 990 if (WARN_ON_ONCE(slots->id_to_index[memslot->id] == -1) || 991 WARN_ON_ONCE(!slots->used_slots)) 992 return -1; 993 994 /* 995 * Move the target memslot backward in the array by shifting existing 996 * memslots with a higher GFN (than the target memslot) towards the 997 * front of the array. 998 */ 999 for (i = slots->id_to_index[memslot->id]; i < slots->used_slots - 1; i++) { 1000 if (memslot->base_gfn > mslots[i + 1].base_gfn) 1001 break; 1002 1003 WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn); 1004 1005 /* Shift the next memslot forward one and update its index. */ 1006 mslots[i] = mslots[i + 1]; 1007 slots->id_to_index[mslots[i].id] = i; 1008 } 1009 return i; 1010 } 1011 1012 /* 1013 * Move a changed memslot forwards in the array by shifting existing slots with 1014 * a lower GFN toward the back of the array. Note, the changed memslot itself 1015 * is not preserved in the array, i.e. not swapped at this time, only its new 1016 * index into the array is tracked. Returns the changed memslot's final index 1017 * into the memslots array. 1018 */ 1019 static inline int kvm_memslot_move_forward(struct kvm_memslots *slots, 1020 struct kvm_memory_slot *memslot, 1021 int start) 1022 { 1023 struct kvm_memory_slot *mslots = slots->memslots; 1024 int i; 1025 1026 for (i = start; i > 0; i--) { 1027 if (memslot->base_gfn < mslots[i - 1].base_gfn) 1028 break; 1029 1030 WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn); 1031 1032 /* Shift the next memslot back one and update its index. 
*/
		mslots[i] = mslots[i - 1];
		slots->id_to_index[mslots[i].id] = i;
	}
	/* i is @start unchanged when the loop body never ran. */
	return i;
}

/*
 * Re-sort memslots based on their GFN to account for an added, deleted, or
 * moved memslot.  Sorting memslots by GFN allows using a binary search during
 * memslot lookup.
 *
 * IMPORTANT: Slots are sorted from highest GFN to lowest GFN!  I.e. the entry
 * at memslots[0] has the highest GFN.
 *
 * The sorting algorithm takes advantage of having initially sorted memslots
 * and knowing the position of the changed memslot.  Sorting is also optimized
 * by not swapping the updated memslot and instead only shifting other memslots
 * and tracking the new index for the update memslot.  Only once its final
 * index is known is the updated memslot copied into its position in the array.
 *
 *  - When deleting a memslot, the deleted memslot simply needs to be moved to
 *    the end of the array.
 *
 *  - When creating a memslot, the algorithm "inserts" the new memslot at the
 *    end of the array and then moves it forward to its correct location.
 *
 *  - When moving a memslot, the algorithm first moves the updated memslot
 *    backward to handle the scenario where the memslot's GFN was changed to a
 *    lower value.  update_memslots() then falls through and runs the same flow
 *    as creating a memslot to move the memslot forward to handle the scenario
 *    where its GFN was changed to a higher value.
 *
 * Note, slots are sorted from highest->lowest instead of lowest->highest for
 * historical reasons.  Originally, invalid memslots were denoted by having
 * GFN=0, thus sorting from highest->lowest naturally sorted invalid memslots
 * to the end of the array.  The current algorithm uses dedicated logic to
 * delete a memslot and thus does not rely on invalid memslots having GFN=0.
1070 * 1071 * The other historical motiviation for highest->lowest was to improve the 1072 * performance of memslot lookup. KVM originally used a linear search starting 1073 * at memslots[0]. On x86, the largest memslot usually has one of the highest, 1074 * if not *the* highest, GFN, as the bulk of the guest's RAM is located in a 1075 * single memslot above the 4gb boundary. As the largest memslot is also the 1076 * most likely to be referenced, sorting it to the front of the array was 1077 * advantageous. The current binary search starts from the middle of the array 1078 * and uses an LRU pointer to improve performance for all memslots and GFNs. 1079 */ 1080 static void update_memslots(struct kvm_memslots *slots, 1081 struct kvm_memory_slot *memslot, 1082 enum kvm_mr_change change) 1083 { 1084 int i; 1085 1086 if (change == KVM_MR_DELETE) { 1087 kvm_memslot_delete(slots, memslot); 1088 } else { 1089 if (change == KVM_MR_CREATE) 1090 i = kvm_memslot_insert_back(slots); 1091 else 1092 i = kvm_memslot_move_backward(slots, memslot); 1093 i = kvm_memslot_move_forward(slots, memslot, i); 1094 1095 /* 1096 * Copy the memslot to its new position in memslots and update 1097 * its index accordingly. 
1098 */ 1099 slots->memslots[i] = *memslot; 1100 slots->id_to_index[memslot->id] = i; 1101 } 1102 } 1103 1104 static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem) 1105 { 1106 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; 1107 1108 #ifdef __KVM_HAVE_READONLY_MEM 1109 valid_flags |= KVM_MEM_READONLY; 1110 #endif 1111 1112 if (mem->flags & ~valid_flags) 1113 return -EINVAL; 1114 1115 return 0; 1116 } 1117 1118 static struct kvm_memslots *install_new_memslots(struct kvm *kvm, 1119 int as_id, struct kvm_memslots *slots) 1120 { 1121 struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id); 1122 u64 gen = old_memslots->generation; 1123 1124 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); 1125 slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; 1126 1127 rcu_assign_pointer(kvm->memslots[as_id], slots); 1128 synchronize_srcu_expedited(&kvm->srcu); 1129 1130 /* 1131 * Increment the new memslot generation a second time, dropping the 1132 * update in-progress flag and incrementing the generation based on 1133 * the number of address spaces. This provides a unique and easily 1134 * identifiable generation number while the memslots are in flux. 1135 */ 1136 gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; 1137 1138 /* 1139 * Generations must be unique even across address spaces. We do not need 1140 * a global counter for that, instead the generation space is evenly split 1141 * across address spaces. For example, with two address spaces, address 1142 * space 0 will use generations 0, 2, 4, ... while address space 1 will 1143 * use generations 1, 3, 5, ... 
1144 */ 1145 gen += KVM_ADDRESS_SPACE_NUM; 1146 1147 kvm_arch_memslots_updated(kvm, gen); 1148 1149 slots->generation = gen; 1150 1151 return old_memslots; 1152 } 1153 1154 /* 1155 * Note, at a minimum, the current number of used slots must be allocated, even 1156 * when deleting a memslot, as we need a complete duplicate of the memslots for 1157 * use when invalidating a memslot prior to deleting/moving the memslot. 1158 */ 1159 static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old, 1160 enum kvm_mr_change change) 1161 { 1162 struct kvm_memslots *slots; 1163 size_t old_size, new_size; 1164 1165 old_size = sizeof(struct kvm_memslots) + 1166 (sizeof(struct kvm_memory_slot) * old->used_slots); 1167 1168 if (change == KVM_MR_CREATE) 1169 new_size = old_size + sizeof(struct kvm_memory_slot); 1170 else 1171 new_size = old_size; 1172 1173 slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT); 1174 if (likely(slots)) 1175 memcpy(slots, old, old_size); 1176 1177 return slots; 1178 } 1179 1180 static int kvm_set_memslot(struct kvm *kvm, 1181 const struct kvm_userspace_memory_region *mem, 1182 struct kvm_memory_slot *old, 1183 struct kvm_memory_slot *new, int as_id, 1184 enum kvm_mr_change change) 1185 { 1186 struct kvm_memory_slot *slot; 1187 struct kvm_memslots *slots; 1188 int r; 1189 1190 slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change); 1191 if (!slots) 1192 return -ENOMEM; 1193 1194 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) { 1195 /* 1196 * Note, the INVALID flag needs to be in the appropriate entry 1197 * in the freshly allocated memslots, not in @old or @new. 1198 */ 1199 slot = id_to_memslot(slots, old->id); 1200 slot->flags |= KVM_MEMSLOT_INVALID; 1201 1202 /* 1203 * We can re-use the old memslots, the only difference from the 1204 * newly installed memslots is the invalid flag, which will get 1205 * dropped by update_memslots anyway. We'll also revert to the 1206 * old memslots if preparing the new memory region fails. 
1207 */ 1208 slots = install_new_memslots(kvm, as_id, slots); 1209 1210 /* From this point no new shadow pages pointing to a deleted, 1211 * or moved, memslot will be created. 1212 * 1213 * validation of sp->gfn happens in: 1214 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) 1215 * - kvm_is_visible_gfn (mmu_check_root) 1216 */ 1217 kvm_arch_flush_shadow_memslot(kvm, slot); 1218 } 1219 1220 r = kvm_arch_prepare_memory_region(kvm, new, mem, change); 1221 if (r) 1222 goto out_slots; 1223 1224 update_memslots(slots, new, change); 1225 slots = install_new_memslots(kvm, as_id, slots); 1226 1227 kvm_arch_commit_memory_region(kvm, mem, old, new, change); 1228 1229 kvfree(slots); 1230 return 0; 1231 1232 out_slots: 1233 if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) 1234 slots = install_new_memslots(kvm, as_id, slots); 1235 kvfree(slots); 1236 return r; 1237 } 1238 1239 static int kvm_delete_memslot(struct kvm *kvm, 1240 const struct kvm_userspace_memory_region *mem, 1241 struct kvm_memory_slot *old, int as_id) 1242 { 1243 struct kvm_memory_slot new; 1244 int r; 1245 1246 if (!old->npages) 1247 return -EINVAL; 1248 1249 memset(&new, 0, sizeof(new)); 1250 new.id = old->id; 1251 /* 1252 * This is only for debugging purpose; it should never be referenced 1253 * for a removed memslot. 1254 */ 1255 new.as_id = as_id; 1256 1257 r = kvm_set_memslot(kvm, mem, old, &new, as_id, KVM_MR_DELETE); 1258 if (r) 1259 return r; 1260 1261 kvm_free_memslot(kvm, old); 1262 return 0; 1263 } 1264 1265 /* 1266 * Allocate some memory and give it an address in the guest physical address 1267 * space. 1268 * 1269 * Discontiguous memory is allowed, mostly for framebuffers. 1270 * 1271 * Must be called holding kvm->slots_lock for write. 
1272 */ 1273 int __kvm_set_memory_region(struct kvm *kvm, 1274 const struct kvm_userspace_memory_region *mem) 1275 { 1276 struct kvm_memory_slot old, new; 1277 struct kvm_memory_slot *tmp; 1278 enum kvm_mr_change change; 1279 int as_id, id; 1280 int r; 1281 1282 r = check_memory_region_flags(mem); 1283 if (r) 1284 return r; 1285 1286 as_id = mem->slot >> 16; 1287 id = (u16)mem->slot; 1288 1289 /* General sanity checks */ 1290 if (mem->memory_size & (PAGE_SIZE - 1)) 1291 return -EINVAL; 1292 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 1293 return -EINVAL; 1294 /* We can read the guest memory with __xxx_user() later on. */ 1295 if ((mem->userspace_addr & (PAGE_SIZE - 1)) || 1296 !access_ok((void __user *)(unsigned long)mem->userspace_addr, 1297 mem->memory_size)) 1298 return -EINVAL; 1299 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM) 1300 return -EINVAL; 1301 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 1302 return -EINVAL; 1303 1304 /* 1305 * Make a full copy of the old memslot, the pointer will become stale 1306 * when the memslots are re-sorted by update_memslots(), and the old 1307 * memslot needs to be referenced after calling update_memslots(), e.g. 1308 * to free its resources and for arch specific behavior. 
1309 */ 1310 tmp = id_to_memslot(__kvm_memslots(kvm, as_id), id); 1311 if (tmp) { 1312 old = *tmp; 1313 tmp = NULL; 1314 } else { 1315 memset(&old, 0, sizeof(old)); 1316 old.id = id; 1317 } 1318 1319 if (!mem->memory_size) 1320 return kvm_delete_memslot(kvm, mem, &old, as_id); 1321 1322 new.as_id = as_id; 1323 new.id = id; 1324 new.base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 1325 new.npages = mem->memory_size >> PAGE_SHIFT; 1326 new.flags = mem->flags; 1327 new.userspace_addr = mem->userspace_addr; 1328 1329 if (new.npages > KVM_MEM_MAX_NR_PAGES) 1330 return -EINVAL; 1331 1332 if (!old.npages) { 1333 change = KVM_MR_CREATE; 1334 new.dirty_bitmap = NULL; 1335 memset(&new.arch, 0, sizeof(new.arch)); 1336 } else { /* Modify an existing slot. */ 1337 if ((new.userspace_addr != old.userspace_addr) || 1338 (new.npages != old.npages) || 1339 ((new.flags ^ old.flags) & KVM_MEM_READONLY)) 1340 return -EINVAL; 1341 1342 if (new.base_gfn != old.base_gfn) 1343 change = KVM_MR_MOVE; 1344 else if (new.flags != old.flags) 1345 change = KVM_MR_FLAGS_ONLY; 1346 else /* Nothing to change. */ 1347 return 0; 1348 1349 /* Copy dirty_bitmap and arch from the current memslot. 
*/ 1350 new.dirty_bitmap = old.dirty_bitmap; 1351 memcpy(&new.arch, &old.arch, sizeof(new.arch)); 1352 } 1353 1354 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { 1355 /* Check for overlaps */ 1356 kvm_for_each_memslot(tmp, __kvm_memslots(kvm, as_id)) { 1357 if (tmp->id == id) 1358 continue; 1359 if (!((new.base_gfn + new.npages <= tmp->base_gfn) || 1360 (new.base_gfn >= tmp->base_gfn + tmp->npages))) 1361 return -EEXIST; 1362 } 1363 } 1364 1365 /* Allocate/free page dirty bitmap as needed */ 1366 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 1367 new.dirty_bitmap = NULL; 1368 else if (!new.dirty_bitmap && !kvm->dirty_ring_size) { 1369 r = kvm_alloc_dirty_bitmap(&new); 1370 if (r) 1371 return r; 1372 1373 if (kvm_dirty_log_manual_protect_and_init_set(kvm)) 1374 bitmap_set(new.dirty_bitmap, 0, new.npages); 1375 } 1376 1377 r = kvm_set_memslot(kvm, mem, &old, &new, as_id, change); 1378 if (r) 1379 goto out_bitmap; 1380 1381 if (old.dirty_bitmap && !new.dirty_bitmap) 1382 kvm_destroy_dirty_bitmap(&old); 1383 return 0; 1384 1385 out_bitmap: 1386 if (new.dirty_bitmap && !old.dirty_bitmap) 1387 kvm_destroy_dirty_bitmap(&new); 1388 return r; 1389 } 1390 EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 1391 1392 int kvm_set_memory_region(struct kvm *kvm, 1393 const struct kvm_userspace_memory_region *mem) 1394 { 1395 int r; 1396 1397 mutex_lock(&kvm->slots_lock); 1398 r = __kvm_set_memory_region(kvm, mem); 1399 mutex_unlock(&kvm->slots_lock); 1400 return r; 1401 } 1402 EXPORT_SYMBOL_GPL(kvm_set_memory_region); 1403 1404 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 1405 struct kvm_userspace_memory_region *mem) 1406 { 1407 if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) 1408 return -EINVAL; 1409 1410 return kvm_set_memory_region(kvm, mem); 1411 } 1412 1413 #ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 1414 /** 1415 * kvm_get_dirty_log - get a snapshot of dirty pages 1416 * @kvm: pointer to kvm instance 1417 * @log: slot id and address to which we copy the log 
1418 * @is_dirty: set to '1' if any dirty pages were found 1419 * @memslot: set to the associated memslot, always valid on success 1420 */ 1421 int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log, 1422 int *is_dirty, struct kvm_memory_slot **memslot) 1423 { 1424 struct kvm_memslots *slots; 1425 int i, as_id, id; 1426 unsigned long n; 1427 unsigned long any = 0; 1428 1429 /* Dirty ring tracking is exclusive to dirty log tracking */ 1430 if (kvm->dirty_ring_size) 1431 return -ENXIO; 1432 1433 *memslot = NULL; 1434 *is_dirty = 0; 1435 1436 as_id = log->slot >> 16; 1437 id = (u16)log->slot; 1438 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1439 return -EINVAL; 1440 1441 slots = __kvm_memslots(kvm, as_id); 1442 *memslot = id_to_memslot(slots, id); 1443 if (!(*memslot) || !(*memslot)->dirty_bitmap) 1444 return -ENOENT; 1445 1446 kvm_arch_sync_dirty_log(kvm, *memslot); 1447 1448 n = kvm_dirty_bitmap_bytes(*memslot); 1449 1450 for (i = 0; !any && i < n/sizeof(long); ++i) 1451 any = (*memslot)->dirty_bitmap[i]; 1452 1453 if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n)) 1454 return -EFAULT; 1455 1456 if (any) 1457 *is_dirty = 1; 1458 return 0; 1459 } 1460 EXPORT_SYMBOL_GPL(kvm_get_dirty_log); 1461 1462 #else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ 1463 /** 1464 * kvm_get_dirty_log_protect - get a snapshot of dirty pages 1465 * and reenable dirty page tracking for the corresponding pages. 1466 * @kvm: pointer to kvm instance 1467 * @log: slot id and address to which we copy the log 1468 * 1469 * We need to keep it in mind that VCPU threads can write to the bitmap 1470 * concurrently. So, to avoid losing track of dirty pages we keep the 1471 * following order: 1472 * 1473 * 1. Take a snapshot of the bit and clear it if needed. 1474 * 2. Write protect the corresponding page. 1475 * 3. Copy the snapshot to the userspace. 1476 * 4. Upon return caller flushes TLB's if needed. 
1477 * 1478 * Between 2 and 4, the guest may write to the page using the remaining TLB 1479 * entry. This is not a problem because the page is reported dirty using 1480 * the snapshot taken before and step 4 ensures that writes done after 1481 * exiting to userspace will be logged for the next call. 1482 * 1483 */ 1484 static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log) 1485 { 1486 struct kvm_memslots *slots; 1487 struct kvm_memory_slot *memslot; 1488 int i, as_id, id; 1489 unsigned long n; 1490 unsigned long *dirty_bitmap; 1491 unsigned long *dirty_bitmap_buffer; 1492 bool flush; 1493 1494 /* Dirty ring tracking is exclusive to dirty log tracking */ 1495 if (kvm->dirty_ring_size) 1496 return -ENXIO; 1497 1498 as_id = log->slot >> 16; 1499 id = (u16)log->slot; 1500 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1501 return -EINVAL; 1502 1503 slots = __kvm_memslots(kvm, as_id); 1504 memslot = id_to_memslot(slots, id); 1505 if (!memslot || !memslot->dirty_bitmap) 1506 return -ENOENT; 1507 1508 dirty_bitmap = memslot->dirty_bitmap; 1509 1510 kvm_arch_sync_dirty_log(kvm, memslot); 1511 1512 n = kvm_dirty_bitmap_bytes(memslot); 1513 flush = false; 1514 if (kvm->manual_dirty_log_protect) { 1515 /* 1516 * Unlike kvm_get_dirty_log, we always return false in *flush, 1517 * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There 1518 * is some code duplication between this function and 1519 * kvm_get_dirty_log, but hopefully all architecture 1520 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log 1521 * can be eliminated. 
1522 */ 1523 dirty_bitmap_buffer = dirty_bitmap; 1524 } else { 1525 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); 1526 memset(dirty_bitmap_buffer, 0, n); 1527 1528 spin_lock(&kvm->mmu_lock); 1529 for (i = 0; i < n / sizeof(long); i++) { 1530 unsigned long mask; 1531 gfn_t offset; 1532 1533 if (!dirty_bitmap[i]) 1534 continue; 1535 1536 flush = true; 1537 mask = xchg(&dirty_bitmap[i], 0); 1538 dirty_bitmap_buffer[i] = mask; 1539 1540 offset = i * BITS_PER_LONG; 1541 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, 1542 offset, mask); 1543 } 1544 spin_unlock(&kvm->mmu_lock); 1545 } 1546 1547 if (flush) 1548 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); 1549 1550 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) 1551 return -EFAULT; 1552 return 0; 1553 } 1554 1555 1556 /** 1557 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot 1558 * @kvm: kvm instance 1559 * @log: slot id and address to which we copy the log 1560 * 1561 * Steps 1-4 below provide general overview of dirty page logging. See 1562 * kvm_get_dirty_log_protect() function description for additional details. 1563 * 1564 * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we 1565 * always flush the TLB (step 4) even if previous step failed and the dirty 1566 * bitmap may be corrupt. Regardless of previous outcome the KVM logging API 1567 * does not preclude user space subsequent dirty log read. Flushing TLB ensures 1568 * writes will be marked dirty for next log read. 1569 * 1570 * 1. Take a snapshot of the bit and clear it if needed. 1571 * 2. Write protect the corresponding page. 1572 * 3. Copy the snapshot to the userspace. 1573 * 4. Flush TLB's if needed. 
1574 */ 1575 static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 1576 struct kvm_dirty_log *log) 1577 { 1578 int r; 1579 1580 mutex_lock(&kvm->slots_lock); 1581 1582 r = kvm_get_dirty_log_protect(kvm, log); 1583 1584 mutex_unlock(&kvm->slots_lock); 1585 return r; 1586 } 1587 1588 /** 1589 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap 1590 * and reenable dirty page tracking for the corresponding pages. 1591 * @kvm: pointer to kvm instance 1592 * @log: slot id and address from which to fetch the bitmap of dirty pages 1593 */ 1594 static int kvm_clear_dirty_log_protect(struct kvm *kvm, 1595 struct kvm_clear_dirty_log *log) 1596 { 1597 struct kvm_memslots *slots; 1598 struct kvm_memory_slot *memslot; 1599 int as_id, id; 1600 gfn_t offset; 1601 unsigned long i, n; 1602 unsigned long *dirty_bitmap; 1603 unsigned long *dirty_bitmap_buffer; 1604 bool flush; 1605 1606 /* Dirty ring tracking is exclusive to dirty log tracking */ 1607 if (kvm->dirty_ring_size) 1608 return -ENXIO; 1609 1610 as_id = log->slot >> 16; 1611 id = (u16)log->slot; 1612 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1613 return -EINVAL; 1614 1615 if (log->first_page & 63) 1616 return -EINVAL; 1617 1618 slots = __kvm_memslots(kvm, as_id); 1619 memslot = id_to_memslot(slots, id); 1620 if (!memslot || !memslot->dirty_bitmap) 1621 return -ENOENT; 1622 1623 dirty_bitmap = memslot->dirty_bitmap; 1624 1625 n = ALIGN(log->num_pages, BITS_PER_LONG) / 8; 1626 1627 if (log->first_page > memslot->npages || 1628 log->num_pages > memslot->npages - log->first_page || 1629 (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63))) 1630 return -EINVAL; 1631 1632 kvm_arch_sync_dirty_log(kvm, memslot); 1633 1634 flush = false; 1635 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); 1636 if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n)) 1637 return -EFAULT; 1638 1639 spin_lock(&kvm->mmu_lock); 1640 for (offset = log->first_page, i = offset / 
BITS_PER_LONG, 1641 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--; 1642 i++, offset += BITS_PER_LONG) { 1643 unsigned long mask = *dirty_bitmap_buffer++; 1644 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i]; 1645 if (!mask) 1646 continue; 1647 1648 mask &= atomic_long_fetch_andnot(mask, p); 1649 1650 /* 1651 * mask contains the bits that really have been cleared. This 1652 * never includes any bits beyond the length of the memslot (if 1653 * the length is not aligned to 64 pages), therefore it is not 1654 * a problem if userspace sets them in log->dirty_bitmap. 1655 */ 1656 if (mask) { 1657 flush = true; 1658 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, 1659 offset, mask); 1660 } 1661 } 1662 spin_unlock(&kvm->mmu_lock); 1663 1664 if (flush) 1665 kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); 1666 1667 return 0; 1668 } 1669 1670 static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, 1671 struct kvm_clear_dirty_log *log) 1672 { 1673 int r; 1674 1675 mutex_lock(&kvm->slots_lock); 1676 1677 r = kvm_clear_dirty_log_protect(kvm, log); 1678 1679 mutex_unlock(&kvm->slots_lock); 1680 return r; 1681 } 1682 #endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */ 1683 1684 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 1685 { 1686 return __gfn_to_memslot(kvm_memslots(kvm), gfn); 1687 } 1688 EXPORT_SYMBOL_GPL(gfn_to_memslot); 1689 1690 struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn) 1691 { 1692 return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn); 1693 } 1694 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot); 1695 1696 bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) 1697 { 1698 struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn); 1699 1700 return kvm_is_visible_memslot(memslot); 1701 } 1702 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); 1703 1704 bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) 1705 { 1706 struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1707 1708 
return kvm_is_visible_memslot(memslot); 1709 } 1710 EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn); 1711 1712 unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn) 1713 { 1714 struct vm_area_struct *vma; 1715 unsigned long addr, size; 1716 1717 size = PAGE_SIZE; 1718 1719 addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL); 1720 if (kvm_is_error_hva(addr)) 1721 return PAGE_SIZE; 1722 1723 mmap_read_lock(current->mm); 1724 vma = find_vma(current->mm, addr); 1725 if (!vma) 1726 goto out; 1727 1728 size = vma_kernel_pagesize(vma); 1729 1730 out: 1731 mmap_read_unlock(current->mm); 1732 1733 return size; 1734 } 1735 1736 static bool memslot_is_readonly(struct kvm_memory_slot *slot) 1737 { 1738 return slot->flags & KVM_MEM_READONLY; 1739 } 1740 1741 static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, 1742 gfn_t *nr_pages, bool write) 1743 { 1744 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 1745 return KVM_HVA_ERR_BAD; 1746 1747 if (memslot_is_readonly(slot) && write) 1748 return KVM_HVA_ERR_RO_BAD; 1749 1750 if (nr_pages) 1751 *nr_pages = slot->npages - (gfn - slot->base_gfn); 1752 1753 return __gfn_to_hva_memslot(slot, gfn); 1754 } 1755 1756 static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn, 1757 gfn_t *nr_pages) 1758 { 1759 return __gfn_to_hva_many(slot, gfn, nr_pages, true); 1760 } 1761 1762 unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, 1763 gfn_t gfn) 1764 { 1765 return gfn_to_hva_many(slot, gfn, NULL); 1766 } 1767 EXPORT_SYMBOL_GPL(gfn_to_hva_memslot); 1768 1769 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 1770 { 1771 return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL); 1772 } 1773 EXPORT_SYMBOL_GPL(gfn_to_hva); 1774 1775 unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn) 1776 { 1777 return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL); 1778 } 1779 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva); 1780 1781 /* 1782 * Return the hva of a @gfn and 
the R/W attribute if possible. 1783 * 1784 * @slot: the kvm_memory_slot which contains @gfn 1785 * @gfn: the gfn to be translated 1786 * @writable: used to return the read/write attribute of the @slot if the hva 1787 * is valid and @writable is not NULL 1788 */ 1789 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, 1790 gfn_t gfn, bool *writable) 1791 { 1792 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false); 1793 1794 if (!kvm_is_error_hva(hva) && writable) 1795 *writable = !memslot_is_readonly(slot); 1796 1797 return hva; 1798 } 1799 1800 unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) 1801 { 1802 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 1803 1804 return gfn_to_hva_memslot_prot(slot, gfn, writable); 1805 } 1806 1807 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable) 1808 { 1809 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1810 1811 return gfn_to_hva_memslot_prot(slot, gfn, writable); 1812 } 1813 1814 static inline int check_user_page_hwpoison(unsigned long addr) 1815 { 1816 int rc, flags = FOLL_HWPOISON | FOLL_WRITE; 1817 1818 rc = get_user_pages(addr, 1, flags, NULL, NULL); 1819 return rc == -EHWPOISON; 1820 } 1821 1822 /* 1823 * The fast path to get the writable pfn which will be stored in @pfn, 1824 * true indicates success, otherwise false is returned. It's also the 1825 * only part that runs if we can in atomic context. 1826 */ 1827 static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, 1828 bool *writable, kvm_pfn_t *pfn) 1829 { 1830 struct page *page[1]; 1831 1832 /* 1833 * Fast pin a writable pfn only if it is a write fault request 1834 * or the caller allows to map a writable pfn for a read fault 1835 * request. 
1836 */ 1837 if (!(write_fault || writable)) 1838 return false; 1839 1840 if (get_user_page_fast_only(addr, FOLL_WRITE, page)) { 1841 *pfn = page_to_pfn(page[0]); 1842 1843 if (writable) 1844 *writable = true; 1845 return true; 1846 } 1847 1848 return false; 1849 } 1850 1851 /* 1852 * The slow path to get the pfn of the specified host virtual address, 1853 * 1 indicates success, -errno is returned if error is detected. 1854 */ 1855 static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, 1856 bool *writable, kvm_pfn_t *pfn) 1857 { 1858 unsigned int flags = FOLL_HWPOISON; 1859 struct page *page; 1860 int npages = 0; 1861 1862 might_sleep(); 1863 1864 if (writable) 1865 *writable = write_fault; 1866 1867 if (write_fault) 1868 flags |= FOLL_WRITE; 1869 if (async) 1870 flags |= FOLL_NOWAIT; 1871 1872 npages = get_user_pages_unlocked(addr, 1, &page, flags); 1873 if (npages != 1) 1874 return npages; 1875 1876 /* map read fault as writable if possible */ 1877 if (unlikely(!write_fault) && writable) { 1878 struct page *wpage; 1879 1880 if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) { 1881 *writable = true; 1882 put_page(page); 1883 page = wpage; 1884 } 1885 } 1886 *pfn = page_to_pfn(page); 1887 return npages; 1888 } 1889 1890 static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault) 1891 { 1892 if (unlikely(!(vma->vm_flags & VM_READ))) 1893 return false; 1894 1895 if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE)))) 1896 return false; 1897 1898 return true; 1899 } 1900 1901 static int hva_to_pfn_remapped(struct vm_area_struct *vma, 1902 unsigned long addr, bool *async, 1903 bool write_fault, bool *writable, 1904 kvm_pfn_t *p_pfn) 1905 { 1906 unsigned long pfn; 1907 int r; 1908 1909 r = follow_pfn(vma, addr, &pfn); 1910 if (r) { 1911 /* 1912 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does 1913 * not call the fault handler, so do it here. 
1914 */ 1915 bool unlocked = false; 1916 r = fixup_user_fault(current->mm, addr, 1917 (write_fault ? FAULT_FLAG_WRITE : 0), 1918 &unlocked); 1919 if (unlocked) 1920 return -EAGAIN; 1921 if (r) 1922 return r; 1923 1924 r = follow_pfn(vma, addr, &pfn); 1925 if (r) 1926 return r; 1927 1928 } 1929 1930 if (writable) 1931 *writable = true; 1932 1933 /* 1934 * Get a reference here because callers of *hva_to_pfn* and 1935 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the 1936 * returned pfn. This is only needed if the VMA has VM_MIXEDMAP 1937 * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will 1938 * simply do nothing for reserved pfns. 1939 * 1940 * Whoever called remap_pfn_range is also going to call e.g. 1941 * unmap_mapping_range before the underlying pages are freed, 1942 * causing a call to our MMU notifier. 1943 */ 1944 kvm_get_pfn(pfn); 1945 1946 *p_pfn = pfn; 1947 return 0; 1948 } 1949 1950 /* 1951 * Pin guest page in memory and return its pfn. 1952 * @addr: host virtual address which maps memory to the guest 1953 * @atomic: whether this function can sleep 1954 * @async: whether this function need to wait IO complete if the 1955 * host page is not in the memory 1956 * @write_fault: whether we should get a writable host page 1957 * @writable: whether it allows to map a writable host page for !@write_fault 1958 * 1959 * The function will map a writable host page for these two cases: 1960 * 1): @write_fault = true 1961 * 2): @write_fault = false && @writable, @writable will tell the caller 1962 * whether the mapping is writable. 
1963 */ 1964 static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, 1965 bool write_fault, bool *writable) 1966 { 1967 struct vm_area_struct *vma; 1968 kvm_pfn_t pfn = 0; 1969 int npages, r; 1970 1971 /* we can do it either atomically or asynchronously, not both */ 1972 BUG_ON(atomic && async); 1973 1974 if (hva_to_pfn_fast(addr, write_fault, writable, &pfn)) 1975 return pfn; 1976 1977 if (atomic) 1978 return KVM_PFN_ERR_FAULT; 1979 1980 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn); 1981 if (npages == 1) 1982 return pfn; 1983 1984 mmap_read_lock(current->mm); 1985 if (npages == -EHWPOISON || 1986 (!async && check_user_page_hwpoison(addr))) { 1987 pfn = KVM_PFN_ERR_HWPOISON; 1988 goto exit; 1989 } 1990 1991 retry: 1992 vma = find_vma_intersection(current->mm, addr, addr + 1); 1993 1994 if (vma == NULL) 1995 pfn = KVM_PFN_ERR_FAULT; 1996 else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { 1997 r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn); 1998 if (r == -EAGAIN) 1999 goto retry; 2000 if (r < 0) 2001 pfn = KVM_PFN_ERR_FAULT; 2002 } else { 2003 if (async && vma_is_valid(vma, write_fault)) 2004 *async = true; 2005 pfn = KVM_PFN_ERR_FAULT; 2006 } 2007 exit: 2008 mmap_read_unlock(current->mm); 2009 return pfn; 2010 } 2011 2012 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, 2013 bool atomic, bool *async, bool write_fault, 2014 bool *writable) 2015 { 2016 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); 2017 2018 if (addr == KVM_HVA_ERR_RO_BAD) { 2019 if (writable) 2020 *writable = false; 2021 return KVM_PFN_ERR_RO_FAULT; 2022 } 2023 2024 if (kvm_is_error_hva(addr)) { 2025 if (writable) 2026 *writable = false; 2027 return KVM_PFN_NOSLOT; 2028 } 2029 2030 /* Do not map writable pfn in the readonly memslot. 
*/ 2031 if (writable && memslot_is_readonly(slot)) { 2032 *writable = false; 2033 writable = NULL; 2034 } 2035 2036 return hva_to_pfn(addr, atomic, async, write_fault, 2037 writable); 2038 } 2039 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); 2040 2041 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, 2042 bool *writable) 2043 { 2044 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, 2045 write_fault, writable); 2046 } 2047 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 2048 2049 kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) 2050 { 2051 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); 2052 } 2053 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); 2054 2055 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) 2056 { 2057 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); 2058 } 2059 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); 2060 2061 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn) 2062 { 2063 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 2064 } 2065 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic); 2066 2067 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 2068 { 2069 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); 2070 } 2071 EXPORT_SYMBOL_GPL(gfn_to_pfn); 2072 2073 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) 2074 { 2075 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 2076 } 2077 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn); 2078 2079 int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 2080 struct page **pages, int nr_pages) 2081 { 2082 unsigned long addr; 2083 gfn_t entry = 0; 2084 2085 addr = gfn_to_hva_many(slot, gfn, &entry); 2086 if (kvm_is_error_hva(addr)) 2087 return -1; 2088 2089 if (entry < nr_pages) 2090 return 0; 2091 2092 return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages); 2093 } 2094 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 2095 
2096 static struct page *kvm_pfn_to_page(kvm_pfn_t pfn) 2097 { 2098 if (is_error_noslot_pfn(pfn)) 2099 return KVM_ERR_PTR_BAD_PAGE; 2100 2101 if (kvm_is_reserved_pfn(pfn)) { 2102 WARN_ON(1); 2103 return KVM_ERR_PTR_BAD_PAGE; 2104 } 2105 2106 return pfn_to_page(pfn); 2107 } 2108 2109 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 2110 { 2111 kvm_pfn_t pfn; 2112 2113 pfn = gfn_to_pfn(kvm, gfn); 2114 2115 return kvm_pfn_to_page(pfn); 2116 } 2117 EXPORT_SYMBOL_GPL(gfn_to_page); 2118 2119 void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache) 2120 { 2121 if (pfn == 0) 2122 return; 2123 2124 if (cache) 2125 cache->pfn = cache->gfn = 0; 2126 2127 if (dirty) 2128 kvm_release_pfn_dirty(pfn); 2129 else 2130 kvm_release_pfn_clean(pfn); 2131 } 2132 2133 static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn, 2134 struct gfn_to_pfn_cache *cache, u64 gen) 2135 { 2136 kvm_release_pfn(cache->pfn, cache->dirty, cache); 2137 2138 cache->pfn = gfn_to_pfn_memslot(slot, gfn); 2139 cache->gfn = gfn; 2140 cache->dirty = false; 2141 cache->generation = gen; 2142 } 2143 2144 static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn, 2145 struct kvm_host_map *map, 2146 struct gfn_to_pfn_cache *cache, 2147 bool atomic) 2148 { 2149 kvm_pfn_t pfn; 2150 void *hva = NULL; 2151 struct page *page = KVM_UNMAPPED_PAGE; 2152 struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn); 2153 u64 gen = slots->generation; 2154 2155 if (!map) 2156 return -EINVAL; 2157 2158 if (cache) { 2159 if (!cache->pfn || cache->gfn != gfn || 2160 cache->generation != gen) { 2161 if (atomic) 2162 return -EAGAIN; 2163 kvm_cache_gfn_to_pfn(slot, gfn, cache, gen); 2164 } 2165 pfn = cache->pfn; 2166 } else { 2167 if (atomic) 2168 return -EAGAIN; 2169 pfn = gfn_to_pfn_memslot(slot, gfn); 2170 } 2171 if (is_error_noslot_pfn(pfn)) 2172 return -EINVAL; 2173 2174 if (pfn_valid(pfn)) { 2175 page = pfn_to_page(pfn); 2176 if (atomic) 2177 hva = kmap_atomic(page); 2178 else 
2179 hva = kmap(page); 2180 #ifdef CONFIG_HAS_IOMEM 2181 } else if (!atomic) { 2182 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB); 2183 } else { 2184 return -EINVAL; 2185 #endif 2186 } 2187 2188 if (!hva) 2189 return -EFAULT; 2190 2191 map->page = page; 2192 map->hva = hva; 2193 map->pfn = pfn; 2194 map->gfn = gfn; 2195 2196 return 0; 2197 } 2198 2199 int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, 2200 struct gfn_to_pfn_cache *cache, bool atomic) 2201 { 2202 return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map, 2203 cache, atomic); 2204 } 2205 EXPORT_SYMBOL_GPL(kvm_map_gfn); 2206 2207 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) 2208 { 2209 return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map, 2210 NULL, false); 2211 } 2212 EXPORT_SYMBOL_GPL(kvm_vcpu_map); 2213 2214 static void __kvm_unmap_gfn(struct kvm *kvm, 2215 struct kvm_memory_slot *memslot, 2216 struct kvm_host_map *map, 2217 struct gfn_to_pfn_cache *cache, 2218 bool dirty, bool atomic) 2219 { 2220 if (!map) 2221 return; 2222 2223 if (!map->hva) 2224 return; 2225 2226 if (map->page != KVM_UNMAPPED_PAGE) { 2227 if (atomic) 2228 kunmap_atomic(map->hva); 2229 else 2230 kunmap(map->page); 2231 } 2232 #ifdef CONFIG_HAS_IOMEM 2233 else if (!atomic) 2234 memunmap(map->hva); 2235 else 2236 WARN_ONCE(1, "Unexpected unmapping in atomic context"); 2237 #endif 2238 2239 if (dirty) 2240 mark_page_dirty_in_slot(kvm, memslot, map->gfn); 2241 2242 if (cache) 2243 cache->dirty |= dirty; 2244 else 2245 kvm_release_pfn(map->pfn, dirty, NULL); 2246 2247 map->hva = NULL; 2248 map->page = NULL; 2249 } 2250 2251 int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, 2252 struct gfn_to_pfn_cache *cache, bool dirty, bool atomic) 2253 { 2254 __kvm_unmap_gfn(vcpu->kvm, gfn_to_memslot(vcpu->kvm, map->gfn), map, 2255 cache, dirty, atomic); 2256 return 0; 2257 } 2258 EXPORT_SYMBOL_GPL(kvm_unmap_gfn); 2259 2260 void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, 
struct kvm_host_map *map, bool dirty) 2261 { 2262 __kvm_unmap_gfn(vcpu->kvm, kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), 2263 map, NULL, dirty, false); 2264 } 2265 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); 2266 2267 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn) 2268 { 2269 kvm_pfn_t pfn; 2270 2271 pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn); 2272 2273 return kvm_pfn_to_page(pfn); 2274 } 2275 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page); 2276 2277 void kvm_release_page_clean(struct page *page) 2278 { 2279 WARN_ON(is_error_page(page)); 2280 2281 kvm_release_pfn_clean(page_to_pfn(page)); 2282 } 2283 EXPORT_SYMBOL_GPL(kvm_release_page_clean); 2284 2285 void kvm_release_pfn_clean(kvm_pfn_t pfn) 2286 { 2287 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) 2288 put_page(pfn_to_page(pfn)); 2289 } 2290 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 2291 2292 void kvm_release_page_dirty(struct page *page) 2293 { 2294 WARN_ON(is_error_page(page)); 2295 2296 kvm_release_pfn_dirty(page_to_pfn(page)); 2297 } 2298 EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 2299 2300 void kvm_release_pfn_dirty(kvm_pfn_t pfn) 2301 { 2302 kvm_set_pfn_dirty(pfn); 2303 kvm_release_pfn_clean(pfn); 2304 } 2305 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); 2306 2307 void kvm_set_pfn_dirty(kvm_pfn_t pfn) 2308 { 2309 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) 2310 SetPageDirty(pfn_to_page(pfn)); 2311 } 2312 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 2313 2314 void kvm_set_pfn_accessed(kvm_pfn_t pfn) 2315 { 2316 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) 2317 mark_page_accessed(pfn_to_page(pfn)); 2318 } 2319 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 2320 2321 void kvm_get_pfn(kvm_pfn_t pfn) 2322 { 2323 if (!kvm_is_reserved_pfn(pfn)) 2324 get_page(pfn_to_page(pfn)); 2325 } 2326 EXPORT_SYMBOL_GPL(kvm_get_pfn); 2327 2328 static int next_segment(unsigned long len, int offset) 2329 { 2330 if (len > PAGE_SIZE - offset) 2331 return PAGE_SIZE - offset; 2332 else 2333 return len; 
2334 } 2335 2336 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, 2337 void *data, int offset, int len) 2338 { 2339 int r; 2340 unsigned long addr; 2341 2342 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 2343 if (kvm_is_error_hva(addr)) 2344 return -EFAULT; 2345 r = __copy_from_user(data, (void __user *)addr + offset, len); 2346 if (r) 2347 return -EFAULT; 2348 return 0; 2349 } 2350 2351 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 2352 int len) 2353 { 2354 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 2355 2356 return __kvm_read_guest_page(slot, gfn, data, offset, len); 2357 } 2358 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 2359 2360 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, 2361 int offset, int len) 2362 { 2363 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2364 2365 return __kvm_read_guest_page(slot, gfn, data, offset, len); 2366 } 2367 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page); 2368 2369 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) 2370 { 2371 gfn_t gfn = gpa >> PAGE_SHIFT; 2372 int seg; 2373 int offset = offset_in_page(gpa); 2374 int ret; 2375 2376 while ((seg = next_segment(len, offset)) != 0) { 2377 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 2378 if (ret < 0) 2379 return ret; 2380 offset = 0; 2381 len -= seg; 2382 data += seg; 2383 ++gfn; 2384 } 2385 return 0; 2386 } 2387 EXPORT_SYMBOL_GPL(kvm_read_guest); 2388 2389 int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) 2390 { 2391 gfn_t gfn = gpa >> PAGE_SHIFT; 2392 int seg; 2393 int offset = offset_in_page(gpa); 2394 int ret; 2395 2396 while ((seg = next_segment(len, offset)) != 0) { 2397 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg); 2398 if (ret < 0) 2399 return ret; 2400 offset = 0; 2401 len -= seg; 2402 data += seg; 2403 ++gfn; 2404 } 2405 return 0; 2406 } 2407 
EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest); 2408 2409 static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 2410 void *data, int offset, unsigned long len) 2411 { 2412 int r; 2413 unsigned long addr; 2414 2415 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 2416 if (kvm_is_error_hva(addr)) 2417 return -EFAULT; 2418 pagefault_disable(); 2419 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 2420 pagefault_enable(); 2421 if (r) 2422 return -EFAULT; 2423 return 0; 2424 } 2425 2426 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, 2427 void *data, unsigned long len) 2428 { 2429 gfn_t gfn = gpa >> PAGE_SHIFT; 2430 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2431 int offset = offset_in_page(gpa); 2432 2433 return __kvm_read_guest_atomic(slot, gfn, data, offset, len); 2434 } 2435 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); 2436 2437 static int __kvm_write_guest_page(struct kvm *kvm, 2438 struct kvm_memory_slot *memslot, gfn_t gfn, 2439 const void *data, int offset, int len) 2440 { 2441 int r; 2442 unsigned long addr; 2443 2444 addr = gfn_to_hva_memslot(memslot, gfn); 2445 if (kvm_is_error_hva(addr)) 2446 return -EFAULT; 2447 r = __copy_to_user((void __user *)addr + offset, data, len); 2448 if (r) 2449 return -EFAULT; 2450 mark_page_dirty_in_slot(kvm, memslot, gfn); 2451 return 0; 2452 } 2453 2454 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, 2455 const void *data, int offset, int len) 2456 { 2457 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 2458 2459 return __kvm_write_guest_page(kvm, slot, gfn, data, offset, len); 2460 } 2461 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 2462 2463 int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, 2464 const void *data, int offset, int len) 2465 { 2466 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2467 2468 return __kvm_write_guest_page(vcpu->kvm, slot, gfn, data, offset, len); 2469 } 2470 
EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page); 2471 2472 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 2473 unsigned long len) 2474 { 2475 gfn_t gfn = gpa >> PAGE_SHIFT; 2476 int seg; 2477 int offset = offset_in_page(gpa); 2478 int ret; 2479 2480 while ((seg = next_segment(len, offset)) != 0) { 2481 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 2482 if (ret < 0) 2483 return ret; 2484 offset = 0; 2485 len -= seg; 2486 data += seg; 2487 ++gfn; 2488 } 2489 return 0; 2490 } 2491 EXPORT_SYMBOL_GPL(kvm_write_guest); 2492 2493 int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, 2494 unsigned long len) 2495 { 2496 gfn_t gfn = gpa >> PAGE_SHIFT; 2497 int seg; 2498 int offset = offset_in_page(gpa); 2499 int ret; 2500 2501 while ((seg = next_segment(len, offset)) != 0) { 2502 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg); 2503 if (ret < 0) 2504 return ret; 2505 offset = 0; 2506 len -= seg; 2507 data += seg; 2508 ++gfn; 2509 } 2510 return 0; 2511 } 2512 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); 2513 2514 static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, 2515 struct gfn_to_hva_cache *ghc, 2516 gpa_t gpa, unsigned long len) 2517 { 2518 int offset = offset_in_page(gpa); 2519 gfn_t start_gfn = gpa >> PAGE_SHIFT; 2520 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; 2521 gfn_t nr_pages_needed = end_gfn - start_gfn + 1; 2522 gfn_t nr_pages_avail; 2523 2524 /* Update ghc->generation before performing any error checks. */ 2525 ghc->generation = slots->generation; 2526 2527 if (start_gfn > end_gfn) { 2528 ghc->hva = KVM_HVA_ERR_BAD; 2529 return -EINVAL; 2530 } 2531 2532 /* 2533 * If the requested region crosses two memslots, we still 2534 * verify that the entire region is valid here. 
2535 */ 2536 for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) { 2537 ghc->memslot = __gfn_to_memslot(slots, start_gfn); 2538 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, 2539 &nr_pages_avail); 2540 if (kvm_is_error_hva(ghc->hva)) 2541 return -EFAULT; 2542 } 2543 2544 /* Use the slow path for cross page reads and writes. */ 2545 if (nr_pages_needed == 1) 2546 ghc->hva += offset; 2547 else 2548 ghc->memslot = NULL; 2549 2550 ghc->gpa = gpa; 2551 ghc->len = len; 2552 return 0; 2553 } 2554 2555 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2556 gpa_t gpa, unsigned long len) 2557 { 2558 struct kvm_memslots *slots = kvm_memslots(kvm); 2559 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); 2560 } 2561 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); 2562 2563 int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2564 void *data, unsigned int offset, 2565 unsigned long len) 2566 { 2567 struct kvm_memslots *slots = kvm_memslots(kvm); 2568 int r; 2569 gpa_t gpa = ghc->gpa + offset; 2570 2571 BUG_ON(len + offset > ghc->len); 2572 2573 if (slots->generation != ghc->generation) { 2574 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) 2575 return -EFAULT; 2576 } 2577 2578 if (kvm_is_error_hva(ghc->hva)) 2579 return -EFAULT; 2580 2581 if (unlikely(!ghc->memslot)) 2582 return kvm_write_guest(kvm, gpa, data, len); 2583 2584 r = __copy_to_user((void __user *)ghc->hva + offset, data, len); 2585 if (r) 2586 return -EFAULT; 2587 mark_page_dirty_in_slot(kvm, ghc->memslot, gpa >> PAGE_SHIFT); 2588 2589 return 0; 2590 } 2591 EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); 2592 2593 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2594 void *data, unsigned long len) 2595 { 2596 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); 2597 } 2598 EXPORT_SYMBOL_GPL(kvm_write_guest_cached); 2599 2600 int kvm_read_guest_offset_cached(struct kvm *kvm, struct 
gfn_to_hva_cache *ghc, 2601 void *data, unsigned int offset, 2602 unsigned long len) 2603 { 2604 struct kvm_memslots *slots = kvm_memslots(kvm); 2605 int r; 2606 gpa_t gpa = ghc->gpa + offset; 2607 2608 BUG_ON(len + offset > ghc->len); 2609 2610 if (slots->generation != ghc->generation) { 2611 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) 2612 return -EFAULT; 2613 } 2614 2615 if (kvm_is_error_hva(ghc->hva)) 2616 return -EFAULT; 2617 2618 if (unlikely(!ghc->memslot)) 2619 return kvm_read_guest(kvm, gpa, data, len); 2620 2621 r = __copy_from_user(data, (void __user *)ghc->hva + offset, len); 2622 if (r) 2623 return -EFAULT; 2624 2625 return 0; 2626 } 2627 EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached); 2628 2629 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2630 void *data, unsigned long len) 2631 { 2632 return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len); 2633 } 2634 EXPORT_SYMBOL_GPL(kvm_read_guest_cached); 2635 2636 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 2637 { 2638 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 2639 gfn_t gfn = gpa >> PAGE_SHIFT; 2640 int seg; 2641 int offset = offset_in_page(gpa); 2642 int ret; 2643 2644 while ((seg = next_segment(len, offset)) != 0) { 2645 ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, len); 2646 if (ret < 0) 2647 return ret; 2648 offset = 0; 2649 len -= seg; 2650 ++gfn; 2651 } 2652 return 0; 2653 } 2654 EXPORT_SYMBOL_GPL(kvm_clear_guest); 2655 2656 void mark_page_dirty_in_slot(struct kvm *kvm, 2657 struct kvm_memory_slot *memslot, 2658 gfn_t gfn) 2659 { 2660 if (memslot && kvm_slot_dirty_track_enabled(memslot)) { 2661 unsigned long rel_gfn = gfn - memslot->base_gfn; 2662 u32 slot = (memslot->as_id << 16) | memslot->id; 2663 2664 if (kvm->dirty_ring_size) 2665 kvm_dirty_ring_push(kvm_dirty_ring_get(kvm), 2666 slot, rel_gfn); 2667 else 2668 set_bit_le(rel_gfn, memslot->dirty_bitmap); 2669 } 2670 } 2671 
EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot); 2672 2673 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 2674 { 2675 struct kvm_memory_slot *memslot; 2676 2677 memslot = gfn_to_memslot(kvm, gfn); 2678 mark_page_dirty_in_slot(kvm, memslot, gfn); 2679 } 2680 EXPORT_SYMBOL_GPL(mark_page_dirty); 2681 2682 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) 2683 { 2684 struct kvm_memory_slot *memslot; 2685 2686 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2687 mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn); 2688 } 2689 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); 2690 2691 void kvm_sigset_activate(struct kvm_vcpu *vcpu) 2692 { 2693 if (!vcpu->sigset_active) 2694 return; 2695 2696 /* 2697 * This does a lockless modification of ->real_blocked, which is fine 2698 * because, only current can change ->real_blocked and all readers of 2699 * ->real_blocked don't care as long ->real_blocked is always a subset 2700 * of ->blocked. 2701 */ 2702 sigprocmask(SIG_SETMASK, &vcpu->sigset, ¤t->real_blocked); 2703 } 2704 2705 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu) 2706 { 2707 if (!vcpu->sigset_active) 2708 return; 2709 2710 sigprocmask(SIG_SETMASK, ¤t->real_blocked, NULL); 2711 sigemptyset(¤t->real_blocked); 2712 } 2713 2714 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) 2715 { 2716 unsigned int old, val, grow, grow_start; 2717 2718 old = val = vcpu->halt_poll_ns; 2719 grow_start = READ_ONCE(halt_poll_ns_grow_start); 2720 grow = READ_ONCE(halt_poll_ns_grow); 2721 if (!grow) 2722 goto out; 2723 2724 val *= grow; 2725 if (val < grow_start) 2726 val = grow_start; 2727 2728 if (val > halt_poll_ns) 2729 val = halt_poll_ns; 2730 2731 vcpu->halt_poll_ns = val; 2732 out: 2733 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old); 2734 } 2735 2736 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu) 2737 { 2738 unsigned int old, val, shrink; 2739 2740 old = val = vcpu->halt_poll_ns; 2741 shrink = READ_ONCE(halt_poll_ns_shrink); 2742 if (shrink == 0) 2743 val = 0; 
2744 else 2745 val /= shrink; 2746 2747 vcpu->halt_poll_ns = val; 2748 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old); 2749 } 2750 2751 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) 2752 { 2753 int ret = -EINTR; 2754 int idx = srcu_read_lock(&vcpu->kvm->srcu); 2755 2756 if (kvm_arch_vcpu_runnable(vcpu)) { 2757 kvm_make_request(KVM_REQ_UNHALT, vcpu); 2758 goto out; 2759 } 2760 if (kvm_cpu_has_pending_timer(vcpu)) 2761 goto out; 2762 if (signal_pending(current)) 2763 goto out; 2764 2765 ret = 0; 2766 out: 2767 srcu_read_unlock(&vcpu->kvm->srcu, idx); 2768 return ret; 2769 } 2770 2771 static inline void 2772 update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited) 2773 { 2774 if (waited) 2775 vcpu->stat.halt_poll_fail_ns += poll_ns; 2776 else 2777 vcpu->stat.halt_poll_success_ns += poll_ns; 2778 } 2779 2780 /* 2781 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 2782 */ 2783 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 2784 { 2785 ktime_t start, cur, poll_end; 2786 bool waited = false; 2787 u64 block_ns; 2788 2789 kvm_arch_vcpu_blocking(vcpu); 2790 2791 start = cur = poll_end = ktime_get(); 2792 if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) { 2793 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns); 2794 2795 ++vcpu->stat.halt_attempted_poll; 2796 do { 2797 /* 2798 * This sets KVM_REQ_UNHALT if an interrupt 2799 * arrives. 
2800 */ 2801 if (kvm_vcpu_check_block(vcpu) < 0) { 2802 ++vcpu->stat.halt_successful_poll; 2803 if (!vcpu_valid_wakeup(vcpu)) 2804 ++vcpu->stat.halt_poll_invalid; 2805 goto out; 2806 } 2807 poll_end = cur = ktime_get(); 2808 } while (single_task_running() && ktime_before(cur, stop)); 2809 } 2810 2811 prepare_to_rcuwait(&vcpu->wait); 2812 for (;;) { 2813 set_current_state(TASK_INTERRUPTIBLE); 2814 2815 if (kvm_vcpu_check_block(vcpu) < 0) 2816 break; 2817 2818 waited = true; 2819 schedule(); 2820 } 2821 finish_rcuwait(&vcpu->wait); 2822 cur = ktime_get(); 2823 out: 2824 kvm_arch_vcpu_unblocking(vcpu); 2825 block_ns = ktime_to_ns(cur) - ktime_to_ns(start); 2826 2827 update_halt_poll_stats( 2828 vcpu, ktime_to_ns(ktime_sub(poll_end, start)), waited); 2829 2830 if (!kvm_arch_no_poll(vcpu)) { 2831 if (!vcpu_valid_wakeup(vcpu)) { 2832 shrink_halt_poll_ns(vcpu); 2833 } else if (vcpu->kvm->max_halt_poll_ns) { 2834 if (block_ns <= vcpu->halt_poll_ns) 2835 ; 2836 /* we had a long block, shrink polling */ 2837 else if (vcpu->halt_poll_ns && 2838 block_ns > vcpu->kvm->max_halt_poll_ns) 2839 shrink_halt_poll_ns(vcpu); 2840 /* we had a short halt and our poll time is too small */ 2841 else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns && 2842 block_ns < vcpu->kvm->max_halt_poll_ns) 2843 grow_halt_poll_ns(vcpu); 2844 } else { 2845 vcpu->halt_poll_ns = 0; 2846 } 2847 } 2848 2849 trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu)); 2850 kvm_arch_vcpu_block_finish(vcpu); 2851 } 2852 EXPORT_SYMBOL_GPL(kvm_vcpu_block); 2853 2854 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) 2855 { 2856 struct rcuwait *waitp; 2857 2858 waitp = kvm_arch_vcpu_get_wait(vcpu); 2859 if (rcuwait_wake_up(waitp)) { 2860 WRITE_ONCE(vcpu->ready, true); 2861 ++vcpu->stat.halt_wakeup; 2862 return true; 2863 } 2864 2865 return false; 2866 } 2867 EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); 2868 2869 #ifndef CONFIG_S390 2870 /* 2871 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel 
mode. 2872 */ 2873 void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 2874 { 2875 int me; 2876 int cpu = vcpu->cpu; 2877 2878 if (kvm_vcpu_wake_up(vcpu)) 2879 return; 2880 2881 me = get_cpu(); 2882 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 2883 if (kvm_arch_vcpu_should_kick(vcpu)) 2884 smp_send_reschedule(cpu); 2885 put_cpu(); 2886 } 2887 EXPORT_SYMBOL_GPL(kvm_vcpu_kick); 2888 #endif /* !CONFIG_S390 */ 2889 2890 int kvm_vcpu_yield_to(struct kvm_vcpu *target) 2891 { 2892 struct pid *pid; 2893 struct task_struct *task = NULL; 2894 int ret = 0; 2895 2896 rcu_read_lock(); 2897 pid = rcu_dereference(target->pid); 2898 if (pid) 2899 task = get_pid_task(pid, PIDTYPE_PID); 2900 rcu_read_unlock(); 2901 if (!task) 2902 return ret; 2903 ret = yield_to(task, 1); 2904 put_task_struct(task); 2905 2906 return ret; 2907 } 2908 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); 2909 2910 /* 2911 * Helper that checks whether a VCPU is eligible for directed yield. 2912 * Most eligible candidate to yield is decided by following heuristics: 2913 * 2914 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently 2915 * (preempted lock holder), indicated by @in_spin_loop. 2916 * Set at the beginning and cleared at the end of interception/PLE handler. 2917 * 2918 * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get 2919 * chance last time (mostly it has become eligible now since we have probably 2920 * yielded to lockholder in last iteration. This is done by toggling 2921 * @dy_eligible each time a VCPU checked for eligibility.) 2922 * 2923 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding 2924 * to preempted lock-holder could result in wrong VCPU selection and CPU 2925 * burning. Giving priority for a potential lock-holder increases lock 2926 * progress. 2927 * 2928 * Since algorithm is based on heuristics, accessing another VCPU data without 2929 * locking does not harm. 
It may result in trying to yield to same VCPU, fail 2930 * and continue with next VCPU and so on. 2931 */ 2932 static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) 2933 { 2934 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT 2935 bool eligible; 2936 2937 eligible = !vcpu->spin_loop.in_spin_loop || 2938 vcpu->spin_loop.dy_eligible; 2939 2940 if (vcpu->spin_loop.in_spin_loop) 2941 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); 2942 2943 return eligible; 2944 #else 2945 return true; 2946 #endif 2947 } 2948 2949 /* 2950 * Unlike kvm_arch_vcpu_runnable, this function is called outside 2951 * a vcpu_load/vcpu_put pair. However, for most architectures 2952 * kvm_arch_vcpu_runnable does not require vcpu_load. 2953 */ 2954 bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) 2955 { 2956 return kvm_arch_vcpu_runnable(vcpu); 2957 } 2958 2959 static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu) 2960 { 2961 if (kvm_arch_dy_runnable(vcpu)) 2962 return true; 2963 2964 #ifdef CONFIG_KVM_ASYNC_PF 2965 if (!list_empty_careful(&vcpu->async_pf.done)) 2966 return true; 2967 #endif 2968 2969 return false; 2970 } 2971 2972 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) 2973 { 2974 struct kvm *kvm = me->kvm; 2975 struct kvm_vcpu *vcpu; 2976 int last_boosted_vcpu = me->kvm->last_boosted_vcpu; 2977 int yielded = 0; 2978 int try = 3; 2979 int pass; 2980 int i; 2981 2982 kvm_vcpu_set_in_spin_loop(me, true); 2983 /* 2984 * We boost the priority of a VCPU that is runnable but not 2985 * currently running, because it got preempted by something 2986 * else and called schedule in __vcpu_run. Hopefully that 2987 * VCPU is holding the lock that we need and will release it. 2988 * We approximate round-robin by starting at the last boosted VCPU. 
2989 */ 2990 for (pass = 0; pass < 2 && !yielded && try; pass++) { 2991 kvm_for_each_vcpu(i, vcpu, kvm) { 2992 if (!pass && i <= last_boosted_vcpu) { 2993 i = last_boosted_vcpu; 2994 continue; 2995 } else if (pass && i > last_boosted_vcpu) 2996 break; 2997 if (!READ_ONCE(vcpu->ready)) 2998 continue; 2999 if (vcpu == me) 3000 continue; 3001 if (rcuwait_active(&vcpu->wait) && 3002 !vcpu_dy_runnable(vcpu)) 3003 continue; 3004 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && 3005 !kvm_arch_vcpu_in_kernel(vcpu)) 3006 continue; 3007 if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) 3008 continue; 3009 3010 yielded = kvm_vcpu_yield_to(vcpu); 3011 if (yielded > 0) { 3012 kvm->last_boosted_vcpu = i; 3013 break; 3014 } else if (yielded < 0) { 3015 try--; 3016 if (!try) 3017 break; 3018 } 3019 } 3020 } 3021 kvm_vcpu_set_in_spin_loop(me, false); 3022 3023 /* Ensure vcpu is not eligible during next spinloop */ 3024 kvm_vcpu_set_dy_eligible(me, false); 3025 } 3026 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); 3027 3028 static bool kvm_page_in_dirty_ring(struct kvm *kvm, unsigned long pgoff) 3029 { 3030 #if KVM_DIRTY_LOG_PAGE_OFFSET > 0 3031 return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) && 3032 (pgoff < KVM_DIRTY_LOG_PAGE_OFFSET + 3033 kvm->dirty_ring_size / PAGE_SIZE); 3034 #else 3035 return false; 3036 #endif 3037 } 3038 3039 static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf) 3040 { 3041 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data; 3042 struct page *page; 3043 3044 if (vmf->pgoff == 0) 3045 page = virt_to_page(vcpu->run); 3046 #ifdef CONFIG_X86 3047 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 3048 page = virt_to_page(vcpu->arch.pio_data); 3049 #endif 3050 #ifdef CONFIG_KVM_MMIO 3051 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 3052 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 3053 #endif 3054 else if (kvm_page_in_dirty_ring(vcpu->kvm, vmf->pgoff)) 3055 page = kvm_dirty_ring_get_page( 3056 &vcpu->dirty_ring, 3057 vmf->pgoff - 
KVM_DIRTY_LOG_PAGE_OFFSET); 3058 else 3059 return kvm_arch_vcpu_fault(vcpu, vmf); 3060 get_page(page); 3061 vmf->page = page; 3062 return 0; 3063 } 3064 3065 static const struct vm_operations_struct kvm_vcpu_vm_ops = { 3066 .fault = kvm_vcpu_fault, 3067 }; 3068 3069 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 3070 { 3071 struct kvm_vcpu *vcpu = file->private_data; 3072 unsigned long pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; 3073 3074 if ((kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff) || 3075 kvm_page_in_dirty_ring(vcpu->kvm, vma->vm_pgoff + pages - 1)) && 3076 ((vma->vm_flags & VM_EXEC) || !(vma->vm_flags & VM_SHARED))) 3077 return -EINVAL; 3078 3079 vma->vm_ops = &kvm_vcpu_vm_ops; 3080 return 0; 3081 } 3082 3083 static int kvm_vcpu_release(struct inode *inode, struct file *filp) 3084 { 3085 struct kvm_vcpu *vcpu = filp->private_data; 3086 3087 kvm_put_kvm(vcpu->kvm); 3088 return 0; 3089 } 3090 3091 static struct file_operations kvm_vcpu_fops = { 3092 .release = kvm_vcpu_release, 3093 .unlocked_ioctl = kvm_vcpu_ioctl, 3094 .mmap = kvm_vcpu_mmap, 3095 .llseek = noop_llseek, 3096 KVM_COMPAT(kvm_vcpu_compat_ioctl), 3097 }; 3098 3099 /* 3100 * Allocates an inode for the vcpu. 
3101 */ 3102 static int create_vcpu_fd(struct kvm_vcpu *vcpu) 3103 { 3104 char name[8 + 1 + ITOA_MAX_LEN + 1]; 3105 3106 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id); 3107 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); 3108 } 3109 3110 static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) 3111 { 3112 #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS 3113 struct dentry *debugfs_dentry; 3114 char dir_name[ITOA_MAX_LEN * 2]; 3115 3116 if (!debugfs_initialized()) 3117 return; 3118 3119 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id); 3120 debugfs_dentry = debugfs_create_dir(dir_name, 3121 vcpu->kvm->debugfs_dentry); 3122 3123 kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry); 3124 #endif 3125 } 3126 3127 /* 3128 * Creates some virtual cpus. Good luck creating more than one. 3129 */ 3130 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) 3131 { 3132 int r; 3133 struct kvm_vcpu *vcpu; 3134 struct page *page; 3135 3136 if (id >= KVM_MAX_VCPU_ID) 3137 return -EINVAL; 3138 3139 mutex_lock(&kvm->lock); 3140 if (kvm->created_vcpus == KVM_MAX_VCPUS) { 3141 mutex_unlock(&kvm->lock); 3142 return -EINVAL; 3143 } 3144 3145 kvm->created_vcpus++; 3146 mutex_unlock(&kvm->lock); 3147 3148 r = kvm_arch_vcpu_precreate(kvm, id); 3149 if (r) 3150 goto vcpu_decrement; 3151 3152 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 3153 if (!vcpu) { 3154 r = -ENOMEM; 3155 goto vcpu_decrement; 3156 } 3157 3158 BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE); 3159 page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); 3160 if (!page) { 3161 r = -ENOMEM; 3162 goto vcpu_free; 3163 } 3164 vcpu->run = page_address(page); 3165 3166 kvm_vcpu_init(vcpu, kvm, id); 3167 3168 r = kvm_arch_vcpu_create(vcpu); 3169 if (r) 3170 goto vcpu_free_run_page; 3171 3172 if (kvm->dirty_ring_size) { 3173 r = kvm_dirty_ring_alloc(&vcpu->dirty_ring, 3174 id, kvm->dirty_ring_size); 3175 if (r) 3176 goto arch_vcpu_destroy; 3177 } 3178 3179 mutex_lock(&kvm->lock); 
3180 if (kvm_get_vcpu_by_id(kvm, id)) { 3181 r = -EEXIST; 3182 goto unlock_vcpu_destroy; 3183 } 3184 3185 vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus); 3186 BUG_ON(kvm->vcpus[vcpu->vcpu_idx]); 3187 3188 /* Now it's all set up, let userspace reach it */ 3189 kvm_get_kvm(kvm); 3190 r = create_vcpu_fd(vcpu); 3191 if (r < 0) { 3192 kvm_put_kvm_no_destroy(kvm); 3193 goto unlock_vcpu_destroy; 3194 } 3195 3196 kvm->vcpus[vcpu->vcpu_idx] = vcpu; 3197 3198 /* 3199 * Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus 3200 * before kvm->online_vcpu's incremented value. 3201 */ 3202 smp_wmb(); 3203 atomic_inc(&kvm->online_vcpus); 3204 3205 mutex_unlock(&kvm->lock); 3206 kvm_arch_vcpu_postcreate(vcpu); 3207 kvm_create_vcpu_debugfs(vcpu); 3208 return r; 3209 3210 unlock_vcpu_destroy: 3211 mutex_unlock(&kvm->lock); 3212 kvm_dirty_ring_free(&vcpu->dirty_ring); 3213 arch_vcpu_destroy: 3214 kvm_arch_vcpu_destroy(vcpu); 3215 vcpu_free_run_page: 3216 free_page((unsigned long)vcpu->run); 3217 vcpu_free: 3218 kmem_cache_free(kvm_vcpu_cache, vcpu); 3219 vcpu_decrement: 3220 mutex_lock(&kvm->lock); 3221 kvm->created_vcpus--; 3222 mutex_unlock(&kvm->lock); 3223 return r; 3224 } 3225 3226 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 3227 { 3228 if (sigset) { 3229 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 3230 vcpu->sigset_active = 1; 3231 vcpu->sigset = *sigset; 3232 } else 3233 vcpu->sigset_active = 0; 3234 return 0; 3235 } 3236 3237 static long kvm_vcpu_ioctl(struct file *filp, 3238 unsigned int ioctl, unsigned long arg) 3239 { 3240 struct kvm_vcpu *vcpu = filp->private_data; 3241 void __user *argp = (void __user *)arg; 3242 int r; 3243 struct kvm_fpu *fpu = NULL; 3244 struct kvm_sregs *kvm_sregs = NULL; 3245 3246 if (vcpu->kvm->mm != current->mm) 3247 return -EIO; 3248 3249 if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) 3250 return -EINVAL; 3251 3252 /* 3253 * Some architectures have vcpu ioctls that are asynchronous to vcpu 3254 * 
execution; mutex_lock() would break them. 3255 */ 3256 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg); 3257 if (r != -ENOIOCTLCMD) 3258 return r; 3259 3260 if (mutex_lock_killable(&vcpu->mutex)) 3261 return -EINTR; 3262 switch (ioctl) { 3263 case KVM_RUN: { 3264 struct pid *oldpid; 3265 r = -EINVAL; 3266 if (arg) 3267 goto out; 3268 oldpid = rcu_access_pointer(vcpu->pid); 3269 if (unlikely(oldpid != task_pid(current))) { 3270 /* The thread running this VCPU changed. */ 3271 struct pid *newpid; 3272 3273 r = kvm_arch_vcpu_run_pid_change(vcpu); 3274 if (r) 3275 break; 3276 3277 newpid = get_task_pid(current, PIDTYPE_PID); 3278 rcu_assign_pointer(vcpu->pid, newpid); 3279 if (oldpid) 3280 synchronize_rcu(); 3281 put_pid(oldpid); 3282 } 3283 r = kvm_arch_vcpu_ioctl_run(vcpu); 3284 trace_kvm_userspace_exit(vcpu->run->exit_reason, r); 3285 break; 3286 } 3287 case KVM_GET_REGS: { 3288 struct kvm_regs *kvm_regs; 3289 3290 r = -ENOMEM; 3291 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT); 3292 if (!kvm_regs) 3293 goto out; 3294 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 3295 if (r) 3296 goto out_free1; 3297 r = -EFAULT; 3298 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 3299 goto out_free1; 3300 r = 0; 3301 out_free1: 3302 kfree(kvm_regs); 3303 break; 3304 } 3305 case KVM_SET_REGS: { 3306 struct kvm_regs *kvm_regs; 3307 3308 kvm_regs = memdup_user(argp, sizeof(*kvm_regs)); 3309 if (IS_ERR(kvm_regs)) { 3310 r = PTR_ERR(kvm_regs); 3311 goto out; 3312 } 3313 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 3314 kfree(kvm_regs); 3315 break; 3316 } 3317 case KVM_GET_SREGS: { 3318 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), 3319 GFP_KERNEL_ACCOUNT); 3320 r = -ENOMEM; 3321 if (!kvm_sregs) 3322 goto out; 3323 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 3324 if (r) 3325 goto out; 3326 r = -EFAULT; 3327 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 3328 goto out; 3329 r = 0; 3330 break; 3331 } 3332 case KVM_SET_SREGS: { 
3333 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); 3334 if (IS_ERR(kvm_sregs)) { 3335 r = PTR_ERR(kvm_sregs); 3336 kvm_sregs = NULL; 3337 goto out; 3338 } 3339 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 3340 break; 3341 } 3342 case KVM_GET_MP_STATE: { 3343 struct kvm_mp_state mp_state; 3344 3345 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 3346 if (r) 3347 goto out; 3348 r = -EFAULT; 3349 if (copy_to_user(argp, &mp_state, sizeof(mp_state))) 3350 goto out; 3351 r = 0; 3352 break; 3353 } 3354 case KVM_SET_MP_STATE: { 3355 struct kvm_mp_state mp_state; 3356 3357 r = -EFAULT; 3358 if (copy_from_user(&mp_state, argp, sizeof(mp_state))) 3359 goto out; 3360 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 3361 break; 3362 } 3363 case KVM_TRANSLATE: { 3364 struct kvm_translation tr; 3365 3366 r = -EFAULT; 3367 if (copy_from_user(&tr, argp, sizeof(tr))) 3368 goto out; 3369 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 3370 if (r) 3371 goto out; 3372 r = -EFAULT; 3373 if (copy_to_user(argp, &tr, sizeof(tr))) 3374 goto out; 3375 r = 0; 3376 break; 3377 } 3378 case KVM_SET_GUEST_DEBUG: { 3379 struct kvm_guest_debug dbg; 3380 3381 r = -EFAULT; 3382 if (copy_from_user(&dbg, argp, sizeof(dbg))) 3383 goto out; 3384 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 3385 break; 3386 } 3387 case KVM_SET_SIGNAL_MASK: { 3388 struct kvm_signal_mask __user *sigmask_arg = argp; 3389 struct kvm_signal_mask kvm_sigmask; 3390 sigset_t sigset, *p; 3391 3392 p = NULL; 3393 if (argp) { 3394 r = -EFAULT; 3395 if (copy_from_user(&kvm_sigmask, argp, 3396 sizeof(kvm_sigmask))) 3397 goto out; 3398 r = -EINVAL; 3399 if (kvm_sigmask.len != sizeof(sigset)) 3400 goto out; 3401 r = -EFAULT; 3402 if (copy_from_user(&sigset, sigmask_arg->sigset, 3403 sizeof(sigset))) 3404 goto out; 3405 p = &sigset; 3406 } 3407 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); 3408 break; 3409 } 3410 case KVM_GET_FPU: { 3411 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT); 3412 r = -ENOMEM; 
#ifdef CONFIG_KVM_COMPAT
/*
 * compat ioctl handler for a vCPU file descriptor.
 *
 * Only KVM_SET_SIGNAL_MASK is handled here: the payload length is checked
 * against sizeof(compat_sigset_t) and the mask itself is read with
 * get_compat_sigset().  Every other command is forwarded unchanged to
 * kvm_vcpu_ioctl().
 *
 * Returns -EIO when current->mm differs from vcpu->kvm->mm, -EFAULT when a
 * user copy fails, -EINVAL on a mask length mismatch, otherwise the result
 * of the handler that was called.
 */
static long kvm_vcpu_compat_ioctl(struct file *filp,
				  unsigned int ioctl, unsigned long arg)
{
	struct kvm_vcpu *vcpu = filp->private_data;
	void __user *uarg = compat_ptr(arg);
	struct kvm_signal_mask __user *umask = uarg;
	struct kvm_signal_mask hdr;
	sigset_t mask;
	int ret;

	if (vcpu->kvm->mm != current->mm)
		return -EIO;

	if (ioctl != KVM_SET_SIGNAL_MASK) {
		/* The forwarded result is passed through an int on purpose. */
		ret = kvm_vcpu_ioctl(filp, ioctl, arg);
		return ret;
	}

	/* A NULL argument means "no signal mask". */
	if (!uarg)
		return kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);

	if (copy_from_user(&hdr, uarg, sizeof(hdr)))
		return -EFAULT;

	if (hdr.len != sizeof(compat_sigset_t))
		return -EINVAL;

	if (get_compat_sigset(&mask,
			      (compat_sigset_t __user *)umask->sigset))
		return -EFAULT;

	return kvm_vcpu_ioctl_set_sigmask(vcpu, &mask);
}
#endif
*dev, 3498 int (*accessor)(struct kvm_device *dev, 3499 struct kvm_device_attr *attr), 3500 unsigned long arg) 3501 { 3502 struct kvm_device_attr attr; 3503 3504 if (!accessor) 3505 return -EPERM; 3506 3507 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) 3508 return -EFAULT; 3509 3510 return accessor(dev, &attr); 3511 } 3512 3513 static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, 3514 unsigned long arg) 3515 { 3516 struct kvm_device *dev = filp->private_data; 3517 3518 if (dev->kvm->mm != current->mm) 3519 return -EIO; 3520 3521 switch (ioctl) { 3522 case KVM_SET_DEVICE_ATTR: 3523 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg); 3524 case KVM_GET_DEVICE_ATTR: 3525 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg); 3526 case KVM_HAS_DEVICE_ATTR: 3527 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg); 3528 default: 3529 if (dev->ops->ioctl) 3530 return dev->ops->ioctl(dev, ioctl, arg); 3531 3532 return -ENOTTY; 3533 } 3534 } 3535 3536 static int kvm_device_release(struct inode *inode, struct file *filp) 3537 { 3538 struct kvm_device *dev = filp->private_data; 3539 struct kvm *kvm = dev->kvm; 3540 3541 if (dev->ops->release) { 3542 mutex_lock(&kvm->lock); 3543 list_del(&dev->vm_node); 3544 dev->ops->release(dev); 3545 mutex_unlock(&kvm->lock); 3546 } 3547 3548 kvm_put_kvm(kvm); 3549 return 0; 3550 } 3551 3552 static const struct file_operations kvm_device_fops = { 3553 .unlocked_ioctl = kvm_device_ioctl, 3554 .release = kvm_device_release, 3555 KVM_COMPAT(kvm_device_ioctl), 3556 .mmap = kvm_device_mmap, 3557 }; 3558 3559 struct kvm_device *kvm_device_from_filp(struct file *filp) 3560 { 3561 if (filp->f_op != &kvm_device_fops) 3562 return NULL; 3563 3564 return filp->private_data; 3565 } 3566 3567 static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { 3568 #ifdef CONFIG_KVM_MPIC 3569 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, 3570 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, 3571 #endif 
3572 }; 3573 3574 int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type) 3575 { 3576 if (type >= ARRAY_SIZE(kvm_device_ops_table)) 3577 return -ENOSPC; 3578 3579 if (kvm_device_ops_table[type] != NULL) 3580 return -EEXIST; 3581 3582 kvm_device_ops_table[type] = ops; 3583 return 0; 3584 } 3585 3586 void kvm_unregister_device_ops(u32 type) 3587 { 3588 if (kvm_device_ops_table[type] != NULL) 3589 kvm_device_ops_table[type] = NULL; 3590 } 3591 3592 static int kvm_ioctl_create_device(struct kvm *kvm, 3593 struct kvm_create_device *cd) 3594 { 3595 const struct kvm_device_ops *ops = NULL; 3596 struct kvm_device *dev; 3597 bool test = cd->flags & KVM_CREATE_DEVICE_TEST; 3598 int type; 3599 int ret; 3600 3601 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) 3602 return -ENODEV; 3603 3604 type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table)); 3605 ops = kvm_device_ops_table[type]; 3606 if (ops == NULL) 3607 return -ENODEV; 3608 3609 if (test) 3610 return 0; 3611 3612 dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT); 3613 if (!dev) 3614 return -ENOMEM; 3615 3616 dev->ops = ops; 3617 dev->kvm = kvm; 3618 3619 mutex_lock(&kvm->lock); 3620 ret = ops->create(dev, type); 3621 if (ret < 0) { 3622 mutex_unlock(&kvm->lock); 3623 kfree(dev); 3624 return ret; 3625 } 3626 list_add(&dev->vm_node, &kvm->devices); 3627 mutex_unlock(&kvm->lock); 3628 3629 if (ops->init) 3630 ops->init(dev); 3631 3632 kvm_get_kvm(kvm); 3633 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); 3634 if (ret < 0) { 3635 kvm_put_kvm_no_destroy(kvm); 3636 mutex_lock(&kvm->lock); 3637 list_del(&dev->vm_node); 3638 mutex_unlock(&kvm->lock); 3639 ops->destroy(dev); 3640 return ret; 3641 } 3642 3643 cd->fd = ret; 3644 return 0; 3645 } 3646 3647 static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) 3648 { 3649 switch (arg) { 3650 case KVM_CAP_USER_MEMORY: 3651 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 3652 case 
KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 3653 case KVM_CAP_INTERNAL_ERROR_DATA: 3654 #ifdef CONFIG_HAVE_KVM_MSI 3655 case KVM_CAP_SIGNAL_MSI: 3656 #endif 3657 #ifdef CONFIG_HAVE_KVM_IRQFD 3658 case KVM_CAP_IRQFD: 3659 case KVM_CAP_IRQFD_RESAMPLE: 3660 #endif 3661 case KVM_CAP_IOEVENTFD_ANY_LENGTH: 3662 case KVM_CAP_CHECK_EXTENSION_VM: 3663 case KVM_CAP_ENABLE_CAP_VM: 3664 case KVM_CAP_HALT_POLL: 3665 return 1; 3666 #ifdef CONFIG_KVM_MMIO 3667 case KVM_CAP_COALESCED_MMIO: 3668 return KVM_COALESCED_MMIO_PAGE_OFFSET; 3669 case KVM_CAP_COALESCED_PIO: 3670 return 1; 3671 #endif 3672 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 3673 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: 3674 return KVM_DIRTY_LOG_MANUAL_CAPS; 3675 #endif 3676 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 3677 case KVM_CAP_IRQ_ROUTING: 3678 return KVM_MAX_IRQ_ROUTES; 3679 #endif 3680 #if KVM_ADDRESS_SPACE_NUM > 1 3681 case KVM_CAP_MULTI_ADDRESS_SPACE: 3682 return KVM_ADDRESS_SPACE_NUM; 3683 #endif 3684 case KVM_CAP_NR_MEMSLOTS: 3685 return KVM_USER_MEM_SLOTS; 3686 case KVM_CAP_DIRTY_LOG_RING: 3687 #if KVM_DIRTY_LOG_PAGE_OFFSET > 0 3688 return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn); 3689 #else 3690 return 0; 3691 #endif 3692 default: 3693 break; 3694 } 3695 return kvm_vm_ioctl_check_extension(kvm, arg); 3696 } 3697 3698 static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size) 3699 { 3700 int r; 3701 3702 if (!KVM_DIRTY_LOG_PAGE_OFFSET) 3703 return -EINVAL; 3704 3705 /* the size should be power of 2 */ 3706 if (!size || (size & (size - 1))) 3707 return -EINVAL; 3708 3709 /* Should be bigger to keep the reserved entries, or a page */ 3710 if (size < kvm_dirty_ring_get_rsvd_entries() * 3711 sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE) 3712 return -EINVAL; 3713 3714 if (size > KVM_DIRTY_RING_MAX_ENTRIES * 3715 sizeof(struct kvm_dirty_gfn)) 3716 return -E2BIG; 3717 3718 /* We only allow it to set once */ 3719 if (kvm->dirty_ring_size) 3720 return -EINVAL; 3721 3722 
mutex_lock(&kvm->lock); 3723 3724 if (kvm->created_vcpus) { 3725 /* We don't allow to change this value after vcpu created */ 3726 r = -EINVAL; 3727 } else { 3728 kvm->dirty_ring_size = size; 3729 r = 0; 3730 } 3731 3732 mutex_unlock(&kvm->lock); 3733 return r; 3734 } 3735 3736 static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm) 3737 { 3738 int i; 3739 struct kvm_vcpu *vcpu; 3740 int cleared = 0; 3741 3742 if (!kvm->dirty_ring_size) 3743 return -EINVAL; 3744 3745 mutex_lock(&kvm->slots_lock); 3746 3747 kvm_for_each_vcpu(i, vcpu, kvm) 3748 cleared += kvm_dirty_ring_reset(vcpu->kvm, &vcpu->dirty_ring); 3749 3750 mutex_unlock(&kvm->slots_lock); 3751 3752 if (cleared) 3753 kvm_flush_remote_tlbs(kvm); 3754 3755 return cleared; 3756 } 3757 3758 int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm, 3759 struct kvm_enable_cap *cap) 3760 { 3761 return -EINVAL; 3762 } 3763 3764 static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, 3765 struct kvm_enable_cap *cap) 3766 { 3767 switch (cap->cap) { 3768 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 3769 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: { 3770 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE; 3771 3772 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE) 3773 allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS; 3774 3775 if (cap->flags || (cap->args[0] & ~allowed_options)) 3776 return -EINVAL; 3777 kvm->manual_dirty_log_protect = cap->args[0]; 3778 return 0; 3779 } 3780 #endif 3781 case KVM_CAP_HALT_POLL: { 3782 if (cap->flags || cap->args[0] != (unsigned int)cap->args[0]) 3783 return -EINVAL; 3784 3785 kvm->max_halt_poll_ns = cap->args[0]; 3786 return 0; 3787 } 3788 case KVM_CAP_DIRTY_LOG_RING: 3789 return kvm_vm_ioctl_enable_dirty_log_ring(kvm, cap->args[0]); 3790 default: 3791 return kvm_vm_ioctl_enable_cap(kvm, cap); 3792 } 3793 } 3794 3795 static long kvm_vm_ioctl(struct file *filp, 3796 unsigned int ioctl, unsigned long arg) 3797 { 3798 struct kvm *kvm = filp->private_data; 3799 void 
__user *argp = (void __user *)arg; 3800 int r; 3801 3802 if (kvm->mm != current->mm) 3803 return -EIO; 3804 switch (ioctl) { 3805 case KVM_CREATE_VCPU: 3806 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 3807 break; 3808 case KVM_ENABLE_CAP: { 3809 struct kvm_enable_cap cap; 3810 3811 r = -EFAULT; 3812 if (copy_from_user(&cap, argp, sizeof(cap))) 3813 goto out; 3814 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap); 3815 break; 3816 } 3817 case KVM_SET_USER_MEMORY_REGION: { 3818 struct kvm_userspace_memory_region kvm_userspace_mem; 3819 3820 r = -EFAULT; 3821 if (copy_from_user(&kvm_userspace_mem, argp, 3822 sizeof(kvm_userspace_mem))) 3823 goto out; 3824 3825 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); 3826 break; 3827 } 3828 case KVM_GET_DIRTY_LOG: { 3829 struct kvm_dirty_log log; 3830 3831 r = -EFAULT; 3832 if (copy_from_user(&log, argp, sizeof(log))) 3833 goto out; 3834 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 3835 break; 3836 } 3837 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 3838 case KVM_CLEAR_DIRTY_LOG: { 3839 struct kvm_clear_dirty_log log; 3840 3841 r = -EFAULT; 3842 if (copy_from_user(&log, argp, sizeof(log))) 3843 goto out; 3844 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log); 3845 break; 3846 } 3847 #endif 3848 #ifdef CONFIG_KVM_MMIO 3849 case KVM_REGISTER_COALESCED_MMIO: { 3850 struct kvm_coalesced_mmio_zone zone; 3851 3852 r = -EFAULT; 3853 if (copy_from_user(&zone, argp, sizeof(zone))) 3854 goto out; 3855 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 3856 break; 3857 } 3858 case KVM_UNREGISTER_COALESCED_MMIO: { 3859 struct kvm_coalesced_mmio_zone zone; 3860 3861 r = -EFAULT; 3862 if (copy_from_user(&zone, argp, sizeof(zone))) 3863 goto out; 3864 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 3865 break; 3866 } 3867 #endif 3868 case KVM_IRQFD: { 3869 struct kvm_irqfd data; 3870 3871 r = -EFAULT; 3872 if (copy_from_user(&data, argp, sizeof(data))) 3873 goto out; 3874 r = kvm_irqfd(kvm, &data); 3875 break; 3876 } 3877 case 
KVM_IOEVENTFD: { 3878 struct kvm_ioeventfd data; 3879 3880 r = -EFAULT; 3881 if (copy_from_user(&data, argp, sizeof(data))) 3882 goto out; 3883 r = kvm_ioeventfd(kvm, &data); 3884 break; 3885 } 3886 #ifdef CONFIG_HAVE_KVM_MSI 3887 case KVM_SIGNAL_MSI: { 3888 struct kvm_msi msi; 3889 3890 r = -EFAULT; 3891 if (copy_from_user(&msi, argp, sizeof(msi))) 3892 goto out; 3893 r = kvm_send_userspace_msi(kvm, &msi); 3894 break; 3895 } 3896 #endif 3897 #ifdef __KVM_HAVE_IRQ_LINE 3898 case KVM_IRQ_LINE_STATUS: 3899 case KVM_IRQ_LINE: { 3900 struct kvm_irq_level irq_event; 3901 3902 r = -EFAULT; 3903 if (copy_from_user(&irq_event, argp, sizeof(irq_event))) 3904 goto out; 3905 3906 r = kvm_vm_ioctl_irq_line(kvm, &irq_event, 3907 ioctl == KVM_IRQ_LINE_STATUS); 3908 if (r) 3909 goto out; 3910 3911 r = -EFAULT; 3912 if (ioctl == KVM_IRQ_LINE_STATUS) { 3913 if (copy_to_user(argp, &irq_event, sizeof(irq_event))) 3914 goto out; 3915 } 3916 3917 r = 0; 3918 break; 3919 } 3920 #endif 3921 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 3922 case KVM_SET_GSI_ROUTING: { 3923 struct kvm_irq_routing routing; 3924 struct kvm_irq_routing __user *urouting; 3925 struct kvm_irq_routing_entry *entries = NULL; 3926 3927 r = -EFAULT; 3928 if (copy_from_user(&routing, argp, sizeof(routing))) 3929 goto out; 3930 r = -EINVAL; 3931 if (!kvm_arch_can_set_irq_routing(kvm)) 3932 goto out; 3933 if (routing.nr > KVM_MAX_IRQ_ROUTES) 3934 goto out; 3935 if (routing.flags) 3936 goto out; 3937 if (routing.nr) { 3938 urouting = argp; 3939 entries = vmemdup_user(urouting->entries, 3940 array_size(sizeof(*entries), 3941 routing.nr)); 3942 if (IS_ERR(entries)) { 3943 r = PTR_ERR(entries); 3944 goto out; 3945 } 3946 } 3947 r = kvm_set_irq_routing(kvm, entries, routing.nr, 3948 routing.flags); 3949 kvfree(entries); 3950 break; 3951 } 3952 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ 3953 case KVM_CREATE_DEVICE: { 3954 struct kvm_create_device cd; 3955 3956 r = -EFAULT; 3957 if (copy_from_user(&cd, argp, sizeof(cd))) 3958 goto out; 
#ifdef CONFIG_KVM_COMPAT
/*
 * Layout of the KVM_GET_DIRTY_LOG argument as passed by a compat caller:
 * the bitmap pointer is a compat_uptr_t overlaid on a 64-bit padding field.
 */
struct compat_kvm_dirty_log {
	__u32 slot;
	__u32 padding1;
	union {
		compat_uptr_t dirty_bitmap; /* one bit per page */
		__u64 padding2;
	};
};

/*
 * compat ioctl handler for a VM file descriptor.
 *
 * KVM_GET_DIRTY_LOG is translated from struct compat_kvm_dirty_log to
 * struct kvm_dirty_log; every other command is forwarded unchanged to
 * kvm_vm_ioctl().
 *
 * Returns -EIO when current->mm differs from kvm->mm, -EFAULT when the
 * argument cannot be copied in, otherwise the callee's result.
 */
static long kvm_vm_compat_ioctl(struct file *filp,
			   unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	int r;

	if (kvm->mm != current->mm)
		return -EIO;
	switch (ioctl) {
	case KVM_GET_DIRTY_LOG: {
		struct compat_kvm_dirty_log compat_log;
		struct kvm_dirty_log log;

		/*
		 * Convert the compat argument with compat_ptr(), as
		 * kvm_vcpu_compat_ioctl() does, instead of a raw cast.
		 */
		if (copy_from_user(&compat_log, compat_ptr(arg),
				   sizeof(compat_log)))
			return -EFAULT;
		log.slot	 = compat_log.slot;
		log.padding1	 = compat_log.padding1;
		log.padding2	 = compat_log.padding2;
		/* Assigned last: it shares the union with padding2. */
		log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);

		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
		break;
	}
	default:
		r = kvm_vm_ioctl(filp, ioctl, arg);
	}
	return r;
}
#endif
0) 4044 goto put_kvm; 4045 #endif 4046 r = get_unused_fd_flags(O_CLOEXEC); 4047 if (r < 0) 4048 goto put_kvm; 4049 4050 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); 4051 if (IS_ERR(file)) { 4052 put_unused_fd(r); 4053 r = PTR_ERR(file); 4054 goto put_kvm; 4055 } 4056 4057 /* 4058 * Don't call kvm_put_kvm anymore at this point; file->f_op is 4059 * already set, with ->release() being kvm_vm_release(). In error 4060 * cases it will be called by the final fput(file) and will take 4061 * care of doing kvm_put_kvm(kvm). 4062 */ 4063 if (kvm_create_vm_debugfs(kvm, r) < 0) { 4064 put_unused_fd(r); 4065 fput(file); 4066 return -ENOMEM; 4067 } 4068 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm); 4069 4070 fd_install(r, file); 4071 return r; 4072 4073 put_kvm: 4074 kvm_put_kvm(kvm); 4075 return r; 4076 } 4077 4078 static long kvm_dev_ioctl(struct file *filp, 4079 unsigned int ioctl, unsigned long arg) 4080 { 4081 long r = -EINVAL; 4082 4083 switch (ioctl) { 4084 case KVM_GET_API_VERSION: 4085 if (arg) 4086 goto out; 4087 r = KVM_API_VERSION; 4088 break; 4089 case KVM_CREATE_VM: 4090 r = kvm_dev_ioctl_create_vm(arg); 4091 break; 4092 case KVM_CHECK_EXTENSION: 4093 r = kvm_vm_ioctl_check_extension_generic(NULL, arg); 4094 break; 4095 case KVM_GET_VCPU_MMAP_SIZE: 4096 if (arg) 4097 goto out; 4098 r = PAGE_SIZE; /* struct kvm_run */ 4099 #ifdef CONFIG_X86 4100 r += PAGE_SIZE; /* pio data page */ 4101 #endif 4102 #ifdef CONFIG_KVM_MMIO 4103 r += PAGE_SIZE; /* coalesced mmio ring page */ 4104 #endif 4105 break; 4106 case KVM_TRACE_ENABLE: 4107 case KVM_TRACE_PAUSE: 4108 case KVM_TRACE_DISABLE: 4109 r = -EOPNOTSUPP; 4110 break; 4111 default: 4112 return kvm_arch_dev_ioctl(filp, ioctl, arg); 4113 } 4114 out: 4115 return r; 4116 } 4117 4118 static struct file_operations kvm_chardev_ops = { 4119 .unlocked_ioctl = kvm_dev_ioctl, 4120 .llseek = noop_llseek, 4121 KVM_COMPAT(kvm_dev_ioctl), 4122 }; 4123 4124 static struct miscdevice kvm_dev = { 4125 KVM_MINOR, 4126 
"kvm", 4127 &kvm_chardev_ops, 4128 }; 4129 4130 static void hardware_enable_nolock(void *junk) 4131 { 4132 int cpu = raw_smp_processor_id(); 4133 int r; 4134 4135 if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) 4136 return; 4137 4138 cpumask_set_cpu(cpu, cpus_hardware_enabled); 4139 4140 r = kvm_arch_hardware_enable(); 4141 4142 if (r) { 4143 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 4144 atomic_inc(&hardware_enable_failed); 4145 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu); 4146 } 4147 } 4148 4149 static int kvm_starting_cpu(unsigned int cpu) 4150 { 4151 raw_spin_lock(&kvm_count_lock); 4152 if (kvm_usage_count) 4153 hardware_enable_nolock(NULL); 4154 raw_spin_unlock(&kvm_count_lock); 4155 return 0; 4156 } 4157 4158 static void hardware_disable_nolock(void *junk) 4159 { 4160 int cpu = raw_smp_processor_id(); 4161 4162 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) 4163 return; 4164 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 4165 kvm_arch_hardware_disable(); 4166 } 4167 4168 static int kvm_dying_cpu(unsigned int cpu) 4169 { 4170 raw_spin_lock(&kvm_count_lock); 4171 if (kvm_usage_count) 4172 hardware_disable_nolock(NULL); 4173 raw_spin_unlock(&kvm_count_lock); 4174 return 0; 4175 } 4176 4177 static void hardware_disable_all_nolock(void) 4178 { 4179 BUG_ON(!kvm_usage_count); 4180 4181 kvm_usage_count--; 4182 if (!kvm_usage_count) 4183 on_each_cpu(hardware_disable_nolock, NULL, 1); 4184 } 4185 4186 static void hardware_disable_all(void) 4187 { 4188 raw_spin_lock(&kvm_count_lock); 4189 hardware_disable_all_nolock(); 4190 raw_spin_unlock(&kvm_count_lock); 4191 } 4192 4193 static int hardware_enable_all(void) 4194 { 4195 int r = 0; 4196 4197 raw_spin_lock(&kvm_count_lock); 4198 4199 kvm_usage_count++; 4200 if (kvm_usage_count == 1) { 4201 atomic_set(&hardware_enable_failed, 0); 4202 on_each_cpu(hardware_enable_nolock, NULL, 1); 4203 4204 if (atomic_read(&hardware_enable_failed)) { 4205 hardware_disable_all_nolock(); 4206 r = -EBUSY; 4207 } 
4208 } 4209 4210 raw_spin_unlock(&kvm_count_lock); 4211 4212 return r; 4213 } 4214 4215 static int kvm_reboot(struct notifier_block *notifier, unsigned long val, 4216 void *v) 4217 { 4218 /* 4219 * Some (well, at least mine) BIOSes hang on reboot if 4220 * in vmx root mode. 4221 * 4222 * And Intel TXT required VMX off for all cpu when system shutdown. 4223 */ 4224 pr_info("kvm: exiting hardware virtualization\n"); 4225 kvm_rebooting = true; 4226 on_each_cpu(hardware_disable_nolock, NULL, 1); 4227 return NOTIFY_OK; 4228 } 4229 4230 static struct notifier_block kvm_reboot_notifier = { 4231 .notifier_call = kvm_reboot, 4232 .priority = 0, 4233 }; 4234 4235 static void kvm_io_bus_destroy(struct kvm_io_bus *bus) 4236 { 4237 int i; 4238 4239 for (i = 0; i < bus->dev_count; i++) { 4240 struct kvm_io_device *pos = bus->range[i].dev; 4241 4242 kvm_iodevice_destructor(pos); 4243 } 4244 kfree(bus); 4245 } 4246 4247 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1, 4248 const struct kvm_io_range *r2) 4249 { 4250 gpa_t addr1 = r1->addr; 4251 gpa_t addr2 = r2->addr; 4252 4253 if (addr1 < addr2) 4254 return -1; 4255 4256 /* If r2->len == 0, match the exact address. If r2->len != 0, 4257 * accept any overlapping write. Any order is acceptable for 4258 * overlapping ranges, because kvm_io_bus_get_first_dev ensures 4259 * we process all of them. 
4260 */ 4261 if (r2->len) { 4262 addr1 += r1->len; 4263 addr2 += r2->len; 4264 } 4265 4266 if (addr1 > addr2) 4267 return 1; 4268 4269 return 0; 4270 } 4271 4272 static int kvm_io_bus_sort_cmp(const void *p1, const void *p2) 4273 { 4274 return kvm_io_bus_cmp(p1, p2); 4275 } 4276 4277 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, 4278 gpa_t addr, int len) 4279 { 4280 struct kvm_io_range *range, key; 4281 int off; 4282 4283 key = (struct kvm_io_range) { 4284 .addr = addr, 4285 .len = len, 4286 }; 4287 4288 range = bsearch(&key, bus->range, bus->dev_count, 4289 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp); 4290 if (range == NULL) 4291 return -ENOENT; 4292 4293 off = range - bus->range; 4294 4295 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0) 4296 off--; 4297 4298 return off; 4299 } 4300 4301 static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 4302 struct kvm_io_range *range, const void *val) 4303 { 4304 int idx; 4305 4306 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 4307 if (idx < 0) 4308 return -EOPNOTSUPP; 4309 4310 while (idx < bus->dev_count && 4311 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 4312 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr, 4313 range->len, val)) 4314 return idx; 4315 idx++; 4316 } 4317 4318 return -EOPNOTSUPP; 4319 } 4320 4321 /* kvm_io_bus_write - called under kvm->slots_lock */ 4322 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 4323 int len, const void *val) 4324 { 4325 struct kvm_io_bus *bus; 4326 struct kvm_io_range range; 4327 int r; 4328 4329 range = (struct kvm_io_range) { 4330 .addr = addr, 4331 .len = len, 4332 }; 4333 4334 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 4335 if (!bus) 4336 return -ENOMEM; 4337 r = __kvm_io_bus_write(vcpu, bus, &range, val); 4338 return r < 0 ? 
r : 0; 4339 } 4340 EXPORT_SYMBOL_GPL(kvm_io_bus_write); 4341 4342 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */ 4343 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, 4344 gpa_t addr, int len, const void *val, long cookie) 4345 { 4346 struct kvm_io_bus *bus; 4347 struct kvm_io_range range; 4348 4349 range = (struct kvm_io_range) { 4350 .addr = addr, 4351 .len = len, 4352 }; 4353 4354 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 4355 if (!bus) 4356 return -ENOMEM; 4357 4358 /* First try the device referenced by cookie. */ 4359 if ((cookie >= 0) && (cookie < bus->dev_count) && 4360 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0)) 4361 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len, 4362 val)) 4363 return cookie; 4364 4365 /* 4366 * cookie contained garbage; fall back to search and return the 4367 * correct cookie value. 4368 */ 4369 return __kvm_io_bus_write(vcpu, bus, &range, val); 4370 } 4371 4372 static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 4373 struct kvm_io_range *range, void *val) 4374 { 4375 int idx; 4376 4377 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 4378 if (idx < 0) 4379 return -EOPNOTSUPP; 4380 4381 while (idx < bus->dev_count && 4382 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 4383 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr, 4384 range->len, val)) 4385 return idx; 4386 idx++; 4387 } 4388 4389 return -EOPNOTSUPP; 4390 } 4391 4392 /* kvm_io_bus_read - called under kvm->slots_lock */ 4393 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 4394 int len, void *val) 4395 { 4396 struct kvm_io_bus *bus; 4397 struct kvm_io_range range; 4398 int r; 4399 4400 range = (struct kvm_io_range) { 4401 .addr = addr, 4402 .len = len, 4403 }; 4404 4405 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 4406 if (!bus) 4407 return -ENOMEM; 4408 r = __kvm_io_bus_read(vcpu, bus, 
&range, val); 4409 return r < 0 ? r : 0; 4410 } 4411 4412 /* Caller must hold slots_lock. */ 4413 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 4414 int len, struct kvm_io_device *dev) 4415 { 4416 int i; 4417 struct kvm_io_bus *new_bus, *bus; 4418 struct kvm_io_range range; 4419 4420 bus = kvm_get_bus(kvm, bus_idx); 4421 if (!bus) 4422 return -ENOMEM; 4423 4424 /* exclude ioeventfd which is limited by maximum fd */ 4425 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) 4426 return -ENOSPC; 4427 4428 new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1), 4429 GFP_KERNEL_ACCOUNT); 4430 if (!new_bus) 4431 return -ENOMEM; 4432 4433 range = (struct kvm_io_range) { 4434 .addr = addr, 4435 .len = len, 4436 .dev = dev, 4437 }; 4438 4439 for (i = 0; i < bus->dev_count; i++) 4440 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0) 4441 break; 4442 4443 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); 4444 new_bus->dev_count++; 4445 new_bus->range[i] = range; 4446 memcpy(new_bus->range + i + 1, bus->range + i, 4447 (bus->dev_count - i) * sizeof(struct kvm_io_range)); 4448 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 4449 synchronize_srcu_expedited(&kvm->srcu); 4450 kfree(bus); 4451 4452 return 0; 4453 } 4454 4455 /* Caller must hold slots_lock. 
*/ 4456 void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, 4457 struct kvm_io_device *dev) 4458 { 4459 int i, j; 4460 struct kvm_io_bus *new_bus, *bus; 4461 4462 bus = kvm_get_bus(kvm, bus_idx); 4463 if (!bus) 4464 return; 4465 4466 for (i = 0; i < bus->dev_count; i++) 4467 if (bus->range[i].dev == dev) { 4468 break; 4469 } 4470 4471 if (i == bus->dev_count) 4472 return; 4473 4474 new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1), 4475 GFP_KERNEL_ACCOUNT); 4476 if (new_bus) { 4477 memcpy(new_bus, bus, struct_size(bus, range, i)); 4478 new_bus->dev_count--; 4479 memcpy(new_bus->range + i, bus->range + i + 1, 4480 flex_array_size(new_bus, range, new_bus->dev_count - i)); 4481 } else { 4482 pr_err("kvm: failed to shrink bus, removing it completely\n"); 4483 for (j = 0; j < bus->dev_count; j++) { 4484 if (j == i) 4485 continue; 4486 kvm_iodevice_destructor(bus->range[j].dev); 4487 } 4488 } 4489 4490 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 4491 synchronize_srcu_expedited(&kvm->srcu); 4492 kfree(bus); 4493 return; 4494 } 4495 4496 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, 4497 gpa_t addr) 4498 { 4499 struct kvm_io_bus *bus; 4500 int dev_idx, srcu_idx; 4501 struct kvm_io_device *iodev = NULL; 4502 4503 srcu_idx = srcu_read_lock(&kvm->srcu); 4504 4505 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); 4506 if (!bus) 4507 goto out_unlock; 4508 4509 dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1); 4510 if (dev_idx < 0) 4511 goto out_unlock; 4512 4513 iodev = bus->range[dev_idx].dev; 4514 4515 out_unlock: 4516 srcu_read_unlock(&kvm->srcu, srcu_idx); 4517 4518 return iodev; 4519 } 4520 EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev); 4521 4522 static int kvm_debugfs_open(struct inode *inode, struct file *file, 4523 int (*get)(void *, u64 *), int (*set)(void *, u64), 4524 const char *fmt) 4525 { 4526 struct kvm_stat_data *stat_data = (struct kvm_stat_data *) 4527 inode->i_private; 4528 4529 /* The 
debugfs files are a reference to the kvm struct which
	 * is still valid when kvm_destroy_vm is called.
	 * To avoid the race between open and the removal of the debugfs
	 * directory we test against the users count.
	 */
	if (!refcount_inc_not_zero(&stat_data->kvm->users_count))
		return -ENOENT;

	if (simple_attr_open(inode, file, get,
		    KVM_DBGFS_GET_MODE(stat_data->dbgfs_item) & 0222
		    ? set : NULL,
		    fmt)) {
		kvm_put_kvm(stat_data->kvm);
		return -ENOMEM;
	}

	return 0;
}

/*
 * Release handler for the per-VM debugfs stat files.
 *
 * Tears down the simple_attr state attached to @file and then calls
 * kvm_put_kvm() on the VM recorded in the inode's private kvm_stat_data.
 * The open path takes a users_count reference before simple_attr_open()
 * and uses the same kvm_put_kvm() call to undo it on failure, so this is
 * the matching put for a successful open.
 *
 * Always returns 0.
 */
static int kvm_debugfs_release(struct inode *inode, struct file *file)
{
	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
					  inode->i_private;

	simple_attr_release(inode, file);
	kvm_put_kvm(stat_data->kvm);

	return 0;
}

/*
 * Read one per-VM statistic.
 *
 * @kvm:    VM whose counter is read.
 * @offset: byte offset of the counter from the start of @kvm.
 * @val:    receives the counter value.
 *
 * The counter is accessed as a ulong and stored into the u64 @val.
 * Always returns 0.
 */
static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
{
	*val = *(ulong *)((void *)kvm + offset);

	return 0;
}

/*
 * Reset one per-VM statistic to zero.
 *
 * @kvm:    VM whose counter is cleared.
 * @offset: byte offset of the ulong counter from the start of @kvm.
 *
 * Always returns 0.
 */
static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
{
	*(ulong *)((void *)kvm + offset) = 0;

	return 0;
}

/*
 * Sum one per-vCPU statistic over the vCPUs of a VM.
 *
 * @kvm:    VM whose vCPUs are walked with kvm_for_each_vcpu().
 * @offset: byte offset of the counter from the start of each vCPU.
 * @val:    receives the sum; it is zeroed first, so a VM for which the
 *          iterator visits no vCPU reports 0.
 *
 * Unlike the per-VM helpers, the counter is accessed as a u64.
 * Always returns 0.
 */
static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
{
	int i;
	struct kvm_vcpu *vcpu;

	*val = 0;

	kvm_for_each_vcpu(i, vcpu, kvm)
		*val += *(u64 *)((void *)vcpu + offset);

	return 0;
}

/*
 * Reset one per-vCPU statistic to zero on every vCPU of a VM.
 *
 * @kvm:    VM whose vCPUs are walked with kvm_for_each_vcpu().
 * @offset: byte offset of the u64 counter from the start of each vCPU.
 *
 * Always returns 0.
 */
static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
{
	int i;
	struct kvm_vcpu *vcpu;

	kvm_for_each_vcpu(i, vcpu, kvm)
		*(u64 *)((void *)vcpu + offset) = 0;

	return 0;
}

/*
 * "get" callback handed to kvm_debugfs_open() for the per-VM stat files.
 *
 * @data: the struct kvm_stat_data describing which VM and which stat.
 * @val:  receives the value.
 *
 * Dispatches on the stat kind to the per-VM or per-vCPU reader.  Returns
 * that helper's result, or -EFAULT when the kind is neither KVM_STAT_VM
 * nor KVM_STAT_VCPU (r keeps its initial value in that case).
 */
static int kvm_stat_data_get(void *data, u64 *val)
{
	int r = -EFAULT;
	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;

	switch (stat_data->dbgfs_item->kind) {
	case KVM_STAT_VM:
		r = kvm_get_stat_per_vm(stat_data->kvm,
					stat_data->dbgfs_item->offset, val);
		break;
	case KVM_STAT_VCPU:
		r = kvm_get_stat_per_vcpu(stat_data->kvm,
					  stat_data->dbgfs_item->offset, val);
		break;
	}

	return r;
}

/*
 * "set" callback handed to kvm_debugfs_open() for the per-VM stat files.
 *
 * @data: the struct kvm_stat_data describing which VM and which stat.
 * @val:  value written by the user.
 *
 * The only supported write is 0, which clears the stat; any other value
 * is rejected with -EINVAL.  Returns the clear helper's result, or
 * -EFAULT when the kind is neither KVM_STAT_VM nor KVM_STAT_VCPU.
 */
static int kvm_stat_data_clear(void *data, u64 val)
{
	int r = -EFAULT;
	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;

	if (val)
		return -EINVAL;

	switch (stat_data->dbgfs_item->kind) {
	case KVM_STAT_VM:
		r = kvm_clear_stat_per_vm(stat_data->kvm,
					  stat_data->dbgfs_item->offset);
		break;
	case KVM_STAT_VCPU:
		r = kvm_clear_stat_per_vcpu(stat_data->kvm,
					    stat_data->dbgfs_item->offset);
		break;
	}

	return r;
}

/*
 * Open handler for the per-VM stat files: wires the get/clear callbacks
 * above and the "%llu\n" output format into kvm_debugfs_open().
 */
static int kvm_stat_data_open(struct inode *inode, struct file *file)
{
	/*
	 * NOTE(review): presumably a build-time check that the format
	 * string matches a 64-bit unsigned argument - confirm against the
	 * helper's definition.
	 */
	__simple_attr_check_format("%llu\n", 0ull);
	return kvm_debugfs_open(inode, file, kvm_stat_data_get,
				kvm_stat_data_clear, "%llu\n");
}

/* File operations used for the per-VM stat files. */
static const struct file_operations stat_fops_per_vm = {
	.owner  = THIS_MODULE,
	.open   = kvm_stat_data_open,
	.release = kvm_debugfs_release,
	.read   = simple_attr_read,
	.write  = simple_attr_write,
	.llseek = no_llseek,
};

/*
 * Global (all-VM) reader for a per-VM stat.
 *
 * @_offset: the stat's byte offset, carried in the pointer value itself
 *           (kvm_init_debug() stores it as (void *)(long)offset).
 * @val:     receives the sum over every VM on vm_list.
 *
 * vm_list is walked with kvm_lock held.  Always returns 0.
 */
static int vm_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;
	u64 tmp_val;

	*val = 0;
	mutex_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list) {
		kvm_get_stat_per_vm(kvm, offset, &tmp_val);
		*val += tmp_val;
	}
	mutex_unlock(&kvm_lock);
	return 0;
}

/*
 * Global (all-VM) clear for a per-VM stat.
 *
 * @_offset: the stat's byte offset, carried in the pointer value.
 * @val:     value written by the user; only 0 is accepted.
 *
 * Returns -EINVAL for a non-zero @val, otherwise clears the stat on every
 * VM on vm_list under kvm_lock and returns 0.
 */
static int vm_stat_clear(void *_offset, u64 val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;

	if (val)
		return -EINVAL;

	mutex_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list) {
		kvm_clear_stat_per_vm(kvm, offset);
	}
	mutex_unlock(&kvm_lock);

	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");

/*
 * Global (all-VM) reader for a per-vCPU stat.
 *
 * @_offset: the stat's byte offset, carried in the pointer value.
 * @val:     receives the sum of each VM's per-vCPU total.
 *
 * vm_list is walked with kvm_lock held.  Always returns 0.
 */
static int vcpu_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;
	u64 tmp_val;

	*val = 0;
	mutex_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list) {
		kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
		*val += tmp_val;
	}
	mutex_unlock(&kvm_lock);
	return 0;
}

/*
 * Global (all-VM) clear for a per-vCPU stat.
 *
 * @_offset: the stat's byte offset, carried in the pointer value.
 * @val:     value written by the user; only 0 is accepted.
 *
 * Returns -EINVAL for a non-zero @val, otherwise clears the stat on every
 * vCPU of every VM on vm_list under kvm_lock and returns 0.
 */
static int vcpu_stat_clear(void *_offset, u64 val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;

	if (val)
		return -EINVAL;

	mutex_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list) {
		kvm_clear_stat_per_vcpu(kvm, offset);
	}
	mutex_unlock(&kvm_lock);

	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
			"%llu\n");

/* Global stat file operations, indexed by stat kind. */
static const struct file_operations *stat_fops[] = {
	[KVM_STAT_VCPU] = &vcpu_stat_fops,
	[KVM_STAT_VM]   = &vm_stat_fops,
};

/*
 * Update the VM counters and send a KOBJ_CHANGE uevent for a VM event.
 *
 * @type: KVM_EVENT_CREATE_VM or KVM_EVENT_DESTROY_VM; any other value
 *        leaves the counters alone and adds no EVENT= variable.
 * @kvm:  VM the event is about.
 *
 * Does nothing when kvm_dev has no device yet or @kvm is NULL.  The
 * environment carries CREATED, COUNT, EVENT, PID and, when the VM has a
 * usable debugfs dentry, STATS_PATH.
 *
 * The counters are updated before the environment is allocated, so an
 * allocation failure drops the uevent but keeps the counter change.
 */
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
{
	struct kobj_uevent_env *env;
	unsigned long long created, active;

	if (!kvm_dev.this_device || !kvm)
		return;

	/* Counters are modified and snapshotted under kvm_lock. */
	mutex_lock(&kvm_lock);
	if (type == KVM_EVENT_CREATE_VM) {
		kvm_createvm_count++;
		kvm_active_vms++;
	} else if (type == KVM_EVENT_DESTROY_VM) {
		kvm_active_vms--;
	}
	created = kvm_createvm_count;
	active = kvm_active_vms;
	mutex_unlock(&kvm_lock);

	env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
	if (!env)
		return;

	add_uevent_var(env, "CREATED=%llu", created);
	add_uevent_var(env, "COUNT=%llu", active);

	if (type == KVM_EVENT_CREATE_VM) {
		add_uevent_var(env, "EVENT=create");
		/* Remember the creator so later events report the same PID. */
		kvm->userspace_pid = task_pid_nr(current);
	} else if (type == KVM_EVENT_DESTROY_VM) {
		add_uevent_var(env, "EVENT=destroy");
	}
	add_uevent_var(env, "PID=%d", kvm->userspace_pid);

	if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) {
		/* Scratch buffer for the path; freed once the var is added. */
		char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);

		if (p) {
			tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
			if (!IS_ERR(tmp))
				add_uevent_var(env, "STATS_PATH=%s", tmp);
			kfree(p);
		}
	}
	/* no need for checks, since we are adding at most only 5 keys */
	env->envp[env->envp_idx++] = NULL;
	kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
	kfree(env);
}

/*
 * Create the "kvm" debugfs directory and one global stat file per entry
 * of debugfs_entries (the table ends at the first entry with a NULL
 * name).  Each file gets the stat's offset as its private data and the
 * fops matching its kind.  kvm_debugfs_num_entries is reset and ends up
 * holding the number of entries walked.
 */
static void kvm_init_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);

	kvm_debugfs_num_entries = 0;
	for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
		debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
				    kvm_debugfs_dir, (void *)(long)p->offset,
				    stat_fops[p->kind]);
	}
}

/*
 * syscore suspend hook: calls hardware_disable_nolock() when
 * kvm_usage_count is non-zero.  Always returns 0.
 */
static int kvm_suspend(void)
{
	if (kvm_usage_count)
		hardware_disable_nolock(NULL);
	return 0;
}

/*
 * syscore resume hook: calls hardware_enable_nolock() when
 * kvm_usage_count is non-zero.  With lockdep enabled it first warns if
 * kvm_count_lock is held.
 */
static void kvm_resume(void)
{
	if (kvm_usage_count) {
#ifdef CONFIG_LOCKDEP
		WARN_ON(lockdep_is_held(&kvm_count_lock));
#endif
		hardware_enable_nolock(NULL);
	}
}

/* Suspend/resume hooks; registered in kvm_init(), removed in kvm_exit(). */
static struct syscore_ops kvm_syscore_ops = {
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

/* Map a preempt notifier back to the vCPU that embeds it. */
static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
	return container_of(pn, struct kvm_vcpu, preempt_notifier);
}

/*
 * Preempt notifier "sched in" hook, installed into kvm_preempt_ops by
 * kvm_init().
 *
 * Clears the vCPU's preempted and ready flags, records the vCPU in this
 * CPU's kvm_running_vcpu slot and then calls the arch sched-in and load
 * hooks with @cpu.
 */
static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	WRITE_ONCE(vcpu->preempted, false);
	WRITE_ONCE(vcpu->ready, false);

	__this_cpu_write(kvm_running_vcpu, vcpu);
	kvm_arch_sched_in(vcpu, cpu);
	kvm_arch_vcpu_load(vcpu, cpu);
}

/*
 * Preempt notifier "sched out" hook, installed into kvm_preempt_ops by
 * kvm_init().
 *
 * If the current task is still in TASK_RUNNING state when it is switched
 * out, the vCPU is flagged as preempted and ready.  In all cases the arch
 * put hook is called and this CPU's kvm_running_vcpu slot is cleared.
 */
static void kvm_sched_out(struct preempt_notifier *pn,
			  struct task_struct *next)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	if (current->state == TASK_RUNNING) {
		WRITE_ONCE(vcpu->preempted, true);
		WRITE_ONCE(vcpu->ready, true);
	}
	kvm_arch_vcpu_put(vcpu);
	__this_cpu_write(kvm_running_vcpu, NULL);
}

/**
 * kvm_get_running_vcpu - get the vcpu running on the current CPU.
 *
 * We can disable preemption locally around accessing the per-CPU variable,
 * and use the resolved vcpu pointer after enabling preemption again,
 * because even if the current thread is migrated to another CPU, reading
 * the per-CPU value later will give us the same value as we update the
 * per-CPU variable in the preempt notifier handlers.
 */
struct kvm_vcpu *kvm_get_running_vcpu(void)
{
	struct kvm_vcpu *vcpu;

	preempt_disable();
	vcpu = __this_cpu_read(kvm_running_vcpu);
	preempt_enable();

	return vcpu;
}
EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);

/**
 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
 */
struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
{
        return &kvm_running_vcpu;
}

/*
 * Argument bundle for check_processor_compat(): the opaque value to pass
 * to the arch check and where to store its result.
 */
struct kvm_cpu_compat_check {
	void *opaque;
	int *ret;
};

/*
 * Cross-CPU callback used by kvm_init(): runs the arch compatibility
 * check and writes its result through the caller-supplied pointer.
 */
static void check_processor_compat(void *data)
{
	struct kvm_cpu_compat_check *c = data;

	*c->ret = kvm_arch_check_processor_compat(c->opaque);
}

/*
 * Bring up the KVM core.
 *
 * @opaque:     passed through to kvm_arch_init(), kvm_arch_hardware_setup()
 *              and the per-CPU compatibility check.
 * @vcpu_size:  object size for the "kvm_vcpu" kmem cache.
 * @vcpu_align: alignment for that cache; 0 selects the natural alignment
 *              of struct kvm_vcpu.
 * @module:     recorded as owner of the chardev, VM and vCPU file ops.
 *
 * Steps run in the order written below and the out_* labels undo them in
 * reverse, each label falling through to the ones after it.  Note that
 * each goto targets the label that undoes the *previous* successful step,
 * so the label names do not describe the step that failed.
 *
 * Returns 0 on success or the error from the step that failed.  A failure
 * of kvm_vfio_ops_init() only triggers a WARN; 0 is still returned.
 */
int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
		  struct module *module)
{
	struct kvm_cpu_compat_check c;
	int r;
	int cpu;

	r = kvm_arch_init(opaque);
	if (r)
		goto out_fail;

	/*
	 * kvm_arch_init makes sure there's at most one caller
	 * for architectures that support multiple implementations,
	 * like intel and amd on x86.
	 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
	 * conflicts in case kvm is already setup for another implementation.
	 */
	r = kvm_irqfd_init();
	if (r)
		goto out_irqfd;

	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
		r = -ENOMEM;
		goto out_free_0;
	}

	r = kvm_arch_hardware_setup(opaque);
	if (r < 0)
		goto out_free_1;

	/*
	 * Run the compatibility check on each online CPU in turn.  The
	 * callback stores its result straight into r, and the call is made
	 * with wait=1, so r is up to date when it is tested.
	 */
	c.ret = &r;
	c.opaque = opaque;
	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu, check_processor_compat, &c, 1);
		if (r < 0)
			goto out_free_2;
	}

	r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
				      kvm_starting_cpu, kvm_dying_cpu);
	if (r)
		goto out_free_2;
	register_reboot_notifier(&kvm_reboot_notifier);

	/* A kmem cache lets us meet the alignment requirements of fx_save. */
	if (!vcpu_align)
		vcpu_align = __alignof__(struct kvm_vcpu);
	/* Only the arch member of struct kvm_vcpu is whitelisted for usercopy. */
	kvm_vcpu_cache =
		kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
					   SLAB_ACCOUNT,
					   offsetof(struct kvm_vcpu, arch),
					   sizeof_field(struct kvm_vcpu, arch),
					   NULL);
	if (!kvm_vcpu_cache) {
		r = -ENOMEM;
		goto out_free_3;
	}

	r = kvm_async_pf_init();
	if (r)
		goto out_free;

	kvm_chardev_ops.owner = module;
	kvm_vm_fops.owner = module;
	kvm_vcpu_fops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
		pr_err("kvm: misc device register failed\n");
		goto out_unreg;
	}

	register_syscore_ops(&kvm_syscore_ops);

	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

	kvm_init_debug();

	/* Not fatal: warn and carry on without unwinding. */
	r = kvm_vfio_ops_init();
	WARN_ON(r);

	return 0;

out_unreg:
	kvm_async_pf_deinit();
out_free:
	kmem_cache_destroy(kvm_vcpu_cache);
out_free_3:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
out_free_2:
	kvm_arch_hardware_unsetup();
out_free_1:
	free_cpumask_var(cpus_hardware_enabled);
out_free_0:
	kvm_irqfd_exit();
out_irqfd:
	kvm_arch_exit();
out_fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

/*
 * Tear down what kvm_init() set up: debugfs tree, misc device, vCPU
 * cache, async page fault support, syscore ops, reboot notifier and CPU
 * hotplug state.  Then run hardware_disable_nolock() on every CPU (the
 * call waits for completion) before the arch hardware/exit hooks, irqfd,
 * the cpumask and the VFIO ops are released.
 */
void kvm_exit(void)
{
	debugfs_remove_recursive(kvm_debugfs_dir);
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	kvm_async_pf_deinit();
	unregister_syscore_ops(&kvm_syscore_ops);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
	on_each_cpu(hardware_disable_nolock, NULL, 1);
	kvm_arch_hardware_unsetup();
	kvm_arch_exit();
	kvm_irqfd_exit();
	free_cpumask_var(cpus_hardware_enabled);
	kvm_vfio_ops_exit();
}
EXPORT_SYMBOL_GPL(kvm_exit);

/*
 * Hand-off data from kvm_vm_create_worker_thread() to the new thread.
 * It lives on the creator's stack, so the worker may only touch it until
 * it signals init_done.
 */
struct kvm_vm_worker_thread_context {
	struct kvm *kvm;
	struct task_struct *parent;	/* task whose cgroups and nice are copied */
	struct completion init_done;	/* signalled once err is valid */
	kvm_vm_thread_fn_t thread_fn;
	uintptr_t data;
	int err;			/* worker's initialization result */
};

/*
 * Entry point of a VM worker kthread.
 *
 * @context: the creator's struct kvm_vm_worker_thread_context.
 *
 * Initialization attaches the thread to the parent's cgroups and copies
 * the parent's nice value, then reports the outcome through init_done.
 * On failure the thread returns the error.  On success it parks itself
 * and, once resumed, runs thread_fn(kvm, data) unless it has been asked
 * to stop in the meantime.
 *
 * Returns the initialization error, or thread_fn's return value, or 0 if
 * the thread was stopped before thread_fn ran.
 */
static int kvm_vm_worker_thread(void *context)
{
	/*
	 * The init_context is allocated on the stack of the parent thread, so
	 * we have to locally copy anything that is needed beyond initialization
	 */
	struct kvm_vm_worker_thread_context *init_context = context;
	struct kvm *kvm = init_context->kvm;
	kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
	uintptr_t data = init_context->data;
	int err;

	err = kthread_park(current);
	/* kthread_park(current) is never supposed to return an error */
	WARN_ON(err != 0);
	if (err)
		goto init_complete;

	err = cgroup_attach_task_all(init_context->parent, current);
	if (err) {
		kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
			__func__, err);
		goto init_complete;
	}

	set_user_nice(current, task_nice(init_context->parent));

init_complete:
	init_context->err = err;
	complete(&init_context->init_done);
	/* The creator may return now; its stack context must not be used again. */
	init_context = NULL;

	if (err)
		return err;

	/* Wait to be woken up by the spawner before proceeding. */
	kthread_parkme();

	if (!kthread_should_stop())
		err = thread_fn(kvm, data);

	return err;
}

/*
 * Create a worker kthread for a VM and wait for it to initialize.
 *
 * @kvm:        VM passed to @thread_fn.
 * @thread_fn:  function the worker eventually runs.
 * @data:       opaque argument passed to @thread_fn.
 * @name:       thread name prefix; the thread is named "<name>-<pid>"
 *              using the calling task's PID.
 * @thread_ptr: set to NULL up front and to the new task only on success.
 *
 * Blocks until the worker has signalled init_done, which is what makes
 * passing the on-stack context safe.  Judging by the worker's own
 * comment, the thread stays parked until the caller wakes it.
 *
 * Returns 0 on success, the kthread_run() error if the thread could not
 * be created, or the worker's initialization error.
 */
int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
				uintptr_t data, const char *name,
				struct task_struct **thread_ptr)
{
	struct kvm_vm_worker_thread_context init_context = {};
	struct task_struct *thread;

	*thread_ptr = NULL;
	init_context.kvm = kvm;
	init_context.parent = current;
	init_context.thread_fn = thread_fn;
	init_context.data = data;
	init_completion(&init_context.init_done);

	thread = kthread_run(kvm_vm_worker_thread, &init_context,
			     "%s-%d", name, task_pid_nr(current));
	if (IS_ERR(thread))
		return PTR_ERR(thread);

	/* kthread_run is never supposed to return NULL */
	WARN_ON(thread == NULL);

	wait_for_completion(&init_context.init_done);

	if (!init_context.err)
		*thread_ptr = thread;

	return init_context.err;
}