1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Kernel-based Virtual Machine driver for Linux 4 * 5 * This module enables machines with Intel VT-x extensions to run virtual 6 * machines without emulation or binary translation. 7 * 8 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates. 10 * 11 * Authors: 12 * Avi Kivity <avi@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com> 14 */ 15 16 #include <kvm/iodev.h> 17 18 #include <linux/kvm_host.h> 19 #include <linux/kvm.h> 20 #include <linux/module.h> 21 #include <linux/errno.h> 22 #include <linux/percpu.h> 23 #include <linux/mm.h> 24 #include <linux/miscdevice.h> 25 #include <linux/vmalloc.h> 26 #include <linux/reboot.h> 27 #include <linux/debugfs.h> 28 #include <linux/highmem.h> 29 #include <linux/file.h> 30 #include <linux/syscore_ops.h> 31 #include <linux/cpu.h> 32 #include <linux/sched/signal.h> 33 #include <linux/sched/mm.h> 34 #include <linux/sched/stat.h> 35 #include <linux/cpumask.h> 36 #include <linux/smp.h> 37 #include <linux/anon_inodes.h> 38 #include <linux/profile.h> 39 #include <linux/kvm_para.h> 40 #include <linux/pagemap.h> 41 #include <linux/mman.h> 42 #include <linux/swap.h> 43 #include <linux/bitops.h> 44 #include <linux/spinlock.h> 45 #include <linux/compat.h> 46 #include <linux/srcu.h> 47 #include <linux/hugetlb.h> 48 #include <linux/slab.h> 49 #include <linux/sort.h> 50 #include <linux/bsearch.h> 51 #include <linux/io.h> 52 #include <linux/lockdep.h> 53 #include <linux/kthread.h> 54 55 #include <asm/processor.h> 56 #include <asm/ioctl.h> 57 #include <linux/uaccess.h> 58 #include <asm/pgtable.h> 59 60 #include "coalesced_mmio.h" 61 #include "async_pf.h" 62 #include "vfio.h" 63 64 #define CREATE_TRACE_POINTS 65 #include <trace/events/kvm.h> 66 67 /* Worst case buffer size needed for holding an integer. */ 68 #define ITOA_MAX_LEN 12 69 70 MODULE_AUTHOR("Qumranet"); 71 MODULE_LICENSE("GPL"); 72 73 /* Architectures should define their poll value according to the halt latency */ 74 unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT; 75 module_param(halt_poll_ns, uint, 0644); 76 EXPORT_SYMBOL_GPL(halt_poll_ns); 77 78 /* Default doubles per-vcpu halt_poll_ns. */ 79 unsigned int halt_poll_ns_grow = 2; 80 module_param(halt_poll_ns_grow, uint, 0644); 81 EXPORT_SYMBOL_GPL(halt_poll_ns_grow); 82 83 /* The start value to grow halt_poll_ns from */ 84 unsigned int halt_poll_ns_grow_start = 10000; /* 10us */ 85 module_param(halt_poll_ns_grow_start, uint, 0644); 86 EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start); 87 88 /* Default resets per-vcpu halt_poll_ns . 
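 * A halt_poll_ns_shrink value of 0 (the default) resets halt_poll_ns to 0
 * when shrinking, instead of dividing it.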
*/ 89 unsigned int halt_poll_ns_shrink; 90 module_param(halt_poll_ns_shrink, uint, 0644); 91 EXPORT_SYMBOL_GPL(halt_poll_ns_shrink); 92 93 /* 94 * Ordering of locks: 95 * 96 * kvm->lock --> kvm->slots_lock --> kvm->irq_lock 97 */ 98 99 DEFINE_MUTEX(kvm_lock); 100 static DEFINE_RAW_SPINLOCK(kvm_count_lock); 101 LIST_HEAD(vm_list); 102 103 static cpumask_var_t cpus_hardware_enabled; 104 static int kvm_usage_count; 105 static atomic_t hardware_enable_failed; 106 107 struct kmem_cache *kvm_vcpu_cache; 108 EXPORT_SYMBOL_GPL(kvm_vcpu_cache); 109 110 static __read_mostly struct preempt_ops kvm_preempt_ops; 111 112 struct dentry *kvm_debugfs_dir; 113 EXPORT_SYMBOL_GPL(kvm_debugfs_dir); 114 115 static int kvm_debugfs_num_entries; 116 static const struct file_operations *stat_fops_per_vm[]; 117 118 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 119 unsigned long arg); 120 #ifdef CONFIG_KVM_COMPAT 121 static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl, 122 unsigned long arg); 123 #define KVM_COMPAT(c) .compat_ioctl = (c) 124 #else 125 /* 126 * For architectures that don't implement a compat infrastructure, 127 * adopt a double line of defense: 128 * - Prevent a compat task from opening /dev/kvm 129 * - If the open has been done by a 64bit task, and the KVM fd 130 * passed to a compat task, let the ioctls fail. 131 */ 132 static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl, 133 unsigned long arg) { return -EINVAL; } 134 135 static int kvm_no_compat_open(struct inode *inode, struct file *file) 136 { 137 return is_compat_task() ? -ENODEV : 0; 138 } 139 #define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \ 140 .open = kvm_no_compat_open 141 #endif 142 static int hardware_enable_all(void); 143 static void hardware_disable_all(void); 144 145 static void kvm_io_bus_destroy(struct kvm_io_bus *bus); 146 147 static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn); 148 149 __visible bool kvm_rebooting; 150 EXPORT_SYMBOL_GPL(kvm_rebooting); 151 152 static bool largepages_enabled = true; 153 154 #define KVM_EVENT_CREATE_VM 0 155 #define KVM_EVENT_DESTROY_VM 1 156 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); 157 static unsigned long long kvm_createvm_count; 158 static unsigned long long kvm_active_vms; 159 160 __weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, 161 unsigned long start, unsigned long end, bool blockable) 162 { 163 return 0; 164 } 165 166 bool kvm_is_zone_device_pfn(kvm_pfn_t pfn) 167 { 168 /* 169 * The metadata used by is_zone_device_page() to determine whether or 170 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if 171 * the device has been pinned, e.g. by get_user_pages(). WARN if the 172 * page_count() is zero to help detect bad usage of this helper. 173 */ 174 if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn)))) 175 return false; 176 177 return is_zone_device_page(pfn_to_page(pfn)); 178 } 179 180 bool kvm_is_reserved_pfn(kvm_pfn_t pfn) 181 { 182 /* 183 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting 184 * perspective they are "normal" pages, albeit with slightly different 185 * usage rules. 
186 */ 187 if (pfn_valid(pfn)) 188 return PageReserved(pfn_to_page(pfn)) && 189 !kvm_is_zone_device_pfn(pfn); 190 191 return true; 192 } 193 194 /* 195 * Switches to specified vcpu, until a matching vcpu_put() 196 */ 197 void vcpu_load(struct kvm_vcpu *vcpu) 198 { 199 int cpu = get_cpu(); 200 preempt_notifier_register(&vcpu->preempt_notifier); 201 kvm_arch_vcpu_load(vcpu, cpu); 202 put_cpu(); 203 } 204 EXPORT_SYMBOL_GPL(vcpu_load); 205 206 void vcpu_put(struct kvm_vcpu *vcpu) 207 { 208 preempt_disable(); 209 kvm_arch_vcpu_put(vcpu); 210 preempt_notifier_unregister(&vcpu->preempt_notifier); 211 preempt_enable(); 212 } 213 EXPORT_SYMBOL_GPL(vcpu_put); 214 215 /* TODO: merge with kvm_arch_vcpu_should_kick */ 216 static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req) 217 { 218 int mode = kvm_vcpu_exiting_guest_mode(vcpu); 219 220 /* 221 * We need to wait for the VCPU to reenable interrupts and get out of 222 * READING_SHADOW_PAGE_TABLES mode. 223 */ 224 if (req & KVM_REQUEST_WAIT) 225 return mode != OUTSIDE_GUEST_MODE; 226 227 /* 228 * Need to kick a running VCPU, but otherwise there is nothing to do. 229 */ 230 return mode == IN_GUEST_MODE; 231 } 232 233 static void ack_flush(void *_completed) 234 { 235 } 236 237 static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait) 238 { 239 if (unlikely(!cpus)) 240 cpus = cpu_online_mask; 241 242 if (cpumask_empty(cpus)) 243 return false; 244 245 smp_call_function_many(cpus, ack_flush, NULL, wait); 246 return true; 247 } 248 249 bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, 250 unsigned long *vcpu_bitmap, cpumask_var_t tmp) 251 { 252 int i, cpu, me; 253 struct kvm_vcpu *vcpu; 254 bool called; 255 256 me = get_cpu(); 257 258 kvm_for_each_vcpu(i, vcpu, kvm) { 259 if (vcpu_bitmap && !test_bit(i, vcpu_bitmap)) 260 continue; 261 262 kvm_make_request(req, vcpu); 263 cpu = vcpu->cpu; 264 265 if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu)) 266 continue; 267 268 if (tmp != NULL && cpu != -1 && cpu != me && 269 kvm_request_needs_ipi(vcpu, req)) 270 __cpumask_set_cpu(cpu, tmp); 271 } 272 273 called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT)); 274 put_cpu(); 275 276 return called; 277 } 278 279 bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) 280 { 281 cpumask_var_t cpus; 282 bool called; 283 284 zalloc_cpumask_var(&cpus, GFP_ATOMIC); 285 286 called = kvm_make_vcpus_request_mask(kvm, req, NULL, cpus); 287 288 free_cpumask_var(cpus); 289 return called; 290 } 291 292 #ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL 293 void kvm_flush_remote_tlbs(struct kvm *kvm) 294 { 295 /* 296 * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in 297 * kvm_make_all_cpus_request. 298 */ 299 long dirty_count = smp_load_acquire(&kvm->tlbs_dirty); 300 301 /* 302 * We want to publish modifications to the page tables before reading 303 * mode. Pairs with a memory barrier in arch-specific code. 304 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest 305 * and smp_mb in walk_shadow_page_lockless_begin/end. 306 * - powerpc: smp_mb in kvmppc_prepare_to_enter. 307 * 308 * There is already an smp_mb__after_atomic() before 309 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that 310 * barrier here. 
311 */ 312 if (!kvm_arch_flush_remote_tlb(kvm) 313 || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 314 ++kvm->stat.remote_tlb_flush; 315 cmpxchg(&kvm->tlbs_dirty, dirty_count, 0); 316 } 317 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs); 318 #endif 319 320 void kvm_reload_remote_mmus(struct kvm *kvm) 321 { 322 kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 323 } 324 325 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 326 { 327 struct page *page; 328 int r; 329 330 mutex_init(&vcpu->mutex); 331 vcpu->cpu = -1; 332 vcpu->kvm = kvm; 333 vcpu->vcpu_id = id; 334 vcpu->pid = NULL; 335 init_swait_queue_head(&vcpu->wq); 336 kvm_async_pf_vcpu_init(vcpu); 337 338 vcpu->pre_pcpu = -1; 339 INIT_LIST_HEAD(&vcpu->blocked_vcpu_list); 340 341 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 342 if (!page) { 343 r = -ENOMEM; 344 goto fail; 345 } 346 vcpu->run = page_address(page); 347 348 kvm_vcpu_set_in_spin_loop(vcpu, false); 349 kvm_vcpu_set_dy_eligible(vcpu, false); 350 vcpu->preempted = false; 351 vcpu->ready = false; 352 353 r = kvm_arch_vcpu_init(vcpu); 354 if (r < 0) 355 goto fail_free_run; 356 return 0; 357 358 fail_free_run: 359 free_page((unsigned long)vcpu->run); 360 fail: 361 return r; 362 } 363 EXPORT_SYMBOL_GPL(kvm_vcpu_init); 364 365 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) 366 { 367 /* 368 * no need for rcu_read_lock as VCPU_RUN is the only place that 369 * will change the vcpu->pid pointer and on uninit all file 370 * descriptors are already gone. 371 */ 372 put_pid(rcu_dereference_protected(vcpu->pid, 1)); 373 kvm_arch_vcpu_uninit(vcpu); 374 free_page((unsigned long)vcpu->run); 375 } 376 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); 377 378 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 379 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) 380 { 381 return container_of(mn, struct kvm, mmu_notifier); 382 } 383 384 static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, 385 struct mm_struct *mm, 386 unsigned long address, 387 pte_t pte) 388 { 389 struct kvm *kvm = mmu_notifier_to_kvm(mn); 390 int idx; 391 392 idx = srcu_read_lock(&kvm->srcu); 393 spin_lock(&kvm->mmu_lock); 394 kvm->mmu_notifier_seq++; 395 396 if (kvm_set_spte_hva(kvm, address, pte)) 397 kvm_flush_remote_tlbs(kvm); 398 399 spin_unlock(&kvm->mmu_lock); 400 srcu_read_unlock(&kvm->srcu, idx); 401 } 402 403 static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, 404 const struct mmu_notifier_range *range) 405 { 406 struct kvm *kvm = mmu_notifier_to_kvm(mn); 407 int need_tlb_flush = 0, idx; 408 int ret; 409 410 idx = srcu_read_lock(&kvm->srcu); 411 spin_lock(&kvm->mmu_lock); 412 /* 413 * The count increase must become visible at unlock time as no 414 * spte can be established without taking the mmu_lock and 415 * count is also read inside the mmu_lock critical section. 
416 */ 417 kvm->mmu_notifier_count++; 418 need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end); 419 need_tlb_flush |= kvm->tlbs_dirty; 420 /* we've to flush the tlb before the pages can be freed */ 421 if (need_tlb_flush) 422 kvm_flush_remote_tlbs(kvm); 423 424 spin_unlock(&kvm->mmu_lock); 425 426 ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start, 427 range->end, 428 mmu_notifier_range_blockable(range)); 429 430 srcu_read_unlock(&kvm->srcu, idx); 431 432 return ret; 433 } 434 435 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 436 const struct mmu_notifier_range *range) 437 { 438 struct kvm *kvm = mmu_notifier_to_kvm(mn); 439 440 spin_lock(&kvm->mmu_lock); 441 /* 442 * This sequence increase will notify the kvm page fault that 443 * the page that is going to be mapped in the spte could have 444 * been freed. 445 */ 446 kvm->mmu_notifier_seq++; 447 smp_wmb(); 448 /* 449 * The above sequence increase must be visible before the 450 * below count decrease, which is ensured by the smp_wmb above 451 * in conjunction with the smp_rmb in mmu_notifier_retry(). 452 */ 453 kvm->mmu_notifier_count--; 454 spin_unlock(&kvm->mmu_lock); 455 456 BUG_ON(kvm->mmu_notifier_count < 0); 457 } 458 459 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, 460 struct mm_struct *mm, 461 unsigned long start, 462 unsigned long end) 463 { 464 struct kvm *kvm = mmu_notifier_to_kvm(mn); 465 int young, idx; 466 467 idx = srcu_read_lock(&kvm->srcu); 468 spin_lock(&kvm->mmu_lock); 469 470 young = kvm_age_hva(kvm, start, end); 471 if (young) 472 kvm_flush_remote_tlbs(kvm); 473 474 spin_unlock(&kvm->mmu_lock); 475 srcu_read_unlock(&kvm->srcu, idx); 476 477 return young; 478 } 479 480 static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, 481 struct mm_struct *mm, 482 unsigned long start, 483 unsigned long end) 484 { 485 struct kvm *kvm = mmu_notifier_to_kvm(mn); 486 int young, idx; 487 488 idx = srcu_read_lock(&kvm->srcu); 489 spin_lock(&kvm->mmu_lock); 490 /* 491 * Even though we do not flush TLB, this will still adversely 492 * affect performance on pre-Haswell Intel EPT, where there is 493 * no EPT Access Bit to clear so that we have to tear down EPT 494 * tables instead. If we find this unacceptable, we can always 495 * add a parameter to kvm_age_hva so that it effectively doesn't 496 * do anything on clear_young. 497 * 498 * Also note that currently we never issue secondary TLB flushes 499 * from clear_young, leaving this job up to the regular system 500 * cadence. If we find this inaccurate, we might come up with a 501 * more sophisticated heuristic later. 
502 */ 503 young = kvm_age_hva(kvm, start, end); 504 spin_unlock(&kvm->mmu_lock); 505 srcu_read_unlock(&kvm->srcu, idx); 506 507 return young; 508 } 509 510 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, 511 struct mm_struct *mm, 512 unsigned long address) 513 { 514 struct kvm *kvm = mmu_notifier_to_kvm(mn); 515 int young, idx; 516 517 idx = srcu_read_lock(&kvm->srcu); 518 spin_lock(&kvm->mmu_lock); 519 young = kvm_test_age_hva(kvm, address); 520 spin_unlock(&kvm->mmu_lock); 521 srcu_read_unlock(&kvm->srcu, idx); 522 523 return young; 524 } 525 526 static void kvm_mmu_notifier_release(struct mmu_notifier *mn, 527 struct mm_struct *mm) 528 { 529 struct kvm *kvm = mmu_notifier_to_kvm(mn); 530 int idx; 531 532 idx = srcu_read_lock(&kvm->srcu); 533 kvm_arch_flush_shadow_all(kvm); 534 srcu_read_unlock(&kvm->srcu, idx); 535 } 536 537 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { 538 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 539 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 540 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 541 .clear_young = kvm_mmu_notifier_clear_young, 542 .test_young = kvm_mmu_notifier_test_young, 543 .change_pte = kvm_mmu_notifier_change_pte, 544 .release = kvm_mmu_notifier_release, 545 }; 546 547 static int kvm_init_mmu_notifier(struct kvm *kvm) 548 { 549 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; 550 return mmu_notifier_register(&kvm->mmu_notifier, current->mm); 551 } 552 553 #else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */ 554 555 static int kvm_init_mmu_notifier(struct kvm *kvm) 556 { 557 return 0; 558 } 559 560 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ 561 562 static struct kvm_memslots *kvm_alloc_memslots(void) 563 { 564 int i; 565 struct kvm_memslots *slots; 566 567 slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT); 568 if (!slots) 569 return NULL; 570 571 for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) 572 slots->id_to_index[i] = slots->memslots[i].id = i; 573 574 return slots; 575 } 576 577 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot) 578 { 579 if (!memslot->dirty_bitmap) 580 return; 581 582 kvfree(memslot->dirty_bitmap); 583 memslot->dirty_bitmap = NULL; 584 } 585 586 /* 587 * Free any memory in @free but not in @dont. 
588 */ 589 static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, 590 struct kvm_memory_slot *dont) 591 { 592 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 593 kvm_destroy_dirty_bitmap(free); 594 595 kvm_arch_free_memslot(kvm, free, dont); 596 597 free->npages = 0; 598 } 599 600 static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots) 601 { 602 struct kvm_memory_slot *memslot; 603 604 if (!slots) 605 return; 606 607 kvm_for_each_memslot(memslot, slots) 608 kvm_free_memslot(kvm, memslot, NULL); 609 610 kvfree(slots); 611 } 612 613 static void kvm_destroy_vm_debugfs(struct kvm *kvm) 614 { 615 int i; 616 617 if (!kvm->debugfs_dentry) 618 return; 619 620 debugfs_remove_recursive(kvm->debugfs_dentry); 621 622 if (kvm->debugfs_stat_data) { 623 for (i = 0; i < kvm_debugfs_num_entries; i++) 624 kfree(kvm->debugfs_stat_data[i]); 625 kfree(kvm->debugfs_stat_data); 626 } 627 } 628 629 static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) 630 { 631 char dir_name[ITOA_MAX_LEN * 2]; 632 struct kvm_stat_data *stat_data; 633 struct kvm_stats_debugfs_item *p; 634 635 if (!debugfs_initialized()) 636 return 0; 637 638 snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd); 639 kvm->debugfs_dentry = debugfs_create_dir(dir_name, kvm_debugfs_dir); 640 641 kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries, 642 sizeof(*kvm->debugfs_stat_data), 643 GFP_KERNEL_ACCOUNT); 644 if (!kvm->debugfs_stat_data) 645 return -ENOMEM; 646 647 for (p = debugfs_entries; p->name; p++) { 648 stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT); 649 if (!stat_data) 650 return -ENOMEM; 651 652 stat_data->kvm = kvm; 653 stat_data->offset = p->offset; 654 stat_data->mode = p->mode ? p->mode : 0644; 655 kvm->debugfs_stat_data[p - debugfs_entries] = stat_data; 656 debugfs_create_file(p->name, stat_data->mode, kvm->debugfs_dentry, 657 stat_data, stat_fops_per_vm[p->kind]); 658 } 659 return 0; 660 } 661 662 /* 663 * Called after the VM is otherwise initialized, but just before adding it to 664 * the vm_list. 665 */ 666 int __weak kvm_arch_post_init_vm(struct kvm *kvm) 667 { 668 return 0; 669 } 670 671 /* 672 * Called just after removing the VM from the vm_list, but before doing any 673 * other destruction. 674 */ 675 void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm) 676 { 677 } 678 679 static struct kvm *kvm_create_vm(unsigned long type) 680 { 681 struct kvm *kvm = kvm_arch_alloc_vm(); 682 int r = -ENOMEM; 683 int i; 684 685 if (!kvm) 686 return ERR_PTR(-ENOMEM); 687 688 spin_lock_init(&kvm->mmu_lock); 689 mmgrab(current->mm); 690 kvm->mm = current->mm; 691 kvm_eventfd_init(kvm); 692 mutex_init(&kvm->lock); 693 mutex_init(&kvm->irq_lock); 694 mutex_init(&kvm->slots_lock); 695 INIT_LIST_HEAD(&kvm->devices); 696 697 BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); 698 699 if (init_srcu_struct(&kvm->srcu)) 700 goto out_err_no_srcu; 701 if (init_srcu_struct(&kvm->irq_srcu)) 702 goto out_err_no_irq_srcu; 703 704 refcount_set(&kvm->users_count, 1); 705 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { 706 struct kvm_memslots *slots = kvm_alloc_memslots(); 707 708 if (!slots) 709 goto out_err_no_arch_destroy_vm; 710 /* Generations must be different for each address space. 
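		 * Address space i seeds its generation with i, and
		 * install_new_memslots() advances it in steps of
		 * KVM_ADDRESS_SPACE_NUM, so the per-address-space streams
		 * never collide.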
*/ 711 slots->generation = i; 712 rcu_assign_pointer(kvm->memslots[i], slots); 713 } 714 715 for (i = 0; i < KVM_NR_BUSES; i++) { 716 rcu_assign_pointer(kvm->buses[i], 717 kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT)); 718 if (!kvm->buses[i]) 719 goto out_err_no_arch_destroy_vm; 720 } 721 722 r = kvm_arch_init_vm(kvm, type); 723 if (r) 724 goto out_err_no_arch_destroy_vm; 725 726 r = hardware_enable_all(); 727 if (r) 728 goto out_err_no_disable; 729 730 #ifdef CONFIG_HAVE_KVM_IRQFD 731 INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list); 732 #endif 733 734 r = kvm_init_mmu_notifier(kvm); 735 if (r) 736 goto out_err_no_mmu_notifier; 737 738 r = kvm_arch_post_init_vm(kvm); 739 if (r) 740 goto out_err; 741 742 mutex_lock(&kvm_lock); 743 list_add(&kvm->vm_list, &vm_list); 744 mutex_unlock(&kvm_lock); 745 746 preempt_notifier_inc(); 747 748 return kvm; 749 750 out_err: 751 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 752 if (kvm->mmu_notifier.ops) 753 mmu_notifier_unregister(&kvm->mmu_notifier, current->mm); 754 #endif 755 out_err_no_mmu_notifier: 756 hardware_disable_all(); 757 out_err_no_disable: 758 kvm_arch_destroy_vm(kvm); 759 out_err_no_arch_destroy_vm: 760 WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count)); 761 for (i = 0; i < KVM_NR_BUSES; i++) 762 kfree(kvm_get_bus(kvm, i)); 763 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 764 kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); 765 cleanup_srcu_struct(&kvm->irq_srcu); 766 out_err_no_irq_srcu: 767 cleanup_srcu_struct(&kvm->srcu); 768 out_err_no_srcu: 769 kvm_arch_free_vm(kvm); 770 mmdrop(current->mm); 771 return ERR_PTR(r); 772 } 773 774 static void kvm_destroy_devices(struct kvm *kvm) 775 { 776 struct kvm_device *dev, *tmp; 777 778 /* 779 * We do not need to take the kvm->lock here, because nobody else 780 * has a reference to the struct kvm at this point and therefore 781 * cannot access the devices list anyhow. 
782 */ 783 list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) { 784 list_del(&dev->vm_node); 785 dev->ops->destroy(dev); 786 } 787 } 788 789 static void kvm_destroy_vm(struct kvm *kvm) 790 { 791 int i; 792 struct mm_struct *mm = kvm->mm; 793 794 kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm); 795 kvm_destroy_vm_debugfs(kvm); 796 kvm_arch_sync_events(kvm); 797 mutex_lock(&kvm_lock); 798 list_del(&kvm->vm_list); 799 mutex_unlock(&kvm_lock); 800 kvm_arch_pre_destroy_vm(kvm); 801 802 kvm_free_irq_routing(kvm); 803 for (i = 0; i < KVM_NR_BUSES; i++) { 804 struct kvm_io_bus *bus = kvm_get_bus(kvm, i); 805 806 if (bus) 807 kvm_io_bus_destroy(bus); 808 kvm->buses[i] = NULL; 809 } 810 kvm_coalesced_mmio_free(kvm); 811 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 812 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 813 #else 814 kvm_arch_flush_shadow_all(kvm); 815 #endif 816 kvm_arch_destroy_vm(kvm); 817 kvm_destroy_devices(kvm); 818 for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) 819 kvm_free_memslots(kvm, __kvm_memslots(kvm, i)); 820 cleanup_srcu_struct(&kvm->irq_srcu); 821 cleanup_srcu_struct(&kvm->srcu); 822 kvm_arch_free_vm(kvm); 823 preempt_notifier_dec(); 824 hardware_disable_all(); 825 mmdrop(mm); 826 } 827 828 void kvm_get_kvm(struct kvm *kvm) 829 { 830 refcount_inc(&kvm->users_count); 831 } 832 EXPORT_SYMBOL_GPL(kvm_get_kvm); 833 834 void kvm_put_kvm(struct kvm *kvm) 835 { 836 if (refcount_dec_and_test(&kvm->users_count)) 837 kvm_destroy_vm(kvm); 838 } 839 EXPORT_SYMBOL_GPL(kvm_put_kvm); 840 841 842 static int kvm_vm_release(struct inode *inode, struct file *filp) 843 { 844 struct kvm *kvm = filp->private_data; 845 846 kvm_irqfd_release(kvm); 847 848 kvm_put_kvm(kvm); 849 return 0; 850 } 851 852 /* 853 * Allocation size is twice as large as the actual dirty bitmap size. 854 * See x86's kvm_vm_ioctl_get_dirty_log() why this is needed. 855 */ 856 static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) 857 { 858 unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot); 859 860 memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT); 861 if (!memslot->dirty_bitmap) 862 return -ENOMEM; 863 864 return 0; 865 } 866 867 /* 868 * Insert memslot and re-sort memslots based on their GFN, 869 * so binary search could be used to lookup GFN. 870 * Sorting algorithm takes advantage of having initially 871 * sorted array and known changed memslot position. 872 */ 873 static void update_memslots(struct kvm_memslots *slots, 874 struct kvm_memory_slot *new, 875 enum kvm_mr_change change) 876 { 877 int id = new->id; 878 int i = slots->id_to_index[id]; 879 struct kvm_memory_slot *mslots = slots->memslots; 880 881 WARN_ON(mslots[i].id != id); 882 switch (change) { 883 case KVM_MR_CREATE: 884 slots->used_slots++; 885 WARN_ON(mslots[i].npages || !new->npages); 886 break; 887 case KVM_MR_DELETE: 888 slots->used_slots--; 889 WARN_ON(new->npages || !mslots[i].npages); 890 break; 891 default: 892 break; 893 } 894 895 while (i < KVM_MEM_SLOTS_NUM - 1 && 896 new->base_gfn <= mslots[i + 1].base_gfn) { 897 if (!mslots[i + 1].npages) 898 break; 899 mslots[i] = mslots[i + 1]; 900 slots->id_to_index[mslots[i].id] = i; 901 i++; 902 } 903 904 /* 905 * The ">=" is needed when creating a slot with base_gfn == 0, 906 * so that it moves before all those with base_gfn == npages == 0. 
907 * 908 * On the other hand, if new->npages is zero, the above loop has 909 * already left i pointing to the beginning of the empty part of 910 * mslots, and the ">=" would move the hole backwards in this 911 * case---which is wrong. So skip the loop when deleting a slot. 912 */ 913 if (new->npages) { 914 while (i > 0 && 915 new->base_gfn >= mslots[i - 1].base_gfn) { 916 mslots[i] = mslots[i - 1]; 917 slots->id_to_index[mslots[i].id] = i; 918 i--; 919 } 920 } else 921 WARN_ON_ONCE(i != slots->used_slots); 922 923 mslots[i] = *new; 924 slots->id_to_index[mslots[i].id] = i; 925 } 926 927 static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem) 928 { 929 u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES; 930 931 #ifdef __KVM_HAVE_READONLY_MEM 932 valid_flags |= KVM_MEM_READONLY; 933 #endif 934 935 if (mem->flags & ~valid_flags) 936 return -EINVAL; 937 938 return 0; 939 } 940 941 static struct kvm_memslots *install_new_memslots(struct kvm *kvm, 942 int as_id, struct kvm_memslots *slots) 943 { 944 struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id); 945 u64 gen = old_memslots->generation; 946 947 WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); 948 slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; 949 950 rcu_assign_pointer(kvm->memslots[as_id], slots); 951 synchronize_srcu_expedited(&kvm->srcu); 952 953 /* 954 * Increment the new memslot generation a second time, dropping the 955 * update in-progress flag and incrementing then generation based on 956 * the number of address spaces. This provides a unique and easily 957 * identifiable generation number while the memslots are in flux. 958 */ 959 gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; 960 961 /* 962 * Generations must be unique even across address spaces. We do not need 963 * a global counter for that, instead the generation space is evenly split 964 * across address spaces. For example, with two address spaces, address 965 * space 0 will use generations 0, 2, 4, ... while address space 1 will 966 * use generations 1, 3, 5, ... 967 */ 968 gen += KVM_ADDRESS_SPACE_NUM; 969 970 kvm_arch_memslots_updated(kvm, gen); 971 972 slots->generation = gen; 973 974 return old_memslots; 975 } 976 977 /* 978 * Allocate some memory and give it an address in the guest physical address 979 * space. 980 * 981 * Discontiguous memory is allowed, mostly for framebuffers. 982 * 983 * Must be called holding kvm->slots_lock for write. 984 */ 985 int __kvm_set_memory_region(struct kvm *kvm, 986 const struct kvm_userspace_memory_region *mem) 987 { 988 int r; 989 gfn_t base_gfn; 990 unsigned long npages; 991 struct kvm_memory_slot *slot; 992 struct kvm_memory_slot old, new; 993 struct kvm_memslots *slots = NULL, *old_memslots; 994 int as_id, id; 995 enum kvm_mr_change change; 996 997 r = check_memory_region_flags(mem); 998 if (r) 999 goto out; 1000 1001 r = -EINVAL; 1002 as_id = mem->slot >> 16; 1003 id = (u16)mem->slot; 1004 1005 /* General sanity checks */ 1006 if (mem->memory_size & (PAGE_SIZE - 1)) 1007 goto out; 1008 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 1009 goto out; 1010 /* We can read the guest memory with __xxx_user() later on. 
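	 * That is why the whole userspace range is validated with access_ok()
	 * up front here rather than at access time.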
*/ 1011 if ((id < KVM_USER_MEM_SLOTS) && 1012 ((mem->userspace_addr & (PAGE_SIZE - 1)) || 1013 !access_ok((void __user *)(unsigned long)mem->userspace_addr, 1014 mem->memory_size))) 1015 goto out; 1016 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM) 1017 goto out; 1018 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 1019 goto out; 1020 1021 slot = id_to_memslot(__kvm_memslots(kvm, as_id), id); 1022 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 1023 npages = mem->memory_size >> PAGE_SHIFT; 1024 1025 if (npages > KVM_MEM_MAX_NR_PAGES) 1026 goto out; 1027 1028 new = old = *slot; 1029 1030 new.id = id; 1031 new.base_gfn = base_gfn; 1032 new.npages = npages; 1033 new.flags = mem->flags; 1034 1035 if (npages) { 1036 if (!old.npages) 1037 change = KVM_MR_CREATE; 1038 else { /* Modify an existing slot. */ 1039 if ((mem->userspace_addr != old.userspace_addr) || 1040 (npages != old.npages) || 1041 ((new.flags ^ old.flags) & KVM_MEM_READONLY)) 1042 goto out; 1043 1044 if (base_gfn != old.base_gfn) 1045 change = KVM_MR_MOVE; 1046 else if (new.flags != old.flags) 1047 change = KVM_MR_FLAGS_ONLY; 1048 else { /* Nothing to change. */ 1049 r = 0; 1050 goto out; 1051 } 1052 } 1053 } else { 1054 if (!old.npages) 1055 goto out; 1056 1057 change = KVM_MR_DELETE; 1058 new.base_gfn = 0; 1059 new.flags = 0; 1060 } 1061 1062 if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { 1063 /* Check for overlaps */ 1064 r = -EEXIST; 1065 kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) { 1066 if (slot->id == id) 1067 continue; 1068 if (!((base_gfn + npages <= slot->base_gfn) || 1069 (base_gfn >= slot->base_gfn + slot->npages))) 1070 goto out; 1071 } 1072 } 1073 1074 /* Free page dirty bitmap if unneeded */ 1075 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 1076 new.dirty_bitmap = NULL; 1077 1078 r = -ENOMEM; 1079 if (change == KVM_MR_CREATE) { 1080 new.userspace_addr = mem->userspace_addr; 1081 1082 if (kvm_arch_create_memslot(kvm, &new, npages)) 1083 goto out_free; 1084 } 1085 1086 /* Allocate page dirty bitmap if needed */ 1087 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 1088 if (kvm_create_dirty_bitmap(&new) < 0) 1089 goto out_free; 1090 } 1091 1092 slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT); 1093 if (!slots) 1094 goto out_free; 1095 memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots)); 1096 1097 if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) { 1098 slot = id_to_memslot(slots, id); 1099 slot->flags |= KVM_MEMSLOT_INVALID; 1100 1101 old_memslots = install_new_memslots(kvm, as_id, slots); 1102 1103 /* From this point no new shadow pages pointing to a deleted, 1104 * or moved, memslot will be created. 1105 * 1106 * validation of sp->gfn happens in: 1107 * - gfn_to_hva (kvm_read_guest, gfn_to_pfn) 1108 * - kvm_is_visible_gfn (mmu_check_roots) 1109 */ 1110 kvm_arch_flush_shadow_memslot(kvm, slot); 1111 1112 /* 1113 * We can re-use the old_memslots from above, the only difference 1114 * from the currently installed memslots is the invalid flag. This 1115 * will get overwritten by update_memslots anyway. 
1116 */ 1117 slots = old_memslots; 1118 } 1119 1120 r = kvm_arch_prepare_memory_region(kvm, &new, mem, change); 1121 if (r) 1122 goto out_slots; 1123 1124 /* actual memory is freed via old in kvm_free_memslot below */ 1125 if (change == KVM_MR_DELETE) { 1126 new.dirty_bitmap = NULL; 1127 memset(&new.arch, 0, sizeof(new.arch)); 1128 } 1129 1130 update_memslots(slots, &new, change); 1131 old_memslots = install_new_memslots(kvm, as_id, slots); 1132 1133 kvm_arch_commit_memory_region(kvm, mem, &old, &new, change); 1134 1135 kvm_free_memslot(kvm, &old, &new); 1136 kvfree(old_memslots); 1137 return 0; 1138 1139 out_slots: 1140 kvfree(slots); 1141 out_free: 1142 kvm_free_memslot(kvm, &new, &old); 1143 out: 1144 return r; 1145 } 1146 EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 1147 1148 int kvm_set_memory_region(struct kvm *kvm, 1149 const struct kvm_userspace_memory_region *mem) 1150 { 1151 int r; 1152 1153 mutex_lock(&kvm->slots_lock); 1154 r = __kvm_set_memory_region(kvm, mem); 1155 mutex_unlock(&kvm->slots_lock); 1156 return r; 1157 } 1158 EXPORT_SYMBOL_GPL(kvm_set_memory_region); 1159 1160 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 1161 struct kvm_userspace_memory_region *mem) 1162 { 1163 if ((u16)mem->slot >= KVM_USER_MEM_SLOTS) 1164 return -EINVAL; 1165 1166 return kvm_set_memory_region(kvm, mem); 1167 } 1168 1169 int kvm_get_dirty_log(struct kvm *kvm, 1170 struct kvm_dirty_log *log, int *is_dirty) 1171 { 1172 struct kvm_memslots *slots; 1173 struct kvm_memory_slot *memslot; 1174 int i, as_id, id; 1175 unsigned long n; 1176 unsigned long any = 0; 1177 1178 as_id = log->slot >> 16; 1179 id = (u16)log->slot; 1180 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1181 return -EINVAL; 1182 1183 slots = __kvm_memslots(kvm, as_id); 1184 memslot = id_to_memslot(slots, id); 1185 if (!memslot->dirty_bitmap) 1186 return -ENOENT; 1187 1188 n = kvm_dirty_bitmap_bytes(memslot); 1189 1190 for (i = 0; !any && i < n/sizeof(long); ++i) 1191 any = memslot->dirty_bitmap[i]; 1192 1193 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 1194 return -EFAULT; 1195 1196 if (any) 1197 *is_dirty = 1; 1198 return 0; 1199 } 1200 EXPORT_SYMBOL_GPL(kvm_get_dirty_log); 1201 1202 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 1203 /** 1204 * kvm_get_dirty_log_protect - get a snapshot of dirty pages 1205 * and reenable dirty page tracking for the corresponding pages. 1206 * @kvm: pointer to kvm instance 1207 * @log: slot id and address to which we copy the log 1208 * @flush: true if TLB flush is needed by caller 1209 * 1210 * We need to keep it in mind that VCPU threads can write to the bitmap 1211 * concurrently. So, to avoid losing track of dirty pages we keep the 1212 * following order: 1213 * 1214 * 1. Take a snapshot of the bit and clear it if needed. 1215 * 2. Write protect the corresponding page. 1216 * 3. Copy the snapshot to the userspace. 1217 * 4. Upon return caller flushes TLB's if needed. 1218 * 1219 * Between 2 and 4, the guest may write to the page using the remaining TLB 1220 * entry. This is not a problem because the page is reported dirty using 1221 * the snapshot taken before and step 4 ensures that writes done after 1222 * exiting to userspace will be logged for the next call. 
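 *
 * For illustration only: userspace typically reaches this path through the
 * KVM_GET_DIRTY_LOG vm ioctl, roughly
 *
 *	struct kvm_dirty_log log = { .slot = slot, .dirty_bitmap = buf };
 *	ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
 *
 * where slot, buf and vm_fd are supplied by the VMM.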
1223 * 1224 */ 1225 int kvm_get_dirty_log_protect(struct kvm *kvm, 1226 struct kvm_dirty_log *log, bool *flush) 1227 { 1228 struct kvm_memslots *slots; 1229 struct kvm_memory_slot *memslot; 1230 int i, as_id, id; 1231 unsigned long n; 1232 unsigned long *dirty_bitmap; 1233 unsigned long *dirty_bitmap_buffer; 1234 1235 as_id = log->slot >> 16; 1236 id = (u16)log->slot; 1237 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1238 return -EINVAL; 1239 1240 slots = __kvm_memslots(kvm, as_id); 1241 memslot = id_to_memslot(slots, id); 1242 1243 dirty_bitmap = memslot->dirty_bitmap; 1244 if (!dirty_bitmap) 1245 return -ENOENT; 1246 1247 n = kvm_dirty_bitmap_bytes(memslot); 1248 *flush = false; 1249 if (kvm->manual_dirty_log_protect) { 1250 /* 1251 * Unlike kvm_get_dirty_log, we always return false in *flush, 1252 * because no flush is needed until KVM_CLEAR_DIRTY_LOG. There 1253 * is some code duplication between this function and 1254 * kvm_get_dirty_log, but hopefully all architecture 1255 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log 1256 * can be eliminated. 1257 */ 1258 dirty_bitmap_buffer = dirty_bitmap; 1259 } else { 1260 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); 1261 memset(dirty_bitmap_buffer, 0, n); 1262 1263 spin_lock(&kvm->mmu_lock); 1264 for (i = 0; i < n / sizeof(long); i++) { 1265 unsigned long mask; 1266 gfn_t offset; 1267 1268 if (!dirty_bitmap[i]) 1269 continue; 1270 1271 *flush = true; 1272 mask = xchg(&dirty_bitmap[i], 0); 1273 dirty_bitmap_buffer[i] = mask; 1274 1275 offset = i * BITS_PER_LONG; 1276 kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, 1277 offset, mask); 1278 } 1279 spin_unlock(&kvm->mmu_lock); 1280 } 1281 1282 if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n)) 1283 return -EFAULT; 1284 return 0; 1285 } 1286 EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect); 1287 1288 /** 1289 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap 1290 * and reenable dirty page tracking for the corresponding pages. 
1291 * @kvm: pointer to kvm instance 1292 * @log: slot id and address from which to fetch the bitmap of dirty pages 1293 * @flush: true if TLB flush is needed by caller 1294 */ 1295 int kvm_clear_dirty_log_protect(struct kvm *kvm, 1296 struct kvm_clear_dirty_log *log, bool *flush) 1297 { 1298 struct kvm_memslots *slots; 1299 struct kvm_memory_slot *memslot; 1300 int as_id, id; 1301 gfn_t offset; 1302 unsigned long i, n; 1303 unsigned long *dirty_bitmap; 1304 unsigned long *dirty_bitmap_buffer; 1305 1306 as_id = log->slot >> 16; 1307 id = (u16)log->slot; 1308 if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS) 1309 return -EINVAL; 1310 1311 if (log->first_page & 63) 1312 return -EINVAL; 1313 1314 slots = __kvm_memslots(kvm, as_id); 1315 memslot = id_to_memslot(slots, id); 1316 1317 dirty_bitmap = memslot->dirty_bitmap; 1318 if (!dirty_bitmap) 1319 return -ENOENT; 1320 1321 n = ALIGN(log->num_pages, BITS_PER_LONG) / 8; 1322 1323 if (log->first_page > memslot->npages || 1324 log->num_pages > memslot->npages - log->first_page || 1325 (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63))) 1326 return -EINVAL; 1327 1328 *flush = false; 1329 dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot); 1330 if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n)) 1331 return -EFAULT; 1332 1333 spin_lock(&kvm->mmu_lock); 1334 for (offset = log->first_page, i = offset / BITS_PER_LONG, 1335 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--; 1336 i++, offset += BITS_PER_LONG) { 1337 unsigned long mask = *dirty_bitmap_buffer++; 1338 atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i]; 1339 if (!mask) 1340 continue; 1341 1342 mask &= atomic_long_fetch_andnot(mask, p); 1343 1344 /* 1345 * mask contains the bits that really have been cleared. This 1346 * never includes any bits beyond the length of the memslot (if 1347 * the length is not aligned to 64 pages), therefore it is not 1348 * a problem if userspace sets them in log->dirty_bitmap. 
		 */
		if (mask) {
			*flush = true;
			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
								offset, mask);
		}
	}
	spin_unlock(&kvm->mmu_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_clear_dirty_log_protect);
#endif

bool kvm_largepages_enabled(void)
{
	return largepages_enabled;
}

void kvm_disable_largepages(void)
{
	largepages_enabled = false;
}
EXPORT_SYMBOL_GPL(kvm_disable_largepages);

struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
}
EXPORT_SYMBOL_GPL(gfn_to_memslot);

struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
}

bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);

	if (!memslot || memslot->id >= KVM_USER_MEM_SLOTS ||
	    memslot->flags & KVM_MEMSLOT_INVALID)
		return false;

	return true;
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);

unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
{
	struct vm_area_struct *vma;
	unsigned long addr, size;

	size = PAGE_SIZE;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return PAGE_SIZE;

	down_read(&current->mm->mmap_sem);
	vma = find_vma(current->mm, addr);
	if (!vma)
		goto out;

	size = vma_kernel_pagesize(vma);

out:
	up_read(&current->mm->mmap_sem);

	return size;
}

static bool memslot_is_readonly(struct kvm_memory_slot *slot)
{
	return slot->flags & KVM_MEM_READONLY;
}

static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
				       gfn_t *nr_pages, bool write)
{
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return KVM_HVA_ERR_BAD;

	if (memslot_is_readonly(slot) && write)
		return KVM_HVA_ERR_RO_BAD;

	if (nr_pages)
		*nr_pages = slot->npages - (gfn - slot->base_gfn);

	return __gfn_to_hva_memslot(slot, gfn);
}

static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
				     gfn_t *nr_pages)
{
	return __gfn_to_hva_many(slot, gfn, nr_pages, true);
}

unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
				 gfn_t gfn)
{
	return gfn_to_hva_many(slot, gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);

unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);

unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
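
/*
 * Illustrative sketch only, not part of the KVM API: a hypothetical caller
 * of the helpers above would translate a gfn to a host virtual address and
 * check it with kvm_is_error_hva() before touching the memory, much like
 * __kvm_read_guest_page() does further down in this file.  The helper below
 * reads the first byte of guest frame @gfn; its name is made up for
 * illustration and the block is compiled out.
 */
#if 0
static int example_peek_guest_byte(struct kvm *kvm, gfn_t gfn, u8 *val)
{
	unsigned long hva = gfn_to_hva(kvm, gfn);

	if (kvm_is_error_hva(hva))
		return -EFAULT;

	/* hva is a userspace address in the VMM's mm, so use uaccess. */
	return __copy_from_user(val, (void __user *)hva, 1) ? -EFAULT : 0;
}
#endif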
1468 * 1469 * @slot: the kvm_memory_slot which contains @gfn 1470 * @gfn: the gfn to be translated 1471 * @writable: used to return the read/write attribute of the @slot if the hva 1472 * is valid and @writable is not NULL 1473 */ 1474 unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot, 1475 gfn_t gfn, bool *writable) 1476 { 1477 unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false); 1478 1479 if (!kvm_is_error_hva(hva) && writable) 1480 *writable = !memslot_is_readonly(slot); 1481 1482 return hva; 1483 } 1484 1485 unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable) 1486 { 1487 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 1488 1489 return gfn_to_hva_memslot_prot(slot, gfn, writable); 1490 } 1491 1492 unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable) 1493 { 1494 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1495 1496 return gfn_to_hva_memslot_prot(slot, gfn, writable); 1497 } 1498 1499 static inline int check_user_page_hwpoison(unsigned long addr) 1500 { 1501 int rc, flags = FOLL_HWPOISON | FOLL_WRITE; 1502 1503 rc = get_user_pages(addr, 1, flags, NULL, NULL); 1504 return rc == -EHWPOISON; 1505 } 1506 1507 /* 1508 * The fast path to get the writable pfn which will be stored in @pfn, 1509 * true indicates success, otherwise false is returned. It's also the 1510 * only part that runs if we can are in atomic context. 1511 */ 1512 static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, 1513 bool *writable, kvm_pfn_t *pfn) 1514 { 1515 struct page *page[1]; 1516 int npages; 1517 1518 /* 1519 * Fast pin a writable pfn only if it is a write fault request 1520 * or the caller allows to map a writable pfn for a read fault 1521 * request. 1522 */ 1523 if (!(write_fault || writable)) 1524 return false; 1525 1526 npages = __get_user_pages_fast(addr, 1, 1, page); 1527 if (npages == 1) { 1528 *pfn = page_to_pfn(page[0]); 1529 1530 if (writable) 1531 *writable = true; 1532 return true; 1533 } 1534 1535 return false; 1536 } 1537 1538 /* 1539 * The slow path to get the pfn of the specified host virtual address, 1540 * 1 indicates success, -errno is returned if error is detected. 
 */
static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
			   bool *writable, kvm_pfn_t *pfn)
{
	unsigned int flags = FOLL_HWPOISON;
	struct page *page;
	int npages = 0;

	might_sleep();

	if (writable)
		*writable = write_fault;

	if (write_fault)
		flags |= FOLL_WRITE;
	if (async)
		flags |= FOLL_NOWAIT;

	npages = get_user_pages_unlocked(addr, 1, &page, flags);
	if (npages != 1)
		return npages;

	/* map read fault as writable if possible */
	if (unlikely(!write_fault) && writable) {
		struct page *wpage;

		if (__get_user_pages_fast(addr, 1, 1, &wpage) == 1) {
			*writable = true;
			put_page(page);
			page = wpage;
		}
	}
	*pfn = page_to_pfn(page);
	return npages;
}

static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
{
	if (unlikely(!(vma->vm_flags & VM_READ)))
		return false;

	if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
		return false;

	return true;
}

static int hva_to_pfn_remapped(struct vm_area_struct *vma,
			       unsigned long addr, bool *async,
			       bool write_fault, bool *writable,
			       kvm_pfn_t *p_pfn)
{
	unsigned long pfn;
	int r;

	r = follow_pfn(vma, addr, &pfn);
	if (r) {
		/*
		 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
		 * not call the fault handler, so do it here.
		 */
		bool unlocked = false;
		r = fixup_user_fault(current, current->mm, addr,
				     (write_fault ? FAULT_FLAG_WRITE : 0),
				     &unlocked);
		if (unlocked)
			return -EAGAIN;
		if (r)
			return r;

		r = follow_pfn(vma, addr, &pfn);
		if (r)
			return r;

	}

	if (writable)
		*writable = true;

	/*
	 * Get a reference here because callers of *hva_to_pfn* and
	 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
	 * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
	 * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
	 * simply do nothing for reserved pfns.
	 *
	 * Whoever called remap_pfn_range is also going to call e.g.
	 * unmap_mapping_range before the underlying pages are freed,
	 * causing a call to our MMU notifier.
	 */
	kvm_get_pfn(pfn);

	*p_pfn = pfn;
	return 0;
}

/*
 * Pin guest page in memory and return its pfn.
 * @addr: host virtual address which maps memory to the guest
 * @atomic: whether the call is made from a context that cannot sleep
 * @async: if non-NULL, don't wait for IO when the host page is not in
 *         memory; *async may be set to true instead
 * @write_fault: whether we should get a writable host page
 * @writable: whether it allows to map a writable host page for !@write_fault
 *
 * The function will map a writable host page for these two cases:
 * 1): @write_fault = true
 * 2): @write_fault = false && @writable, @writable will tell the caller
 *     whether the mapping is writable.
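 *
 * The lookup order below is hva_to_pfn_fast() first (no sleeping), then
 * hva_to_pfn_slow(), and finally, under mmap_sem, hva_to_pfn_remapped()
 * for VM_IO/VM_PFNMAP mappings.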
 */
static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
			    bool write_fault, bool *writable)
{
	struct vm_area_struct *vma;
	kvm_pfn_t pfn = 0;
	int npages, r;

	/* we can do it either atomically or asynchronously, not both */
	BUG_ON(atomic && async);

	if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
		return pfn;

	if (atomic)
		return KVM_PFN_ERR_FAULT;

	npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn);
	if (npages == 1)
		return pfn;

	down_read(&current->mm->mmap_sem);
	if (npages == -EHWPOISON ||
	      (!async && check_user_page_hwpoison(addr))) {
		pfn = KVM_PFN_ERR_HWPOISON;
		goto exit;
	}

retry:
	vma = find_vma_intersection(current->mm, addr, addr + 1);

	if (vma == NULL)
		pfn = KVM_PFN_ERR_FAULT;
	else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
		r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn);
		if (r == -EAGAIN)
			goto retry;
		if (r < 0)
			pfn = KVM_PFN_ERR_FAULT;
	} else {
		if (async && vma_is_valid(vma, write_fault))
			*async = true;
		pfn = KVM_PFN_ERR_FAULT;
	}
exit:
	up_read(&current->mm->mmap_sem);
	return pfn;
}

kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn,
			       bool atomic, bool *async, bool write_fault,
			       bool *writable)
{
	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault);

	if (addr == KVM_HVA_ERR_RO_BAD) {
		if (writable)
			*writable = false;
		return KVM_PFN_ERR_RO_FAULT;
	}

	if (kvm_is_error_hva(addr)) {
		if (writable)
			*writable = false;
		return KVM_PFN_NOSLOT;
	}

	/* Do not map writable pfn in the readonly memslot.
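	 * Report it as read-only via *writable and clear the writable pointer
	 * so that hva_to_pfn() will not opportunistically map the page
	 * writable either.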
*/ 1718 if (writable && memslot_is_readonly(slot)) { 1719 *writable = false; 1720 writable = NULL; 1721 } 1722 1723 return hva_to_pfn(addr, atomic, async, write_fault, 1724 writable); 1725 } 1726 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); 1727 1728 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, 1729 bool *writable) 1730 { 1731 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, 1732 write_fault, writable); 1733 } 1734 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 1735 1736 kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) 1737 { 1738 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); 1739 } 1740 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); 1741 1742 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) 1743 { 1744 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); 1745 } 1746 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); 1747 1748 kvm_pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn) 1749 { 1750 return gfn_to_pfn_memslot_atomic(gfn_to_memslot(kvm, gfn), gfn); 1751 } 1752 EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic); 1753 1754 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn) 1755 { 1756 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 1757 } 1758 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic); 1759 1760 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 1761 { 1762 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); 1763 } 1764 EXPORT_SYMBOL_GPL(gfn_to_pfn); 1765 1766 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) 1767 { 1768 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 1769 } 1770 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn); 1771 1772 int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 1773 struct page **pages, int nr_pages) 1774 { 1775 unsigned long addr; 1776 gfn_t entry = 0; 1777 1778 addr = gfn_to_hva_many(slot, gfn, &entry); 1779 if (kvm_is_error_hva(addr)) 1780 return -1; 1781 1782 if (entry < nr_pages) 1783 return 0; 1784 1785 return __get_user_pages_fast(addr, nr_pages, 1, pages); 1786 } 1787 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 1788 1789 static struct page *kvm_pfn_to_page(kvm_pfn_t pfn) 1790 { 1791 if (is_error_noslot_pfn(pfn)) 1792 return KVM_ERR_PTR_BAD_PAGE; 1793 1794 if (kvm_is_reserved_pfn(pfn)) { 1795 WARN_ON(1); 1796 return KVM_ERR_PTR_BAD_PAGE; 1797 } 1798 1799 return pfn_to_page(pfn); 1800 } 1801 1802 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1803 { 1804 kvm_pfn_t pfn; 1805 1806 pfn = gfn_to_pfn(kvm, gfn); 1807 1808 return kvm_pfn_to_page(pfn); 1809 } 1810 EXPORT_SYMBOL_GPL(gfn_to_page); 1811 1812 static int __kvm_map_gfn(struct kvm_memory_slot *slot, gfn_t gfn, 1813 struct kvm_host_map *map) 1814 { 1815 kvm_pfn_t pfn; 1816 void *hva = NULL; 1817 struct page *page = KVM_UNMAPPED_PAGE; 1818 1819 if (!map) 1820 return -EINVAL; 1821 1822 pfn = gfn_to_pfn_memslot(slot, gfn); 1823 if (is_error_noslot_pfn(pfn)) 1824 return -EINVAL; 1825 1826 if (pfn_valid(pfn)) { 1827 page = pfn_to_page(pfn); 1828 hva = kmap(page); 1829 #ifdef CONFIG_HAS_IOMEM 1830 } else { 1831 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB); 1832 #endif 1833 } 1834 1835 if (!hva) 1836 return -EFAULT; 1837 1838 map->page = page; 1839 map->hva = hva; 1840 map->pfn = pfn; 1841 map->gfn = gfn; 1842 1843 return 0; 1844 } 1845 1846 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) 1847 { 1848 return __kvm_map_gfn(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, map); 1849 } 1850 
EXPORT_SYMBOL_GPL(kvm_vcpu_map); 1851 1852 void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, 1853 bool dirty) 1854 { 1855 if (!map) 1856 return; 1857 1858 if (!map->hva) 1859 return; 1860 1861 if (map->page != KVM_UNMAPPED_PAGE) 1862 kunmap(map->page); 1863 #ifdef CONFIG_HAS_IOMEM 1864 else 1865 memunmap(map->hva); 1866 #endif 1867 1868 if (dirty) { 1869 kvm_vcpu_mark_page_dirty(vcpu, map->gfn); 1870 kvm_release_pfn_dirty(map->pfn); 1871 } else { 1872 kvm_release_pfn_clean(map->pfn); 1873 } 1874 1875 map->hva = NULL; 1876 map->page = NULL; 1877 } 1878 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); 1879 1880 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn) 1881 { 1882 kvm_pfn_t pfn; 1883 1884 pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn); 1885 1886 return kvm_pfn_to_page(pfn); 1887 } 1888 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page); 1889 1890 void kvm_release_page_clean(struct page *page) 1891 { 1892 WARN_ON(is_error_page(page)); 1893 1894 kvm_release_pfn_clean(page_to_pfn(page)); 1895 } 1896 EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1897 1898 void kvm_release_pfn_clean(kvm_pfn_t pfn) 1899 { 1900 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) 1901 put_page(pfn_to_page(pfn)); 1902 } 1903 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 1904 1905 void kvm_release_page_dirty(struct page *page) 1906 { 1907 WARN_ON(is_error_page(page)); 1908 1909 kvm_release_pfn_dirty(page_to_pfn(page)); 1910 } 1911 EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1912 1913 void kvm_release_pfn_dirty(kvm_pfn_t pfn) 1914 { 1915 kvm_set_pfn_dirty(pfn); 1916 kvm_release_pfn_clean(pfn); 1917 } 1918 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); 1919 1920 void kvm_set_pfn_dirty(kvm_pfn_t pfn) 1921 { 1922 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) { 1923 struct page *page = pfn_to_page(pfn); 1924 1925 SetPageDirty(page); 1926 } 1927 } 1928 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 1929 1930 void kvm_set_pfn_accessed(kvm_pfn_t pfn) 1931 { 1932 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) 1933 mark_page_accessed(pfn_to_page(pfn)); 1934 } 1935 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 1936 1937 void kvm_get_pfn(kvm_pfn_t pfn) 1938 { 1939 if (!kvm_is_reserved_pfn(pfn)) 1940 get_page(pfn_to_page(pfn)); 1941 } 1942 EXPORT_SYMBOL_GPL(kvm_get_pfn); 1943 1944 static int next_segment(unsigned long len, int offset) 1945 { 1946 if (len > PAGE_SIZE - offset) 1947 return PAGE_SIZE - offset; 1948 else 1949 return len; 1950 } 1951 1952 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, 1953 void *data, int offset, int len) 1954 { 1955 int r; 1956 unsigned long addr; 1957 1958 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 1959 if (kvm_is_error_hva(addr)) 1960 return -EFAULT; 1961 r = __copy_from_user(data, (void __user *)addr + offset, len); 1962 if (r) 1963 return -EFAULT; 1964 return 0; 1965 } 1966 1967 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 1968 int len) 1969 { 1970 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 1971 1972 return __kvm_read_guest_page(slot, gfn, data, offset, len); 1973 } 1974 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 1975 1976 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, 1977 int offset, int len) 1978 { 1979 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 1980 1981 return __kvm_read_guest_page(slot, gfn, data, offset, len); 1982 } 1983 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page); 1984 1985 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned 
long len) 1986 { 1987 gfn_t gfn = gpa >> PAGE_SHIFT; 1988 int seg; 1989 int offset = offset_in_page(gpa); 1990 int ret; 1991 1992 while ((seg = next_segment(len, offset)) != 0) { 1993 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 1994 if (ret < 0) 1995 return ret; 1996 offset = 0; 1997 len -= seg; 1998 data += seg; 1999 ++gfn; 2000 } 2001 return 0; 2002 } 2003 EXPORT_SYMBOL_GPL(kvm_read_guest); 2004 2005 int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) 2006 { 2007 gfn_t gfn = gpa >> PAGE_SHIFT; 2008 int seg; 2009 int offset = offset_in_page(gpa); 2010 int ret; 2011 2012 while ((seg = next_segment(len, offset)) != 0) { 2013 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg); 2014 if (ret < 0) 2015 return ret; 2016 offset = 0; 2017 len -= seg; 2018 data += seg; 2019 ++gfn; 2020 } 2021 return 0; 2022 } 2023 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest); 2024 2025 static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 2026 void *data, int offset, unsigned long len) 2027 { 2028 int r; 2029 unsigned long addr; 2030 2031 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 2032 if (kvm_is_error_hva(addr)) 2033 return -EFAULT; 2034 pagefault_disable(); 2035 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 2036 pagefault_enable(); 2037 if (r) 2038 return -EFAULT; 2039 return 0; 2040 } 2041 2042 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 2043 unsigned long len) 2044 { 2045 gfn_t gfn = gpa >> PAGE_SHIFT; 2046 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 2047 int offset = offset_in_page(gpa); 2048 2049 return __kvm_read_guest_atomic(slot, gfn, data, offset, len); 2050 } 2051 EXPORT_SYMBOL_GPL(kvm_read_guest_atomic); 2052 2053 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, 2054 void *data, unsigned long len) 2055 { 2056 gfn_t gfn = gpa >> PAGE_SHIFT; 2057 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2058 int offset = offset_in_page(gpa); 2059 2060 return __kvm_read_guest_atomic(slot, gfn, data, offset, len); 2061 } 2062 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); 2063 2064 static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn, 2065 const void *data, int offset, int len) 2066 { 2067 int r; 2068 unsigned long addr; 2069 2070 addr = gfn_to_hva_memslot(memslot, gfn); 2071 if (kvm_is_error_hva(addr)) 2072 return -EFAULT; 2073 r = __copy_to_user((void __user *)addr + offset, data, len); 2074 if (r) 2075 return -EFAULT; 2076 mark_page_dirty_in_slot(memslot, gfn); 2077 return 0; 2078 } 2079 2080 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, 2081 const void *data, int offset, int len) 2082 { 2083 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 2084 2085 return __kvm_write_guest_page(slot, gfn, data, offset, len); 2086 } 2087 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 2088 2089 int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, 2090 const void *data, int offset, int len) 2091 { 2092 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2093 2094 return __kvm_write_guest_page(slot, gfn, data, offset, len); 2095 } 2096 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page); 2097 2098 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 2099 unsigned long len) 2100 { 2101 gfn_t gfn = gpa >> PAGE_SHIFT; 2102 int seg; 2103 int offset = offset_in_page(gpa); 2104 int ret; 2105 2106 while ((seg = next_segment(len, offset)) != 0) { 2107 ret = kvm_write_guest_page(kvm, gfn, data, offset, 
seg); 2108 if (ret < 0) 2109 return ret; 2110 offset = 0; 2111 len -= seg; 2112 data += seg; 2113 ++gfn; 2114 } 2115 return 0; 2116 } 2117 EXPORT_SYMBOL_GPL(kvm_write_guest); 2118 2119 int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, 2120 unsigned long len) 2121 { 2122 gfn_t gfn = gpa >> PAGE_SHIFT; 2123 int seg; 2124 int offset = offset_in_page(gpa); 2125 int ret; 2126 2127 while ((seg = next_segment(len, offset)) != 0) { 2128 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg); 2129 if (ret < 0) 2130 return ret; 2131 offset = 0; 2132 len -= seg; 2133 data += seg; 2134 ++gfn; 2135 } 2136 return 0; 2137 } 2138 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); 2139 2140 static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, 2141 struct gfn_to_hva_cache *ghc, 2142 gpa_t gpa, unsigned long len) 2143 { 2144 int offset = offset_in_page(gpa); 2145 gfn_t start_gfn = gpa >> PAGE_SHIFT; 2146 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; 2147 gfn_t nr_pages_needed = end_gfn - start_gfn + 1; 2148 gfn_t nr_pages_avail; 2149 int r = start_gfn <= end_gfn ? 0 : -EINVAL; 2150 2151 ghc->gpa = gpa; 2152 ghc->generation = slots->generation; 2153 ghc->len = len; 2154 ghc->hva = KVM_HVA_ERR_BAD; 2155 2156 /* 2157 * If the requested region crosses two memslots, we still 2158 * verify that the entire region is valid here. 2159 */ 2160 while (!r && start_gfn <= end_gfn) { 2161 ghc->memslot = __gfn_to_memslot(slots, start_gfn); 2162 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, 2163 &nr_pages_avail); 2164 if (kvm_is_error_hva(ghc->hva)) 2165 r = -EFAULT; 2166 start_gfn += nr_pages_avail; 2167 } 2168 2169 /* Use the slow path for cross page reads and writes. */ 2170 if (!r && nr_pages_needed == 1) 2171 ghc->hva += offset; 2172 else 2173 ghc->memslot = NULL; 2174 2175 return r; 2176 } 2177 2178 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2179 gpa_t gpa, unsigned long len) 2180 { 2181 struct kvm_memslots *slots = kvm_memslots(kvm); 2182 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); 2183 } 2184 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); 2185 2186 int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2187 void *data, unsigned int offset, 2188 unsigned long len) 2189 { 2190 struct kvm_memslots *slots = kvm_memslots(kvm); 2191 int r; 2192 gpa_t gpa = ghc->gpa + offset; 2193 2194 BUG_ON(len + offset > ghc->len); 2195 2196 if (slots->generation != ghc->generation) 2197 __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); 2198 2199 if (unlikely(!ghc->memslot)) 2200 return kvm_write_guest(kvm, gpa, data, len); 2201 2202 if (kvm_is_error_hva(ghc->hva)) 2203 return -EFAULT; 2204 2205 r = __copy_to_user((void __user *)ghc->hva + offset, data, len); 2206 if (r) 2207 return -EFAULT; 2208 mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT); 2209 2210 return 0; 2211 } 2212 EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); 2213 2214 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2215 void *data, unsigned long len) 2216 { 2217 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); 2218 } 2219 EXPORT_SYMBOL_GPL(kvm_write_guest_cached); 2220 2221 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2222 void *data, unsigned long len) 2223 { 2224 struct kvm_memslots *slots = kvm_memslots(kvm); 2225 int r; 2226 2227 BUG_ON(len > ghc->len); 2228 2229 if (slots->generation != ghc->generation) 2230 __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, 
ghc->len); 2231 2232 if (unlikely(!ghc->memslot)) 2233 return kvm_read_guest(kvm, ghc->gpa, data, len); 2234 2235 if (kvm_is_error_hva(ghc->hva)) 2236 return -EFAULT; 2237 2238 r = __copy_from_user(data, (void __user *)ghc->hva, len); 2239 if (r) 2240 return -EFAULT; 2241 2242 return 0; 2243 } 2244 EXPORT_SYMBOL_GPL(kvm_read_guest_cached); 2245 2246 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 2247 { 2248 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 2249 2250 return kvm_write_guest_page(kvm, gfn, zero_page, offset, len); 2251 } 2252 EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 2253 2254 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 2255 { 2256 gfn_t gfn = gpa >> PAGE_SHIFT; 2257 int seg; 2258 int offset = offset_in_page(gpa); 2259 int ret; 2260 2261 while ((seg = next_segment(len, offset)) != 0) { 2262 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 2263 if (ret < 0) 2264 return ret; 2265 offset = 0; 2266 len -= seg; 2267 ++gfn; 2268 } 2269 return 0; 2270 } 2271 EXPORT_SYMBOL_GPL(kvm_clear_guest); 2272 2273 static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, 2274 gfn_t gfn) 2275 { 2276 if (memslot && memslot->dirty_bitmap) { 2277 unsigned long rel_gfn = gfn - memslot->base_gfn; 2278 2279 set_bit_le(rel_gfn, memslot->dirty_bitmap); 2280 } 2281 } 2282 2283 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 2284 { 2285 struct kvm_memory_slot *memslot; 2286 2287 memslot = gfn_to_memslot(kvm, gfn); 2288 mark_page_dirty_in_slot(memslot, gfn); 2289 } 2290 EXPORT_SYMBOL_GPL(mark_page_dirty); 2291 2292 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) 2293 { 2294 struct kvm_memory_slot *memslot; 2295 2296 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2297 mark_page_dirty_in_slot(memslot, gfn); 2298 } 2299 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); 2300 2301 void kvm_sigset_activate(struct kvm_vcpu *vcpu) 2302 { 2303 if (!vcpu->sigset_active) 2304 return; 2305 2306 /* 2307 * This does a lockless modification of ->real_blocked, which is fine 2308 * because, only current can change ->real_blocked and all readers of 2309 * ->real_blocked don't care as long ->real_blocked is always a subset 2310 * of ->blocked. 
2311 */ 2312 sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked); 2313 } 2314 2315 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu) 2316 { 2317 if (!vcpu->sigset_active) 2318 return; 2319 2320 sigprocmask(SIG_SETMASK, &current->real_blocked, NULL); 2321 sigemptyset(&current->real_blocked); 2322 } 2323 2324 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) 2325 { 2326 unsigned int old, val, grow, grow_start; 2327 2328 old = val = vcpu->halt_poll_ns; 2329 grow_start = READ_ONCE(halt_poll_ns_grow_start); 2330 grow = READ_ONCE(halt_poll_ns_grow); 2331 if (!grow) 2332 goto out; 2333 2334 val *= grow; 2335 if (val < grow_start) 2336 val = grow_start; 2337 2338 if (val > halt_poll_ns) 2339 val = halt_poll_ns; 2340 2341 vcpu->halt_poll_ns = val; 2342 out: 2343 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old); 2344 } 2345 2346 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu) 2347 { 2348 unsigned int old, val, shrink; 2349 2350 old = val = vcpu->halt_poll_ns; 2351 shrink = READ_ONCE(halt_poll_ns_shrink); 2352 if (shrink == 0) 2353 val = 0; 2354 else 2355 val /= shrink; 2356 2357 vcpu->halt_poll_ns = val; 2358 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old); 2359 } 2360 2361 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) 2362 { 2363 int ret = -EINTR; 2364 int idx = srcu_read_lock(&vcpu->kvm->srcu); 2365 2366 if (kvm_arch_vcpu_runnable(vcpu)) { 2367 kvm_make_request(KVM_REQ_UNHALT, vcpu); 2368 goto out; 2369 } 2370 if (kvm_cpu_has_pending_timer(vcpu)) 2371 goto out; 2372 if (signal_pending(current)) 2373 goto out; 2374 2375 ret = 0; 2376 out: 2377 srcu_read_unlock(&vcpu->kvm->srcu, idx); 2378 return ret; 2379 } 2380 2381 /* 2382 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 2383 */ 2384 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 2385 { 2386 ktime_t start, cur; 2387 DECLARE_SWAITQUEUE(wait); 2388 bool waited = false; 2389 u64 block_ns; 2390 2391 kvm_arch_vcpu_blocking(vcpu); 2392 2393 start = cur = ktime_get(); 2394 if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) { 2395 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns); 2396 2397 ++vcpu->stat.halt_attempted_poll; 2398 do { 2399 /* 2400 * This sets KVM_REQ_UNHALT if an interrupt 2401 * arrives.
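 * kvm_vcpu_check_block() then returns a negative value, which ends the
 * poll below and is counted as a successful poll.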
2402 */ 2403 if (kvm_vcpu_check_block(vcpu) < 0) { 2404 ++vcpu->stat.halt_successful_poll; 2405 if (!vcpu_valid_wakeup(vcpu)) 2406 ++vcpu->stat.halt_poll_invalid; 2407 goto out; 2408 } 2409 cur = ktime_get(); 2410 } while (single_task_running() && ktime_before(cur, stop)); 2411 } 2412 2413 for (;;) { 2414 prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 2415 2416 if (kvm_vcpu_check_block(vcpu) < 0) 2417 break; 2418 2419 waited = true; 2420 schedule(); 2421 } 2422 2423 finish_swait(&vcpu->wq, &wait); 2424 cur = ktime_get(); 2425 out: 2426 kvm_arch_vcpu_unblocking(vcpu); 2427 block_ns = ktime_to_ns(cur) - ktime_to_ns(start); 2428 2429 if (!kvm_arch_no_poll(vcpu)) { 2430 if (!vcpu_valid_wakeup(vcpu)) { 2431 shrink_halt_poll_ns(vcpu); 2432 } else if (halt_poll_ns) { 2433 if (block_ns <= vcpu->halt_poll_ns) 2434 ; 2435 /* we had a long block, shrink polling */ 2436 else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns) 2437 shrink_halt_poll_ns(vcpu); 2438 /* we had a short halt and our poll time is too small */ 2439 else if (vcpu->halt_poll_ns < halt_poll_ns && 2440 block_ns < halt_poll_ns) 2441 grow_halt_poll_ns(vcpu); 2442 } else { 2443 vcpu->halt_poll_ns = 0; 2444 } 2445 } 2446 2447 trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu)); 2448 kvm_arch_vcpu_block_finish(vcpu); 2449 } 2450 EXPORT_SYMBOL_GPL(kvm_vcpu_block); 2451 2452 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) 2453 { 2454 struct swait_queue_head *wqp; 2455 2456 wqp = kvm_arch_vcpu_wq(vcpu); 2457 if (swq_has_sleeper(wqp)) { 2458 swake_up_one(wqp); 2459 WRITE_ONCE(vcpu->ready, true); 2460 ++vcpu->stat.halt_wakeup; 2461 return true; 2462 } 2463 2464 return false; 2465 } 2466 EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); 2467 2468 #ifndef CONFIG_S390 2469 /* 2470 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. 2471 */ 2472 void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 2473 { 2474 int me; 2475 int cpu = vcpu->cpu; 2476 2477 if (kvm_vcpu_wake_up(vcpu)) 2478 return; 2479 2480 me = get_cpu(); 2481 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 2482 if (kvm_arch_vcpu_should_kick(vcpu)) 2483 smp_send_reschedule(cpu); 2484 put_cpu(); 2485 } 2486 EXPORT_SYMBOL_GPL(kvm_vcpu_kick); 2487 #endif /* !CONFIG_S390 */ 2488 2489 int kvm_vcpu_yield_to(struct kvm_vcpu *target) 2490 { 2491 struct pid *pid; 2492 struct task_struct *task = NULL; 2493 int ret = 0; 2494 2495 rcu_read_lock(); 2496 pid = rcu_dereference(target->pid); 2497 if (pid) 2498 task = get_pid_task(pid, PIDTYPE_PID); 2499 rcu_read_unlock(); 2500 if (!task) 2501 return ret; 2502 ret = yield_to(task, 1); 2503 put_task_struct(task); 2504 2505 return ret; 2506 } 2507 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); 2508 2509 /* 2510 * Helper that checks whether a VCPU is eligible for directed yield. 2511 * Most eligible candidate to yield is decided by following heuristics: 2512 * 2513 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently 2514 * (preempted lock holder), indicated by @in_spin_loop. 2515 * Set at the beiginning and cleared at the end of interception/PLE handler. 2516 * 2517 * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get 2518 * chance last time (mostly it has become eligible now since we have probably 2519 * yielded to lockholder in last iteration. This is done by toggling 2520 * @dy_eligible each time a VCPU checked for eligibility.) 
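 * For example, a VCPU that is in the spin loop with @dy_eligible == false
 * is skipped once and has the flag flipped to true, so the next scan may
 * yield to it.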
2521 * 2522 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding 2523 * to preempted lock-holder could result in wrong VCPU selection and CPU 2524 * burning. Giving priority for a potential lock-holder increases lock 2525 * progress. 2526 * 2527 * Since algorithm is based on heuristics, accessing another VCPU data without 2528 * locking does not harm. It may result in trying to yield to same VCPU, fail 2529 * and continue with next VCPU and so on. 2530 */ 2531 static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) 2532 { 2533 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT 2534 bool eligible; 2535 2536 eligible = !vcpu->spin_loop.in_spin_loop || 2537 vcpu->spin_loop.dy_eligible; 2538 2539 if (vcpu->spin_loop.in_spin_loop) 2540 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); 2541 2542 return eligible; 2543 #else 2544 return true; 2545 #endif 2546 } 2547 2548 /* 2549 * Unlike kvm_arch_vcpu_runnable, this function is called outside 2550 * a vcpu_load/vcpu_put pair. However, for most architectures 2551 * kvm_arch_vcpu_runnable does not require vcpu_load. 2552 */ 2553 bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) 2554 { 2555 return kvm_arch_vcpu_runnable(vcpu); 2556 } 2557 2558 static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu) 2559 { 2560 if (kvm_arch_dy_runnable(vcpu)) 2561 return true; 2562 2563 #ifdef CONFIG_KVM_ASYNC_PF 2564 if (!list_empty_careful(&vcpu->async_pf.done)) 2565 return true; 2566 #endif 2567 2568 return false; 2569 } 2570 2571 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) 2572 { 2573 struct kvm *kvm = me->kvm; 2574 struct kvm_vcpu *vcpu; 2575 int last_boosted_vcpu = me->kvm->last_boosted_vcpu; 2576 int yielded = 0; 2577 int try = 3; 2578 int pass; 2579 int i; 2580 2581 kvm_vcpu_set_in_spin_loop(me, true); 2582 /* 2583 * We boost the priority of a VCPU that is runnable but not 2584 * currently running, because it got preempted by something 2585 * else and called schedule in __vcpu_run. Hopefully that 2586 * VCPU is holding the lock that we need and will release it. 2587 * We approximate round-robin by starting at the last boosted VCPU. 
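 * The two passes below first scan the VCPUs after last_boosted_vcpu and
 * then wrap around to the start, so each candidate is visited at most
 * once per invocation.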
2588 */ 2589 for (pass = 0; pass < 2 && !yielded && try; pass++) { 2590 kvm_for_each_vcpu(i, vcpu, kvm) { 2591 if (!pass && i <= last_boosted_vcpu) { 2592 i = last_boosted_vcpu; 2593 continue; 2594 } else if (pass && i > last_boosted_vcpu) 2595 break; 2596 if (!READ_ONCE(vcpu->ready)) 2597 continue; 2598 if (vcpu == me) 2599 continue; 2600 if (swait_active(&vcpu->wq) && !vcpu_dy_runnable(vcpu)) 2601 continue; 2602 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && 2603 !kvm_arch_vcpu_in_kernel(vcpu)) 2604 continue; 2605 if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) 2606 continue; 2607 2608 yielded = kvm_vcpu_yield_to(vcpu); 2609 if (yielded > 0) { 2610 kvm->last_boosted_vcpu = i; 2611 break; 2612 } else if (yielded < 0) { 2613 try--; 2614 if (!try) 2615 break; 2616 } 2617 } 2618 } 2619 kvm_vcpu_set_in_spin_loop(me, false); 2620 2621 /* Ensure vcpu is not eligible during next spinloop */ 2622 kvm_vcpu_set_dy_eligible(me, false); 2623 } 2624 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); 2625 2626 static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf) 2627 { 2628 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data; 2629 struct page *page; 2630 2631 if (vmf->pgoff == 0) 2632 page = virt_to_page(vcpu->run); 2633 #ifdef CONFIG_X86 2634 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 2635 page = virt_to_page(vcpu->arch.pio_data); 2636 #endif 2637 #ifdef CONFIG_KVM_MMIO 2638 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 2639 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 2640 #endif 2641 else 2642 return kvm_arch_vcpu_fault(vcpu, vmf); 2643 get_page(page); 2644 vmf->page = page; 2645 return 0; 2646 } 2647 2648 static const struct vm_operations_struct kvm_vcpu_vm_ops = { 2649 .fault = kvm_vcpu_fault, 2650 }; 2651 2652 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 2653 { 2654 vma->vm_ops = &kvm_vcpu_vm_ops; 2655 return 0; 2656 } 2657 2658 static int kvm_vcpu_release(struct inode *inode, struct file *filp) 2659 { 2660 struct kvm_vcpu *vcpu = filp->private_data; 2661 2662 debugfs_remove_recursive(vcpu->debugfs_dentry); 2663 kvm_put_kvm(vcpu->kvm); 2664 return 0; 2665 } 2666 2667 static struct file_operations kvm_vcpu_fops = { 2668 .release = kvm_vcpu_release, 2669 .unlocked_ioctl = kvm_vcpu_ioctl, 2670 .mmap = kvm_vcpu_mmap, 2671 .llseek = noop_llseek, 2672 KVM_COMPAT(kvm_vcpu_compat_ioctl), 2673 }; 2674 2675 /* 2676 * Allocates an inode for the vcpu. 2677 */ 2678 static int create_vcpu_fd(struct kvm_vcpu *vcpu) 2679 { 2680 char name[8 + 1 + ITOA_MAX_LEN + 1]; 2681 2682 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id); 2683 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); 2684 } 2685 2686 static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) 2687 { 2688 #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS 2689 char dir_name[ITOA_MAX_LEN * 2]; 2690 2691 if (!debugfs_initialized()) 2692 return; 2693 2694 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id); 2695 vcpu->debugfs_dentry = debugfs_create_dir(dir_name, 2696 vcpu->kvm->debugfs_dentry); 2697 2698 kvm_arch_create_vcpu_debugfs(vcpu); 2699 #endif 2700 } 2701 2702 /* 2703 * Creates some virtual cpus. Good luck creating more than one. 
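 * kvm->created_vcpus is reserved under kvm->lock before the potentially
 * slow arch vcpu creation, so the KVM_MAX_VCPUS check cannot race with
 * concurrent callers.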
2704 */ 2705 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) 2706 { 2707 int r; 2708 struct kvm_vcpu *vcpu; 2709 2710 if (id >= KVM_MAX_VCPU_ID) 2711 return -EINVAL; 2712 2713 mutex_lock(&kvm->lock); 2714 if (kvm->created_vcpus == KVM_MAX_VCPUS) { 2715 mutex_unlock(&kvm->lock); 2716 return -EINVAL; 2717 } 2718 2719 kvm->created_vcpus++; 2720 mutex_unlock(&kvm->lock); 2721 2722 vcpu = kvm_arch_vcpu_create(kvm, id); 2723 if (IS_ERR(vcpu)) { 2724 r = PTR_ERR(vcpu); 2725 goto vcpu_decrement; 2726 } 2727 2728 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 2729 2730 r = kvm_arch_vcpu_setup(vcpu); 2731 if (r) 2732 goto vcpu_destroy; 2733 2734 kvm_create_vcpu_debugfs(vcpu); 2735 2736 mutex_lock(&kvm->lock); 2737 if (kvm_get_vcpu_by_id(kvm, id)) { 2738 r = -EEXIST; 2739 goto unlock_vcpu_destroy; 2740 } 2741 2742 BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); 2743 2744 /* Now it's all set up, let userspace reach it */ 2745 kvm_get_kvm(kvm); 2746 r = create_vcpu_fd(vcpu); 2747 if (r < 0) { 2748 kvm_put_kvm(kvm); 2749 goto unlock_vcpu_destroy; 2750 } 2751 2752 kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; 2753 2754 /* 2755 * Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus 2756 * before kvm->online_vcpu's incremented value. 2757 */ 2758 smp_wmb(); 2759 atomic_inc(&kvm->online_vcpus); 2760 2761 mutex_unlock(&kvm->lock); 2762 kvm_arch_vcpu_postcreate(vcpu); 2763 return r; 2764 2765 unlock_vcpu_destroy: 2766 mutex_unlock(&kvm->lock); 2767 debugfs_remove_recursive(vcpu->debugfs_dentry); 2768 vcpu_destroy: 2769 kvm_arch_vcpu_destroy(vcpu); 2770 vcpu_decrement: 2771 mutex_lock(&kvm->lock); 2772 kvm->created_vcpus--; 2773 mutex_unlock(&kvm->lock); 2774 return r; 2775 } 2776 2777 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 2778 { 2779 if (sigset) { 2780 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 2781 vcpu->sigset_active = 1; 2782 vcpu->sigset = *sigset; 2783 } else 2784 vcpu->sigset_active = 0; 2785 return 0; 2786 } 2787 2788 static long kvm_vcpu_ioctl(struct file *filp, 2789 unsigned int ioctl, unsigned long arg) 2790 { 2791 struct kvm_vcpu *vcpu = filp->private_data; 2792 void __user *argp = (void __user *)arg; 2793 int r; 2794 struct kvm_fpu *fpu = NULL; 2795 struct kvm_sregs *kvm_sregs = NULL; 2796 2797 if (vcpu->kvm->mm != current->mm) 2798 return -EIO; 2799 2800 if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) 2801 return -EINVAL; 2802 2803 /* 2804 * Some architectures have vcpu ioctls that are asynchronous to vcpu 2805 * execution; mutex_lock() would break them. 2806 */ 2807 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg); 2808 if (r != -ENOIOCTLCMD) 2809 return r; 2810 2811 if (mutex_lock_killable(&vcpu->mutex)) 2812 return -EINTR; 2813 switch (ioctl) { 2814 case KVM_RUN: { 2815 struct pid *oldpid; 2816 r = -EINVAL; 2817 if (arg) 2818 goto out; 2819 oldpid = rcu_access_pointer(vcpu->pid); 2820 if (unlikely(oldpid != task_pid(current))) { 2821 /* The thread running this VCPU changed. 
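 * The new pid is published with rcu_assign_pointer() and the old one is
 * only dropped after a grace period, so lockless readers of vcpu->pid
 * never see a freed pid.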
*/ 2822 struct pid *newpid; 2823 2824 r = kvm_arch_vcpu_run_pid_change(vcpu); 2825 if (r) 2826 break; 2827 2828 newpid = get_task_pid(current, PIDTYPE_PID); 2829 rcu_assign_pointer(vcpu->pid, newpid); 2830 if (oldpid) 2831 synchronize_rcu(); 2832 put_pid(oldpid); 2833 } 2834 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 2835 trace_kvm_userspace_exit(vcpu->run->exit_reason, r); 2836 break; 2837 } 2838 case KVM_GET_REGS: { 2839 struct kvm_regs *kvm_regs; 2840 2841 r = -ENOMEM; 2842 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT); 2843 if (!kvm_regs) 2844 goto out; 2845 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 2846 if (r) 2847 goto out_free1; 2848 r = -EFAULT; 2849 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 2850 goto out_free1; 2851 r = 0; 2852 out_free1: 2853 kfree(kvm_regs); 2854 break; 2855 } 2856 case KVM_SET_REGS: { 2857 struct kvm_regs *kvm_regs; 2858 2859 r = -ENOMEM; 2860 kvm_regs = memdup_user(argp, sizeof(*kvm_regs)); 2861 if (IS_ERR(kvm_regs)) { 2862 r = PTR_ERR(kvm_regs); 2863 goto out; 2864 } 2865 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 2866 kfree(kvm_regs); 2867 break; 2868 } 2869 case KVM_GET_SREGS: { 2870 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), 2871 GFP_KERNEL_ACCOUNT); 2872 r = -ENOMEM; 2873 if (!kvm_sregs) 2874 goto out; 2875 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 2876 if (r) 2877 goto out; 2878 r = -EFAULT; 2879 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 2880 goto out; 2881 r = 0; 2882 break; 2883 } 2884 case KVM_SET_SREGS: { 2885 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); 2886 if (IS_ERR(kvm_sregs)) { 2887 r = PTR_ERR(kvm_sregs); 2888 kvm_sregs = NULL; 2889 goto out; 2890 } 2891 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 2892 break; 2893 } 2894 case KVM_GET_MP_STATE: { 2895 struct kvm_mp_state mp_state; 2896 2897 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 2898 if (r) 2899 goto out; 2900 r = -EFAULT; 2901 if (copy_to_user(argp, &mp_state, sizeof(mp_state))) 2902 goto out; 2903 r = 0; 2904 break; 2905 } 2906 case KVM_SET_MP_STATE: { 2907 struct kvm_mp_state mp_state; 2908 2909 r = -EFAULT; 2910 if (copy_from_user(&mp_state, argp, sizeof(mp_state))) 2911 goto out; 2912 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 2913 break; 2914 } 2915 case KVM_TRANSLATE: { 2916 struct kvm_translation tr; 2917 2918 r = -EFAULT; 2919 if (copy_from_user(&tr, argp, sizeof(tr))) 2920 goto out; 2921 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 2922 if (r) 2923 goto out; 2924 r = -EFAULT; 2925 if (copy_to_user(argp, &tr, sizeof(tr))) 2926 goto out; 2927 r = 0; 2928 break; 2929 } 2930 case KVM_SET_GUEST_DEBUG: { 2931 struct kvm_guest_debug dbg; 2932 2933 r = -EFAULT; 2934 if (copy_from_user(&dbg, argp, sizeof(dbg))) 2935 goto out; 2936 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 2937 break; 2938 } 2939 case KVM_SET_SIGNAL_MASK: { 2940 struct kvm_signal_mask __user *sigmask_arg = argp; 2941 struct kvm_signal_mask kvm_sigmask; 2942 sigset_t sigset, *p; 2943 2944 p = NULL; 2945 if (argp) { 2946 r = -EFAULT; 2947 if (copy_from_user(&kvm_sigmask, argp, 2948 sizeof(kvm_sigmask))) 2949 goto out; 2950 r = -EINVAL; 2951 if (kvm_sigmask.len != sizeof(sigset)) 2952 goto out; 2953 r = -EFAULT; 2954 if (copy_from_user(&sigset, sigmask_arg->sigset, 2955 sizeof(sigset))) 2956 goto out; 2957 p = &sigset; 2958 } 2959 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); 2960 break; 2961 } 2962 case KVM_GET_FPU: { 2963 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT); 2964 r = -ENOMEM; 
2965 if (!fpu) 2966 goto out; 2967 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); 2968 if (r) 2969 goto out; 2970 r = -EFAULT; 2971 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) 2972 goto out; 2973 r = 0; 2974 break; 2975 } 2976 case KVM_SET_FPU: { 2977 fpu = memdup_user(argp, sizeof(*fpu)); 2978 if (IS_ERR(fpu)) { 2979 r = PTR_ERR(fpu); 2980 fpu = NULL; 2981 goto out; 2982 } 2983 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 2984 break; 2985 } 2986 default: 2987 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 2988 } 2989 out: 2990 mutex_unlock(&vcpu->mutex); 2991 kfree(fpu); 2992 kfree(kvm_sregs); 2993 return r; 2994 } 2995 2996 #ifdef CONFIG_KVM_COMPAT 2997 static long kvm_vcpu_compat_ioctl(struct file *filp, 2998 unsigned int ioctl, unsigned long arg) 2999 { 3000 struct kvm_vcpu *vcpu = filp->private_data; 3001 void __user *argp = compat_ptr(arg); 3002 int r; 3003 3004 if (vcpu->kvm->mm != current->mm) 3005 return -EIO; 3006 3007 switch (ioctl) { 3008 case KVM_SET_SIGNAL_MASK: { 3009 struct kvm_signal_mask __user *sigmask_arg = argp; 3010 struct kvm_signal_mask kvm_sigmask; 3011 sigset_t sigset; 3012 3013 if (argp) { 3014 r = -EFAULT; 3015 if (copy_from_user(&kvm_sigmask, argp, 3016 sizeof(kvm_sigmask))) 3017 goto out; 3018 r = -EINVAL; 3019 if (kvm_sigmask.len != sizeof(compat_sigset_t)) 3020 goto out; 3021 r = -EFAULT; 3022 if (get_compat_sigset(&sigset, (void *)sigmask_arg->sigset)) 3023 goto out; 3024 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 3025 } else 3026 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL); 3027 break; 3028 } 3029 default: 3030 r = kvm_vcpu_ioctl(filp, ioctl, arg); 3031 } 3032 3033 out: 3034 return r; 3035 } 3036 #endif 3037 3038 static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma) 3039 { 3040 struct kvm_device *dev = filp->private_data; 3041 3042 if (dev->ops->mmap) 3043 return dev->ops->mmap(dev, vma); 3044 3045 return -ENODEV; 3046 } 3047 3048 static int kvm_device_ioctl_attr(struct kvm_device *dev, 3049 int (*accessor)(struct kvm_device *dev, 3050 struct kvm_device_attr *attr), 3051 unsigned long arg) 3052 { 3053 struct kvm_device_attr attr; 3054 3055 if (!accessor) 3056 return -EPERM; 3057 3058 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) 3059 return -EFAULT; 3060 3061 return accessor(dev, &attr); 3062 } 3063 3064 static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, 3065 unsigned long arg) 3066 { 3067 struct kvm_device *dev = filp->private_data; 3068 3069 if (dev->kvm->mm != current->mm) 3070 return -EIO; 3071 3072 switch (ioctl) { 3073 case KVM_SET_DEVICE_ATTR: 3074 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg); 3075 case KVM_GET_DEVICE_ATTR: 3076 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg); 3077 case KVM_HAS_DEVICE_ATTR: 3078 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg); 3079 default: 3080 if (dev->ops->ioctl) 3081 return dev->ops->ioctl(dev, ioctl, arg); 3082 3083 return -ENOTTY; 3084 } 3085 } 3086 3087 static int kvm_device_release(struct inode *inode, struct file *filp) 3088 { 3089 struct kvm_device *dev = filp->private_data; 3090 struct kvm *kvm = dev->kvm; 3091 3092 if (dev->ops->release) { 3093 mutex_lock(&kvm->lock); 3094 list_del(&dev->vm_node); 3095 dev->ops->release(dev); 3096 mutex_unlock(&kvm->lock); 3097 } 3098 3099 kvm_put_kvm(kvm); 3100 return 0; 3101 } 3102 3103 static const struct file_operations kvm_device_fops = { 3104 .unlocked_ioctl = kvm_device_ioctl, 3105 .release = kvm_device_release, 3106 KVM_COMPAT(kvm_device_ioctl), 3107 .mmap = 
kvm_device_mmap, 3108 }; 3109 3110 struct kvm_device *kvm_device_from_filp(struct file *filp) 3111 { 3112 if (filp->f_op != &kvm_device_fops) 3113 return NULL; 3114 3115 return filp->private_data; 3116 } 3117 3118 static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { 3119 #ifdef CONFIG_KVM_MPIC 3120 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, 3121 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, 3122 #endif 3123 }; 3124 3125 int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) 3126 { 3127 if (type >= ARRAY_SIZE(kvm_device_ops_table)) 3128 return -ENOSPC; 3129 3130 if (kvm_device_ops_table[type] != NULL) 3131 return -EEXIST; 3132 3133 kvm_device_ops_table[type] = ops; 3134 return 0; 3135 } 3136 3137 void kvm_unregister_device_ops(u32 type) 3138 { 3139 if (kvm_device_ops_table[type] != NULL) 3140 kvm_device_ops_table[type] = NULL; 3141 } 3142 3143 static int kvm_ioctl_create_device(struct kvm *kvm, 3144 struct kvm_create_device *cd) 3145 { 3146 struct kvm_device_ops *ops = NULL; 3147 struct kvm_device *dev; 3148 bool test = cd->flags & KVM_CREATE_DEVICE_TEST; 3149 int type; 3150 int ret; 3151 3152 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) 3153 return -ENODEV; 3154 3155 type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table)); 3156 ops = kvm_device_ops_table[type]; 3157 if (ops == NULL) 3158 return -ENODEV; 3159 3160 if (test) 3161 return 0; 3162 3163 dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT); 3164 if (!dev) 3165 return -ENOMEM; 3166 3167 dev->ops = ops; 3168 dev->kvm = kvm; 3169 3170 mutex_lock(&kvm->lock); 3171 ret = ops->create(dev, type); 3172 if (ret < 0) { 3173 mutex_unlock(&kvm->lock); 3174 kfree(dev); 3175 return ret; 3176 } 3177 list_add(&dev->vm_node, &kvm->devices); 3178 mutex_unlock(&kvm->lock); 3179 3180 if (ops->init) 3181 ops->init(dev); 3182 3183 kvm_get_kvm(kvm); 3184 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); 3185 if (ret < 0) { 3186 kvm_put_kvm(kvm); 3187 mutex_lock(&kvm->lock); 3188 list_del(&dev->vm_node); 3189 mutex_unlock(&kvm->lock); 3190 ops->destroy(dev); 3191 return ret; 3192 } 3193 3194 cd->fd = ret; 3195 return 0; 3196 } 3197 3198 static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) 3199 { 3200 switch (arg) { 3201 case KVM_CAP_USER_MEMORY: 3202 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 3203 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 3204 case KVM_CAP_INTERNAL_ERROR_DATA: 3205 #ifdef CONFIG_HAVE_KVM_MSI 3206 case KVM_CAP_SIGNAL_MSI: 3207 #endif 3208 #ifdef CONFIG_HAVE_KVM_IRQFD 3209 case KVM_CAP_IRQFD: 3210 case KVM_CAP_IRQFD_RESAMPLE: 3211 #endif 3212 case KVM_CAP_IOEVENTFD_ANY_LENGTH: 3213 case KVM_CAP_CHECK_EXTENSION_VM: 3214 case KVM_CAP_ENABLE_CAP_VM: 3215 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 3216 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: 3217 #endif 3218 return 1; 3219 #ifdef CONFIG_KVM_MMIO 3220 case KVM_CAP_COALESCED_MMIO: 3221 return KVM_COALESCED_MMIO_PAGE_OFFSET; 3222 case KVM_CAP_COALESCED_PIO: 3223 return 1; 3224 #endif 3225 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 3226 case KVM_CAP_IRQ_ROUTING: 3227 return KVM_MAX_IRQ_ROUTES; 3228 #endif 3229 #if KVM_ADDRESS_SPACE_NUM > 1 3230 case KVM_CAP_MULTI_ADDRESS_SPACE: 3231 return KVM_ADDRESS_SPACE_NUM; 3232 #endif 3233 case KVM_CAP_NR_MEMSLOTS: 3234 return KVM_USER_MEM_SLOTS; 3235 default: 3236 break; 3237 } 3238 return kvm_vm_ioctl_check_extension(kvm, arg); 3239 } 3240 3241 int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm, 3242 struct kvm_enable_cap *cap) 3243 { 3244 return 
-EINVAL; 3245 } 3246 3247 static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, 3248 struct kvm_enable_cap *cap) 3249 { 3250 switch (cap->cap) { 3251 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 3252 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: 3253 if (cap->flags || (cap->args[0] & ~1)) 3254 return -EINVAL; 3255 kvm->manual_dirty_log_protect = cap->args[0]; 3256 return 0; 3257 #endif 3258 default: 3259 return kvm_vm_ioctl_enable_cap(kvm, cap); 3260 } 3261 } 3262 3263 static long kvm_vm_ioctl(struct file *filp, 3264 unsigned int ioctl, unsigned long arg) 3265 { 3266 struct kvm *kvm = filp->private_data; 3267 void __user *argp = (void __user *)arg; 3268 int r; 3269 3270 if (kvm->mm != current->mm) 3271 return -EIO; 3272 switch (ioctl) { 3273 case KVM_CREATE_VCPU: 3274 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 3275 break; 3276 case KVM_ENABLE_CAP: { 3277 struct kvm_enable_cap cap; 3278 3279 r = -EFAULT; 3280 if (copy_from_user(&cap, argp, sizeof(cap))) 3281 goto out; 3282 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap); 3283 break; 3284 } 3285 case KVM_SET_USER_MEMORY_REGION: { 3286 struct kvm_userspace_memory_region kvm_userspace_mem; 3287 3288 r = -EFAULT; 3289 if (copy_from_user(&kvm_userspace_mem, argp, 3290 sizeof(kvm_userspace_mem))) 3291 goto out; 3292 3293 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); 3294 break; 3295 } 3296 case KVM_GET_DIRTY_LOG: { 3297 struct kvm_dirty_log log; 3298 3299 r = -EFAULT; 3300 if (copy_from_user(&log, argp, sizeof(log))) 3301 goto out; 3302 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 3303 break; 3304 } 3305 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 3306 case KVM_CLEAR_DIRTY_LOG: { 3307 struct kvm_clear_dirty_log log; 3308 3309 r = -EFAULT; 3310 if (copy_from_user(&log, argp, sizeof(log))) 3311 goto out; 3312 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log); 3313 break; 3314 } 3315 #endif 3316 #ifdef CONFIG_KVM_MMIO 3317 case KVM_REGISTER_COALESCED_MMIO: { 3318 struct kvm_coalesced_mmio_zone zone; 3319 3320 r = -EFAULT; 3321 if (copy_from_user(&zone, argp, sizeof(zone))) 3322 goto out; 3323 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 3324 break; 3325 } 3326 case KVM_UNREGISTER_COALESCED_MMIO: { 3327 struct kvm_coalesced_mmio_zone zone; 3328 3329 r = -EFAULT; 3330 if (copy_from_user(&zone, argp, sizeof(zone))) 3331 goto out; 3332 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 3333 break; 3334 } 3335 #endif 3336 case KVM_IRQFD: { 3337 struct kvm_irqfd data; 3338 3339 r = -EFAULT; 3340 if (copy_from_user(&data, argp, sizeof(data))) 3341 goto out; 3342 r = kvm_irqfd(kvm, &data); 3343 break; 3344 } 3345 case KVM_IOEVENTFD: { 3346 struct kvm_ioeventfd data; 3347 3348 r = -EFAULT; 3349 if (copy_from_user(&data, argp, sizeof(data))) 3350 goto out; 3351 r = kvm_ioeventfd(kvm, &data); 3352 break; 3353 } 3354 #ifdef CONFIG_HAVE_KVM_MSI 3355 case KVM_SIGNAL_MSI: { 3356 struct kvm_msi msi; 3357 3358 r = -EFAULT; 3359 if (copy_from_user(&msi, argp, sizeof(msi))) 3360 goto out; 3361 r = kvm_send_userspace_msi(kvm, &msi); 3362 break; 3363 } 3364 #endif 3365 #ifdef __KVM_HAVE_IRQ_LINE 3366 case KVM_IRQ_LINE_STATUS: 3367 case KVM_IRQ_LINE: { 3368 struct kvm_irq_level irq_event; 3369 3370 r = -EFAULT; 3371 if (copy_from_user(&irq_event, argp, sizeof(irq_event))) 3372 goto out; 3373 3374 r = kvm_vm_ioctl_irq_line(kvm, &irq_event, 3375 ioctl == KVM_IRQ_LINE_STATUS); 3376 if (r) 3377 goto out; 3378 3379 r = -EFAULT; 3380 if (ioctl == KVM_IRQ_LINE_STATUS) { 3381 if (copy_to_user(argp, &irq_event, sizeof(irq_event))) 3382 goto out; 3383 } 
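/* Only KVM_IRQ_LINE_STATUS copies the updated irq_event back to userspace. */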
3384 3385 r = 0; 3386 break; 3387 } 3388 #endif 3389 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 3390 case KVM_SET_GSI_ROUTING: { 3391 struct kvm_irq_routing routing; 3392 struct kvm_irq_routing __user *urouting; 3393 struct kvm_irq_routing_entry *entries = NULL; 3394 3395 r = -EFAULT; 3396 if (copy_from_user(&routing, argp, sizeof(routing))) 3397 goto out; 3398 r = -EINVAL; 3399 if (!kvm_arch_can_set_irq_routing(kvm)) 3400 goto out; 3401 if (routing.nr > KVM_MAX_IRQ_ROUTES) 3402 goto out; 3403 if (routing.flags) 3404 goto out; 3405 if (routing.nr) { 3406 r = -ENOMEM; 3407 entries = vmalloc(array_size(sizeof(*entries), 3408 routing.nr)); 3409 if (!entries) 3410 goto out; 3411 r = -EFAULT; 3412 urouting = argp; 3413 if (copy_from_user(entries, urouting->entries, 3414 routing.nr * sizeof(*entries))) 3415 goto out_free_irq_routing; 3416 } 3417 r = kvm_set_irq_routing(kvm, entries, routing.nr, 3418 routing.flags); 3419 out_free_irq_routing: 3420 vfree(entries); 3421 break; 3422 } 3423 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ 3424 case KVM_CREATE_DEVICE: { 3425 struct kvm_create_device cd; 3426 3427 r = -EFAULT; 3428 if (copy_from_user(&cd, argp, sizeof(cd))) 3429 goto out; 3430 3431 r = kvm_ioctl_create_device(kvm, &cd); 3432 if (r) 3433 goto out; 3434 3435 r = -EFAULT; 3436 if (copy_to_user(argp, &cd, sizeof(cd))) 3437 goto out; 3438 3439 r = 0; 3440 break; 3441 } 3442 case KVM_CHECK_EXTENSION: 3443 r = kvm_vm_ioctl_check_extension_generic(kvm, arg); 3444 break; 3445 default: 3446 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 3447 } 3448 out: 3449 return r; 3450 } 3451 3452 #ifdef CONFIG_KVM_COMPAT 3453 struct compat_kvm_dirty_log { 3454 __u32 slot; 3455 __u32 padding1; 3456 union { 3457 compat_uptr_t dirty_bitmap; /* one bit per page */ 3458 __u64 padding2; 3459 }; 3460 }; 3461 3462 static long kvm_vm_compat_ioctl(struct file *filp, 3463 unsigned int ioctl, unsigned long arg) 3464 { 3465 struct kvm *kvm = filp->private_data; 3466 int r; 3467 3468 if (kvm->mm != current->mm) 3469 return -EIO; 3470 switch (ioctl) { 3471 case KVM_GET_DIRTY_LOG: { 3472 struct compat_kvm_dirty_log compat_log; 3473 struct kvm_dirty_log log; 3474 3475 if (copy_from_user(&compat_log, (void __user *)arg, 3476 sizeof(compat_log))) 3477 return -EFAULT; 3478 log.slot = compat_log.slot; 3479 log.padding1 = compat_log.padding1; 3480 log.padding2 = compat_log.padding2; 3481 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); 3482 3483 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 3484 break; 3485 } 3486 default: 3487 r = kvm_vm_ioctl(filp, ioctl, arg); 3488 } 3489 return r; 3490 } 3491 #endif 3492 3493 static struct file_operations kvm_vm_fops = { 3494 .release = kvm_vm_release, 3495 .unlocked_ioctl = kvm_vm_ioctl, 3496 .llseek = noop_llseek, 3497 KVM_COMPAT(kvm_vm_compat_ioctl), 3498 }; 3499 3500 static int kvm_dev_ioctl_create_vm(unsigned long type) 3501 { 3502 int r; 3503 struct kvm *kvm; 3504 struct file *file; 3505 3506 kvm = kvm_create_vm(type); 3507 if (IS_ERR(kvm)) 3508 return PTR_ERR(kvm); 3509 #ifdef CONFIG_KVM_MMIO 3510 r = kvm_coalesced_mmio_init(kvm); 3511 if (r < 0) 3512 goto put_kvm; 3513 #endif 3514 r = get_unused_fd_flags(O_CLOEXEC); 3515 if (r < 0) 3516 goto put_kvm; 3517 3518 file = anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); 3519 if (IS_ERR(file)) { 3520 put_unused_fd(r); 3521 r = PTR_ERR(file); 3522 goto put_kvm; 3523 } 3524 3525 /* 3526 * Don't call kvm_put_kvm anymore at this point; file->f_op is 3527 * already set, with ->release() being kvm_vm_release(). 
In error 3528 * cases it will be called by the final fput(file) and will take 3529 * care of doing kvm_put_kvm(kvm). 3530 */ 3531 if (kvm_create_vm_debugfs(kvm, r) < 0) { 3532 put_unused_fd(r); 3533 fput(file); 3534 return -ENOMEM; 3535 } 3536 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm); 3537 3538 fd_install(r, file); 3539 return r; 3540 3541 put_kvm: 3542 kvm_put_kvm(kvm); 3543 return r; 3544 } 3545 3546 static long kvm_dev_ioctl(struct file *filp, 3547 unsigned int ioctl, unsigned long arg) 3548 { 3549 long r = -EINVAL; 3550 3551 switch (ioctl) { 3552 case KVM_GET_API_VERSION: 3553 if (arg) 3554 goto out; 3555 r = KVM_API_VERSION; 3556 break; 3557 case KVM_CREATE_VM: 3558 r = kvm_dev_ioctl_create_vm(arg); 3559 break; 3560 case KVM_CHECK_EXTENSION: 3561 r = kvm_vm_ioctl_check_extension_generic(NULL, arg); 3562 break; 3563 case KVM_GET_VCPU_MMAP_SIZE: 3564 if (arg) 3565 goto out; 3566 r = PAGE_SIZE; /* struct kvm_run */ 3567 #ifdef CONFIG_X86 3568 r += PAGE_SIZE; /* pio data page */ 3569 #endif 3570 #ifdef CONFIG_KVM_MMIO 3571 r += PAGE_SIZE; /* coalesced mmio ring page */ 3572 #endif 3573 break; 3574 case KVM_TRACE_ENABLE: 3575 case KVM_TRACE_PAUSE: 3576 case KVM_TRACE_DISABLE: 3577 r = -EOPNOTSUPP; 3578 break; 3579 default: 3580 return kvm_arch_dev_ioctl(filp, ioctl, arg); 3581 } 3582 out: 3583 return r; 3584 } 3585 3586 static struct file_operations kvm_chardev_ops = { 3587 .unlocked_ioctl = kvm_dev_ioctl, 3588 .llseek = noop_llseek, 3589 KVM_COMPAT(kvm_dev_ioctl), 3590 }; 3591 3592 static struct miscdevice kvm_dev = { 3593 KVM_MINOR, 3594 "kvm", 3595 &kvm_chardev_ops, 3596 }; 3597 3598 static void hardware_enable_nolock(void *junk) 3599 { 3600 int cpu = raw_smp_processor_id(); 3601 int r; 3602 3603 if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) 3604 return; 3605 3606 cpumask_set_cpu(cpu, cpus_hardware_enabled); 3607 3608 r = kvm_arch_hardware_enable(); 3609 3610 if (r) { 3611 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 3612 atomic_inc(&hardware_enable_failed); 3613 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu); 3614 } 3615 } 3616 3617 static int kvm_starting_cpu(unsigned int cpu) 3618 { 3619 raw_spin_lock(&kvm_count_lock); 3620 if (kvm_usage_count) 3621 hardware_enable_nolock(NULL); 3622 raw_spin_unlock(&kvm_count_lock); 3623 return 0; 3624 } 3625 3626 static void hardware_disable_nolock(void *junk) 3627 { 3628 int cpu = raw_smp_processor_id(); 3629 3630 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) 3631 return; 3632 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 3633 kvm_arch_hardware_disable(); 3634 } 3635 3636 static int kvm_dying_cpu(unsigned int cpu) 3637 { 3638 raw_spin_lock(&kvm_count_lock); 3639 if (kvm_usage_count) 3640 hardware_disable_nolock(NULL); 3641 raw_spin_unlock(&kvm_count_lock); 3642 return 0; 3643 } 3644 3645 static void hardware_disable_all_nolock(void) 3646 { 3647 BUG_ON(!kvm_usage_count); 3648 3649 kvm_usage_count--; 3650 if (!kvm_usage_count) 3651 on_each_cpu(hardware_disable_nolock, NULL, 1); 3652 } 3653 3654 static void hardware_disable_all(void) 3655 { 3656 raw_spin_lock(&kvm_count_lock); 3657 hardware_disable_all_nolock(); 3658 raw_spin_unlock(&kvm_count_lock); 3659 } 3660 3661 static int hardware_enable_all(void) 3662 { 3663 int r = 0; 3664 3665 raw_spin_lock(&kvm_count_lock); 3666 3667 kvm_usage_count++; 3668 if (kvm_usage_count == 1) { 3669 atomic_set(&hardware_enable_failed, 0); 3670 on_each_cpu(hardware_enable_nolock, NULL, 1); 3671 3672 if (atomic_read(&hardware_enable_failed)) { 3673 hardware_disable_all_nolock(); 
3674 r = -EBUSY; 3675 } 3676 } 3677 3678 raw_spin_unlock(&kvm_count_lock); 3679 3680 return r; 3681 } 3682 3683 static int kvm_reboot(struct notifier_block *notifier, unsigned long val, 3684 void *v) 3685 { 3686 /* 3687 * Some (well, at least mine) BIOSes hang on reboot if 3688 * in vmx root mode. 3689 * 3690 * And Intel TXT required VMX off for all cpu when system shutdown. 3691 */ 3692 pr_info("kvm: exiting hardware virtualization\n"); 3693 kvm_rebooting = true; 3694 on_each_cpu(hardware_disable_nolock, NULL, 1); 3695 return NOTIFY_OK; 3696 } 3697 3698 static struct notifier_block kvm_reboot_notifier = { 3699 .notifier_call = kvm_reboot, 3700 .priority = 0, 3701 }; 3702 3703 static void kvm_io_bus_destroy(struct kvm_io_bus *bus) 3704 { 3705 int i; 3706 3707 for (i = 0; i < bus->dev_count; i++) { 3708 struct kvm_io_device *pos = bus->range[i].dev; 3709 3710 kvm_iodevice_destructor(pos); 3711 } 3712 kfree(bus); 3713 } 3714 3715 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1, 3716 const struct kvm_io_range *r2) 3717 { 3718 gpa_t addr1 = r1->addr; 3719 gpa_t addr2 = r2->addr; 3720 3721 if (addr1 < addr2) 3722 return -1; 3723 3724 /* If r2->len == 0, match the exact address. If r2->len != 0, 3725 * accept any overlapping write. Any order is acceptable for 3726 * overlapping ranges, because kvm_io_bus_get_first_dev ensures 3727 * we process all of them. 3728 */ 3729 if (r2->len) { 3730 addr1 += r1->len; 3731 addr2 += r2->len; 3732 } 3733 3734 if (addr1 > addr2) 3735 return 1; 3736 3737 return 0; 3738 } 3739 3740 static int kvm_io_bus_sort_cmp(const void *p1, const void *p2) 3741 { 3742 return kvm_io_bus_cmp(p1, p2); 3743 } 3744 3745 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, 3746 gpa_t addr, int len) 3747 { 3748 struct kvm_io_range *range, key; 3749 int off; 3750 3751 key = (struct kvm_io_range) { 3752 .addr = addr, 3753 .len = len, 3754 }; 3755 3756 range = bsearch(&key, bus->range, bus->dev_count, 3757 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp); 3758 if (range == NULL) 3759 return -ENOENT; 3760 3761 off = range - bus->range; 3762 3763 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0) 3764 off--; 3765 3766 return off; 3767 } 3768 3769 static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 3770 struct kvm_io_range *range, const void *val) 3771 { 3772 int idx; 3773 3774 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 3775 if (idx < 0) 3776 return -EOPNOTSUPP; 3777 3778 while (idx < bus->dev_count && 3779 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 3780 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr, 3781 range->len, val)) 3782 return idx; 3783 idx++; 3784 } 3785 3786 return -EOPNOTSUPP; 3787 } 3788 3789 /* kvm_io_bus_write - called under kvm->slots_lock */ 3790 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 3791 int len, const void *val) 3792 { 3793 struct kvm_io_bus *bus; 3794 struct kvm_io_range range; 3795 int r; 3796 3797 range = (struct kvm_io_range) { 3798 .addr = addr, 3799 .len = len, 3800 }; 3801 3802 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 3803 if (!bus) 3804 return -ENOMEM; 3805 r = __kvm_io_bus_write(vcpu, bus, &range, val); 3806 return r < 0 ? 
r : 0; 3807 } 3808 EXPORT_SYMBOL_GPL(kvm_io_bus_write); 3809 3810 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */ 3811 int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, 3812 gpa_t addr, int len, const void *val, long cookie) 3813 { 3814 struct kvm_io_bus *bus; 3815 struct kvm_io_range range; 3816 3817 range = (struct kvm_io_range) { 3818 .addr = addr, 3819 .len = len, 3820 }; 3821 3822 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 3823 if (!bus) 3824 return -ENOMEM; 3825 3826 /* First try the device referenced by cookie. */ 3827 if ((cookie >= 0) && (cookie < bus->dev_count) && 3828 (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0)) 3829 if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len, 3830 val)) 3831 return cookie; 3832 3833 /* 3834 * cookie contained garbage; fall back to search and return the 3835 * correct cookie value. 3836 */ 3837 return __kvm_io_bus_write(vcpu, bus, &range, val); 3838 } 3839 3840 static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 3841 struct kvm_io_range *range, void *val) 3842 { 3843 int idx; 3844 3845 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 3846 if (idx < 0) 3847 return -EOPNOTSUPP; 3848 3849 while (idx < bus->dev_count && 3850 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 3851 if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr, 3852 range->len, val)) 3853 return idx; 3854 idx++; 3855 } 3856 3857 return -EOPNOTSUPP; 3858 } 3859 3860 /* kvm_io_bus_read - called under kvm->slots_lock */ 3861 int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 3862 int len, void *val) 3863 { 3864 struct kvm_io_bus *bus; 3865 struct kvm_io_range range; 3866 int r; 3867 3868 range = (struct kvm_io_range) { 3869 .addr = addr, 3870 .len = len, 3871 }; 3872 3873 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 3874 if (!bus) 3875 return -ENOMEM; 3876 r = __kvm_io_bus_read(vcpu, bus, &range, val); 3877 return r < 0 ? r : 0; 3878 } 3879 3880 /* Caller must hold slots_lock. */ 3881 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, 3882 int len, struct kvm_io_device *dev) 3883 { 3884 int i; 3885 struct kvm_io_bus *new_bus, *bus; 3886 struct kvm_io_range range; 3887 3888 bus = kvm_get_bus(kvm, bus_idx); 3889 if (!bus) 3890 return -ENOMEM; 3891 3892 /* exclude ioeventfd which is limited by maximum fd */ 3893 if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1) 3894 return -ENOSPC; 3895 3896 new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1), 3897 GFP_KERNEL_ACCOUNT); 3898 if (!new_bus) 3899 return -ENOMEM; 3900 3901 range = (struct kvm_io_range) { 3902 .addr = addr, 3903 .len = len, 3904 .dev = dev, 3905 }; 3906 3907 for (i = 0; i < bus->dev_count; i++) 3908 if (kvm_io_bus_cmp(&bus->range[i], &range) > 0) 3909 break; 3910 3911 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); 3912 new_bus->dev_count++; 3913 new_bus->range[i] = range; 3914 memcpy(new_bus->range + i + 1, bus->range + i, 3915 (bus->dev_count - i) * sizeof(struct kvm_io_range)); 3916 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 3917 synchronize_srcu_expedited(&kvm->srcu); 3918 kfree(bus); 3919 3920 return 0; 3921 } 3922 3923 /* Caller must hold slots_lock. 
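 * If allocating the shrunk replacement bus fails, the bus is removed
 * entirely rather than being left with a stale device entry.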
*/ 3924 void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, 3925 struct kvm_io_device *dev) 3926 { 3927 int i; 3928 struct kvm_io_bus *new_bus, *bus; 3929 3930 bus = kvm_get_bus(kvm, bus_idx); 3931 if (!bus) 3932 return; 3933 3934 for (i = 0; i < bus->dev_count; i++) 3935 if (bus->range[i].dev == dev) { 3936 break; 3937 } 3938 3939 if (i == bus->dev_count) 3940 return; 3941 3942 new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1), 3943 GFP_KERNEL_ACCOUNT); 3944 if (!new_bus) { 3945 pr_err("kvm: failed to shrink bus, removing it completely\n"); 3946 goto broken; 3947 } 3948 3949 memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range)); 3950 new_bus->dev_count--; 3951 memcpy(new_bus->range + i, bus->range + i + 1, 3952 (new_bus->dev_count - i) * sizeof(struct kvm_io_range)); 3953 3954 broken: 3955 rcu_assign_pointer(kvm->buses[bus_idx], new_bus); 3956 synchronize_srcu_expedited(&kvm->srcu); 3957 kfree(bus); 3958 return; 3959 } 3960 3961 struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, 3962 gpa_t addr) 3963 { 3964 struct kvm_io_bus *bus; 3965 int dev_idx, srcu_idx; 3966 struct kvm_io_device *iodev = NULL; 3967 3968 srcu_idx = srcu_read_lock(&kvm->srcu); 3969 3970 bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu); 3971 if (!bus) 3972 goto out_unlock; 3973 3974 dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1); 3975 if (dev_idx < 0) 3976 goto out_unlock; 3977 3978 iodev = bus->range[dev_idx].dev; 3979 3980 out_unlock: 3981 srcu_read_unlock(&kvm->srcu, srcu_idx); 3982 3983 return iodev; 3984 } 3985 EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev); 3986 3987 static int kvm_debugfs_open(struct inode *inode, struct file *file, 3988 int (*get)(void *, u64 *), int (*set)(void *, u64), 3989 const char *fmt) 3990 { 3991 struct kvm_stat_data *stat_data = (struct kvm_stat_data *) 3992 inode->i_private; 3993 3994 /* The debugfs files are a reference to the kvm struct which 3995 * is still valid when kvm_destroy_vm is called. 3996 * To avoid the race between open and the removal of the debugfs 3997 * directory we test against the users count. 3998 */ 3999 if (!refcount_inc_not_zero(&stat_data->kvm->users_count)) 4000 return -ENOENT; 4001 4002 if (simple_attr_open(inode, file, get, 4003 stat_data->mode & S_IWUGO ? 
set : NULL, 4004 fmt)) { 4005 kvm_put_kvm(stat_data->kvm); 4006 return -ENOMEM; 4007 } 4008 4009 return 0; 4010 } 4011 4012 static int kvm_debugfs_release(struct inode *inode, struct file *file) 4013 { 4014 struct kvm_stat_data *stat_data = (struct kvm_stat_data *) 4015 inode->i_private; 4016 4017 simple_attr_release(inode, file); 4018 kvm_put_kvm(stat_data->kvm); 4019 4020 return 0; 4021 } 4022 4023 static int vm_stat_get_per_vm(void *data, u64 *val) 4024 { 4025 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 4026 4027 *val = *(ulong *)((void *)stat_data->kvm + stat_data->offset); 4028 4029 return 0; 4030 } 4031 4032 static int vm_stat_clear_per_vm(void *data, u64 val) 4033 { 4034 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 4035 4036 if (val) 4037 return -EINVAL; 4038 4039 *(ulong *)((void *)stat_data->kvm + stat_data->offset) = 0; 4040 4041 return 0; 4042 } 4043 4044 static int vm_stat_get_per_vm_open(struct inode *inode, struct file *file) 4045 { 4046 __simple_attr_check_format("%llu\n", 0ull); 4047 return kvm_debugfs_open(inode, file, vm_stat_get_per_vm, 4048 vm_stat_clear_per_vm, "%llu\n"); 4049 } 4050 4051 static const struct file_operations vm_stat_get_per_vm_fops = { 4052 .owner = THIS_MODULE, 4053 .open = vm_stat_get_per_vm_open, 4054 .release = kvm_debugfs_release, 4055 .read = simple_attr_read, 4056 .write = simple_attr_write, 4057 .llseek = no_llseek, 4058 }; 4059 4060 static int vcpu_stat_get_per_vm(void *data, u64 *val) 4061 { 4062 int i; 4063 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 4064 struct kvm_vcpu *vcpu; 4065 4066 *val = 0; 4067 4068 kvm_for_each_vcpu(i, vcpu, stat_data->kvm) 4069 *val += *(u64 *)((void *)vcpu + stat_data->offset); 4070 4071 return 0; 4072 } 4073 4074 static int vcpu_stat_clear_per_vm(void *data, u64 val) 4075 { 4076 int i; 4077 struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data; 4078 struct kvm_vcpu *vcpu; 4079 4080 if (val) 4081 return -EINVAL; 4082 4083 kvm_for_each_vcpu(i, vcpu, stat_data->kvm) 4084 *(u64 *)((void *)vcpu + stat_data->offset) = 0; 4085 4086 return 0; 4087 } 4088 4089 static int vcpu_stat_get_per_vm_open(struct inode *inode, struct file *file) 4090 { 4091 __simple_attr_check_format("%llu\n", 0ull); 4092 return kvm_debugfs_open(inode, file, vcpu_stat_get_per_vm, 4093 vcpu_stat_clear_per_vm, "%llu\n"); 4094 } 4095 4096 static const struct file_operations vcpu_stat_get_per_vm_fops = { 4097 .owner = THIS_MODULE, 4098 .open = vcpu_stat_get_per_vm_open, 4099 .release = kvm_debugfs_release, 4100 .read = simple_attr_read, 4101 .write = simple_attr_write, 4102 .llseek = no_llseek, 4103 }; 4104 4105 static const struct file_operations *stat_fops_per_vm[] = { 4106 [KVM_STAT_VCPU] = &vcpu_stat_get_per_vm_fops, 4107 [KVM_STAT_VM] = &vm_stat_get_per_vm_fops, 4108 }; 4109 4110 static int vm_stat_get(void *_offset, u64 *val) 4111 { 4112 unsigned offset = (long)_offset; 4113 struct kvm *kvm; 4114 struct kvm_stat_data stat_tmp = {.offset = offset}; 4115 u64 tmp_val; 4116 4117 *val = 0; 4118 mutex_lock(&kvm_lock); 4119 list_for_each_entry(kvm, &vm_list, vm_list) { 4120 stat_tmp.kvm = kvm; 4121 vm_stat_get_per_vm((void *)&stat_tmp, &tmp_val); 4122 *val += tmp_val; 4123 } 4124 mutex_unlock(&kvm_lock); 4125 return 0; 4126 } 4127 4128 static int vm_stat_clear(void *_offset, u64 val) 4129 { 4130 unsigned offset = (long)_offset; 4131 struct kvm *kvm; 4132 struct kvm_stat_data stat_tmp = {.offset = offset}; 4133 4134 if (val) 4135 return -EINVAL; 4136 4137 mutex_lock(&kvm_lock); 4138 
list_for_each_entry(kvm, &vm_list, vm_list) { 4139 stat_tmp.kvm = kvm; 4140 vm_stat_clear_per_vm((void *)&stat_tmp, 0); 4141 } 4142 mutex_unlock(&kvm_lock); 4143 4144 return 0; 4145 } 4146 4147 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n"); 4148 4149 static int vcpu_stat_get(void *_offset, u64 *val) 4150 { 4151 unsigned offset = (long)_offset; 4152 struct kvm *kvm; 4153 struct kvm_stat_data stat_tmp = {.offset = offset}; 4154 u64 tmp_val; 4155 4156 *val = 0; 4157 mutex_lock(&kvm_lock); 4158 list_for_each_entry(kvm, &vm_list, vm_list) { 4159 stat_tmp.kvm = kvm; 4160 vcpu_stat_get_per_vm((void *)&stat_tmp, &tmp_val); 4161 *val += tmp_val; 4162 } 4163 mutex_unlock(&kvm_lock); 4164 return 0; 4165 } 4166 4167 static int vcpu_stat_clear(void *_offset, u64 val) 4168 { 4169 unsigned offset = (long)_offset; 4170 struct kvm *kvm; 4171 struct kvm_stat_data stat_tmp = {.offset = offset}; 4172 4173 if (val) 4174 return -EINVAL; 4175 4176 mutex_lock(&kvm_lock); 4177 list_for_each_entry(kvm, &vm_list, vm_list) { 4178 stat_tmp.kvm = kvm; 4179 vcpu_stat_clear_per_vm((void *)&stat_tmp, 0); 4180 } 4181 mutex_unlock(&kvm_lock); 4182 4183 return 0; 4184 } 4185 4186 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear, 4187 "%llu\n"); 4188 4189 static const struct file_operations *stat_fops[] = { 4190 [KVM_STAT_VCPU] = &vcpu_stat_fops, 4191 [KVM_STAT_VM] = &vm_stat_fops, 4192 }; 4193 4194 static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm) 4195 { 4196 struct kobj_uevent_env *env; 4197 unsigned long long created, active; 4198 4199 if (!kvm_dev.this_device || !kvm) 4200 return; 4201 4202 mutex_lock(&kvm_lock); 4203 if (type == KVM_EVENT_CREATE_VM) { 4204 kvm_createvm_count++; 4205 kvm_active_vms++; 4206 } else if (type == KVM_EVENT_DESTROY_VM) { 4207 kvm_active_vms--; 4208 } 4209 created = kvm_createvm_count; 4210 active = kvm_active_vms; 4211 mutex_unlock(&kvm_lock); 4212 4213 env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT); 4214 if (!env) 4215 return; 4216 4217 add_uevent_var(env, "CREATED=%llu", created); 4218 add_uevent_var(env, "COUNT=%llu", active); 4219 4220 if (type == KVM_EVENT_CREATE_VM) { 4221 add_uevent_var(env, "EVENT=create"); 4222 kvm->userspace_pid = task_pid_nr(current); 4223 } else if (type == KVM_EVENT_DESTROY_VM) { 4224 add_uevent_var(env, "EVENT=destroy"); 4225 } 4226 add_uevent_var(env, "PID=%d", kvm->userspace_pid); 4227 4228 if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) { 4229 char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT); 4230 4231 if (p) { 4232 tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX); 4233 if (!IS_ERR(tmp)) 4234 add_uevent_var(env, "STATS_PATH=%s", tmp); 4235 kfree(p); 4236 } 4237 } 4238 /* no need for checks, since we are adding at most only 5 keys */ 4239 env->envp[env->envp_idx++] = NULL; 4240 kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp); 4241 kfree(env); 4242 } 4243 4244 static void kvm_init_debug(void) 4245 { 4246 struct kvm_stats_debugfs_item *p; 4247 4248 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); 4249 4250 kvm_debugfs_num_entries = 0; 4251 for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) { 4252 int mode = p->mode ? 
p->mode : 0644; 4253 debugfs_create_file(p->name, mode, kvm_debugfs_dir, 4254 (void *)(long)p->offset, 4255 stat_fops[p->kind]); 4256 } 4257 } 4258 4259 static int kvm_suspend(void) 4260 { 4261 if (kvm_usage_count) 4262 hardware_disable_nolock(NULL); 4263 return 0; 4264 } 4265 4266 static void kvm_resume(void) 4267 { 4268 if (kvm_usage_count) { 4269 #ifdef CONFIG_LOCKDEP 4270 WARN_ON(lockdep_is_held(&kvm_count_lock)); 4271 #endif 4272 hardware_enable_nolock(NULL); 4273 } 4274 } 4275 4276 static struct syscore_ops kvm_syscore_ops = { 4277 .suspend = kvm_suspend, 4278 .resume = kvm_resume, 4279 }; 4280 4281 static inline 4282 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) 4283 { 4284 return container_of(pn, struct kvm_vcpu, preempt_notifier); 4285 } 4286 4287 static void kvm_sched_in(struct preempt_notifier *pn, int cpu) 4288 { 4289 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 4290 4291 WRITE_ONCE(vcpu->preempted, false); 4292 WRITE_ONCE(vcpu->ready, false); 4293 4294 kvm_arch_sched_in(vcpu, cpu); 4295 4296 kvm_arch_vcpu_load(vcpu, cpu); 4297 } 4298 4299 static void kvm_sched_out(struct preempt_notifier *pn, 4300 struct task_struct *next) 4301 { 4302 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 4303 4304 if (current->state == TASK_RUNNING) { 4305 WRITE_ONCE(vcpu->preempted, true); 4306 WRITE_ONCE(vcpu->ready, true); 4307 } 4308 kvm_arch_vcpu_put(vcpu); 4309 } 4310 4311 static void check_processor_compat(void *rtn) 4312 { 4313 *(int *)rtn = kvm_arch_check_processor_compat(); 4314 } 4315 4316 int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, 4317 struct module *module) 4318 { 4319 int r; 4320 int cpu; 4321 4322 r = kvm_arch_init(opaque); 4323 if (r) 4324 goto out_fail; 4325 4326 /* 4327 * kvm_arch_init makes sure there's at most one caller 4328 * for architectures that support multiple implementations, 4329 * like intel and amd on x86. 4330 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating 4331 * conflicts in case kvm is already setup for another implementation. 4332 */ 4333 r = kvm_irqfd_init(); 4334 if (r) 4335 goto out_irqfd; 4336 4337 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { 4338 r = -ENOMEM; 4339 goto out_free_0; 4340 } 4341 4342 r = kvm_arch_hardware_setup(); 4343 if (r < 0) 4344 goto out_free_0a; 4345 4346 for_each_online_cpu(cpu) { 4347 smp_call_function_single(cpu, check_processor_compat, &r, 1); 4348 if (r < 0) 4349 goto out_free_1; 4350 } 4351 4352 r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting", 4353 kvm_starting_cpu, kvm_dying_cpu); 4354 if (r) 4355 goto out_free_2; 4356 register_reboot_notifier(&kvm_reboot_notifier); 4357 4358 /* A kmem cache lets us meet the alignment requirements of fx_save. 
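 * Only the arch-specific part of struct kvm_vcpu is whitelisted for
 * usercopy, and if no alignment is requested the natural alignment of
 * struct kvm_vcpu is used.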

static void check_processor_compat(void *rtn)
{
	*(int *)rtn = kvm_arch_check_processor_compat();
}

int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
	     struct module *module)
{
	int r;
	int cpu;

	r = kvm_arch_init(opaque);
	if (r)
		goto out_fail;

	/*
	 * kvm_arch_init makes sure there's at most one caller
	 * for architectures that support multiple implementations,
	 * like intel and amd on x86.
	 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
	 * conflicts in case kvm is already setup for another implementation.
	 */
	r = kvm_irqfd_init();
	if (r)
		goto out_irqfd;

	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
		r = -ENOMEM;
		goto out_free_0;
	}

	r = kvm_arch_hardware_setup();
	if (r < 0)
		goto out_free_0a;

	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu, check_processor_compat, &r, 1);
		if (r < 0)
			goto out_free_1;
	}

	r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
				      kvm_starting_cpu, kvm_dying_cpu);
	if (r)
		goto out_free_2;
	register_reboot_notifier(&kvm_reboot_notifier);

	/* A kmem cache lets us meet the alignment requirements of fx_save. */
	if (!vcpu_align)
		vcpu_align = __alignof__(struct kvm_vcpu);
	kvm_vcpu_cache =
		kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
					   SLAB_ACCOUNT,
					   offsetof(struct kvm_vcpu, arch),
					   sizeof_field(struct kvm_vcpu, arch),
					   NULL);
	if (!kvm_vcpu_cache) {
		r = -ENOMEM;
		goto out_free_3;
	}

	r = kvm_async_pf_init();
	if (r)
		goto out_free;

	kvm_chardev_ops.owner = module;
	kvm_vm_fops.owner = module;
	kvm_vcpu_fops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
		pr_err("kvm: misc device register failed\n");
		goto out_unreg;
	}

	register_syscore_ops(&kvm_syscore_ops);

	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

	kvm_init_debug();

	r = kvm_vfio_ops_init();
	WARN_ON(r);

	return 0;

out_unreg:
	kvm_async_pf_deinit();
out_free:
	kmem_cache_destroy(kvm_vcpu_cache);
out_free_3:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
out_free_2:
out_free_1:
	kvm_arch_hardware_unsetup();
out_free_0a:
	free_cpumask_var(cpus_hardware_enabled);
out_free_0:
	kvm_irqfd_exit();
out_irqfd:
	kvm_arch_exit();
out_fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

void kvm_exit(void)
{
	debugfs_remove_recursive(kvm_debugfs_dir);
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	kvm_async_pf_deinit();
	unregister_syscore_ops(&kvm_syscore_ops);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
	on_each_cpu(hardware_disable_nolock, NULL, 1);
	kvm_arch_hardware_unsetup();
	kvm_arch_exit();
	kvm_irqfd_exit();
	free_cpumask_var(cpus_hardware_enabled);
	kvm_vfio_ops_exit();
}
EXPORT_SYMBOL_GPL(kvm_exit);

struct kvm_vm_worker_thread_context {
	struct kvm *kvm;
	struct task_struct *parent;
	struct completion init_done;
	kvm_vm_thread_fn_t thread_fn;
	uintptr_t data;
	int err;
};
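
/*
 * Rough lifecycle sketch for the worker machinery below (illustration
 * only; the unpark step is an assumption about the eventual caller and
 * is not shown in this hunk):
 *
 *   1. kvm_vm_create_worker_thread() fills an init_context on its own
 *      stack and spawns the worker with kthread_run().
 *   2. The worker copies what it needs, parks itself and signals
 *      init_done; the creator returns once the completion fires.
 *   3. A later kthread_unpark() from the caller lets kthread_parkme()
 *      return, at which point thread_fn(kvm, data) finally runs.
 */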

static int kvm_vm_worker_thread(void *context)
{
	/*
	 * The init_context is allocated on the stack of the parent thread, so
	 * we have to locally copy anything that is needed beyond initialization
	 */
	struct kvm_vm_worker_thread_context *init_context = context;
	struct kvm *kvm = init_context->kvm;
	kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
	uintptr_t data = init_context->data;
	int err;

	err = kthread_park(current);
	/* kthread_park(current) is never supposed to return an error */
	WARN_ON(err != 0);
	if (err)
		goto init_complete;

	err = cgroup_attach_task_all(init_context->parent, current);
	if (err) {
		kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
			__func__, err);
		goto init_complete;
	}

	set_user_nice(current, task_nice(init_context->parent));

init_complete:
	init_context->err = err;
	complete(&init_context->init_done);
	init_context = NULL;

	if (err)
		return err;

	/* Wait to be woken up by the spawner before proceeding. */
	kthread_parkme();

	if (!kthread_should_stop())
		err = thread_fn(kvm, data);

	return err;
}

int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
				uintptr_t data, const char *name,
				struct task_struct **thread_ptr)
{
	struct kvm_vm_worker_thread_context init_context = {};
	struct task_struct *thread;

	*thread_ptr = NULL;
	init_context.kvm = kvm;
	init_context.parent = current;
	init_context.thread_fn = thread_fn;
	init_context.data = data;
	init_completion(&init_context.init_done);

	thread = kthread_run(kvm_vm_worker_thread, &init_context,
			     "%s-%d", name, task_pid_nr(current));
	if (IS_ERR(thread))
		return PTR_ERR(thread);

	/* kthread_run is never supposed to return NULL */
	WARN_ON(thread == NULL);

	wait_for_completion(&init_context.init_done);

	if (!init_context.err)
		*thread_ptr = thread;

	return init_context.err;
}
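
/*
 * Hypothetical usage sketch (the names below are invented for illustration
 * and do not exist in this file; real callers live in architecture code):
 *
 *	static int my_vm_work_fn(struct kvm *kvm, uintptr_t data)
 *	{
 *		// long-running, per-VM background work
 *		return 0;
 *	}
 *
 *	err = kvm_vm_create_worker_thread(kvm, my_vm_work_fn, 0,
 *					  "kvm-vm-worker", &worker_task);
 *	if (!err)
 *		kthread_unpark(worker_task);
 *
 * The worker stays parked until the caller unparks it, matching the
 * handshake implemented above.
 */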