// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 */

#include <kvm/iodev.h>

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/sched/stat.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>
#include <linux/io.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>

#include <asm/processor.h>
#include <asm/ioctl.h>
#include <linux/uaccess.h>

#include "coalesced_mmio.h"
#include "async_pf.h"
#include "vfio.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

/* Worst case buffer size needed for holding an integer. */
#define ITOA_MAX_LEN 12

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/* Architectures should define their poll value according to the halt latency */
unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns);

/* Default doubles per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow);

/* The start value to grow halt_poll_ns from */
unsigned int halt_poll_ns_grow_start = 10000; /* 10us */
module_param(halt_poll_ns_grow_start, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);

/* Default resets per-vcpu halt_poll_ns. */
unsigned int halt_poll_ns_shrink;
module_param(halt_poll_ns_shrink, uint, 0644);
EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
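
/*
 * Note: the four knobs above tune adaptive halt polling (see the
 * halt-polling notes under Documentation/virt/kvm/).  halt_poll_ns also
 * seeds each VM's max_halt_poll_ns cap in kvm_create_vm().  Roughly, a
 * wakeup that arrives within the polling window grows the per-vCPU
 * halt_poll_ns (multiplying by halt_poll_ns_grow, starting from
 * halt_poll_ns_grow_start), while unsuccessful polling shrinks it by
 * dividing by halt_poll_ns_shrink, the default of 0 meaning "reset".
 */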

/*
 * Ordering of locks:
 *
 *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */

DEFINE_MUTEX(kvm_lock);
static DEFINE_RAW_SPINLOCK(kvm_count_lock);
LIST_HEAD(vm_list);

static cpumask_var_t cpus_hardware_enabled;
static int kvm_usage_count;
static atomic_t hardware_enable_failed;

static struct kmem_cache *kvm_vcpu_cache;

static __read_mostly struct preempt_ops kvm_preempt_ops;
static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);

struct dentry *kvm_debugfs_dir;
EXPORT_SYMBOL_GPL(kvm_debugfs_dir);

static int kvm_debugfs_num_entries;
static const struct file_operations stat_fops_per_vm;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
#ifdef CONFIG_KVM_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				  unsigned long arg);
#define KVM_COMPAT(c)	.compat_ioctl	= (c)
#else
/*
 * For architectures that don't implement a compat infrastructure,
 * adopt a double line of defense:
 * - Prevent a compat task from opening /dev/kvm
 * - If the open has been done by a 64bit task, and the KVM fd
 *   passed to a compat task, let the ioctls fail.
 */
static long kvm_no_compat_ioctl(struct file *file, unsigned int ioctl,
				unsigned long arg) { return -EINVAL; }

static int kvm_no_compat_open(struct inode *inode, struct file *file)
{
	return is_compat_task() ? -ENODEV : 0;
}
#define KVM_COMPAT(c)	.compat_ioctl	= kvm_no_compat_ioctl,	\
			.open		= kvm_no_compat_open
#endif
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);

__visible bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);

#define KVM_EVENT_CREATE_VM 0
#define KVM_EVENT_DESTROY_VM 1
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms;

__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
						   unsigned long start, unsigned long end)
{
}

bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
{
	/*
	 * The metadata used by is_zone_device_page() to determine whether or
	 * not a page is ZONE_DEVICE is guaranteed to be valid if and only if
	 * the device has been pinned, e.g. by get_user_pages().  WARN if the
	 * page_count() is zero to help detect bad usage of this helper.
	 */
	if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn))))
		return false;

	return is_zone_device_page(pfn_to_page(pfn));
}

bool kvm_is_reserved_pfn(kvm_pfn_t pfn)
{
	/*
	 * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
	 * perspective they are "normal" pages, albeit with slightly different
	 * usage rules.
	 */
	if (pfn_valid(pfn))
		return PageReserved(pfn_to_page(pfn)) &&
		       !is_zero_pfn(pfn) &&
		       !kvm_is_zone_device_pfn(pfn);

	return true;
}

bool kvm_is_transparent_hugepage(kvm_pfn_t pfn)
{
	struct page *page = pfn_to_page(pfn);

	if (!PageTransCompoundMap(page))
		return false;

	return is_transparent_hugepage(compound_head(page));
}

/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu = get_cpu();

	__this_cpu_write(kvm_running_vcpu, vcpu);
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
}
EXPORT_SYMBOL_GPL(vcpu_load);

void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	__this_cpu_write(kvm_running_vcpu, NULL);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(vcpu_put);

/* TODO: merge with kvm_arch_vcpu_should_kick */
static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req)
{
	int mode = kvm_vcpu_exiting_guest_mode(vcpu);

	/*
	 * We need to wait for the VCPU to reenable interrupts and get out of
	 * READING_SHADOW_PAGE_TABLES mode.
	 */
	if (req & KVM_REQUEST_WAIT)
		return mode != OUTSIDE_GUEST_MODE;

	/*
	 * Need to kick a running VCPU, but otherwise there is nothing to do.
	 */
	return mode == IN_GUEST_MODE;
}

static void ack_flush(void *_completed)
{
}

static inline bool kvm_kick_many_cpus(const struct cpumask *cpus, bool wait)
{
	if (unlikely(!cpus))
		cpus = cpu_online_mask;

	if (cpumask_empty(cpus))
		return false;

	smp_call_function_many(cpus, ack_flush, NULL, wait);
	return true;
}

bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req,
				 struct kvm_vcpu *except,
				 unsigned long *vcpu_bitmap, cpumask_var_t tmp)
{
	int i, cpu, me;
	struct kvm_vcpu *vcpu;
	bool called;

	me = get_cpu();

	kvm_for_each_vcpu(i, vcpu, kvm) {
		if ((vcpu_bitmap && !test_bit(i, vcpu_bitmap)) ||
		    vcpu == except)
			continue;

		kvm_make_request(req, vcpu);
		cpu = vcpu->cpu;

		if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
			continue;

		if (tmp != NULL && cpu != -1 && cpu != me &&
		    kvm_request_needs_ipi(vcpu, req))
			__cpumask_set_cpu(cpu, tmp);
	}

	called = kvm_kick_many_cpus(tmp, !!(req & KVM_REQUEST_WAIT));
	put_cpu();

	return called;
}

bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req,
				      struct kvm_vcpu *except)
{
	cpumask_var_t cpus;
	bool called;

	zalloc_cpumask_var(&cpus, GFP_ATOMIC);

	called = kvm_make_vcpus_request_mask(kvm, req, except, NULL, cpus);

	free_cpumask_var(cpus);
	return called;
}

bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
	return kvm_make_all_cpus_request_except(kvm, req, NULL);
}
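
/*
 * The request machinery above is the generic way to poke vCPUs: a request
 * bit is set with kvm_make_request() and the target is either woken up or
 * IPI'd out of guest mode, depending on KVM_REQUEST_NO_WAKEUP/WAIT.  A
 * typical consumer in a vCPU run loop looks like the following sketch
 * (illustrative only, arch-specific handler names differ):
 *
 *	if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
 *		flush_the_guest_tlb(vcpu);
 *
 * kvm_check_request() clears the bit, so each kvm_make_request() is acted
 * on exactly once.
 */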

#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	/*
	 * Read tlbs_dirty before setting KVM_REQ_TLB_FLUSH in
	 * kvm_make_all_cpus_request.
	 */
	long dirty_count = smp_load_acquire(&kvm->tlbs_dirty);

	/*
	 * We want to publish modifications to the page tables before reading
	 * mode.  Pairs with a memory barrier in arch-specific code.
	 * - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
	 *   and smp_mb in walk_shadow_page_lockless_begin/end.
	 * - powerpc: smp_mb in kvmppc_prepare_to_enter.
	 *
	 * There is already an smp_mb__after_atomic() before
	 * kvm_make_all_cpus_request() reads vcpu->mode.  We reuse that
	 * barrier here.
	 */
	if (!kvm_arch_flush_remote_tlb(kvm)
	    || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		++kvm->stat.remote_tlb_flush;
	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
}
EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
#endif

void kvm_reload_remote_mmus(struct kvm *kvm)
{
	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
}

static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
	vcpu->pid = NULL;
	rcuwait_init(&vcpu->wait);
	kvm_async_pf_vcpu_init(vcpu);

	vcpu->pre_pcpu = -1;
	INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);

	kvm_vcpu_set_in_spin_loop(vcpu, false);
	kvm_vcpu_set_dy_eligible(vcpu, false);
	vcpu->preempted = false;
	vcpu->ready = false;
	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
}

void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_arch_vcpu_destroy(vcpu);

	/*
	 * No need for rcu_read_lock as VCPU_RUN is the only place that changes
	 * the vcpu->pid pointer, and at destruction time all file descriptors
	 * are already gone.
	 */
	put_pid(rcu_dereference_protected(vcpu->pid, 1));

	free_page((unsigned long)vcpu->run);
	kmem_cache_free(kvm_vcpu_cache, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_destroy);

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
	return container_of(mn, struct kvm, mmu_notifier);
}

static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long start, unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
	srcu_read_unlock(&kvm->srcu, idx);
}

static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long address,
					pte_t pte)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	kvm->mmu_notifier_seq++;

	if (kvm_set_spte_hva(kvm, address, pte))
		kvm_flush_remote_tlbs(kvm);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush = 0, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
	kvm->mmu_notifier_count++;
	need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end);
	need_tlb_flush |= kvm->tlbs_dirty;
	/* we have to flush the tlb before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return 0;
}

static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
					const struct mmu_notifier_range *range)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);

	spin_lock(&kvm->mmu_lock);
	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
	kvm->mmu_notifier_seq++;
	smp_wmb();
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease, which is ensured by the smp_wmb above
	 * in conjunction with the smp_rmb in mmu_notifier_retry().
	 */
	kvm->mmu_notifier_count--;
	spin_unlock(&kvm->mmu_lock);

	BUG_ON(kvm->mmu_notifier_count < 0);
}
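
/*
 * The seq/count pair above is what arch page fault code checks via
 * mmu_notifier_retry() (see kvm_host.h): the fault path samples
 * mmu_notifier_seq before translating, and the mapping is rejected and
 * retried if mmu_notifier_count is non-zero or the sequence has changed
 * since, so a stale spte is never installed over an invalidated range.
 */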

static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long start,
					      unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);

	young = kvm_age_hva(kvm, start, end);
	if (young)
		kvm_flush_remote_tlbs(kvm);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return young;
}

static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long start,
					unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	/*
	 * Even though we do not flush TLB, this will still adversely
	 * affect performance on pre-Haswell Intel EPT, where there is
	 * no EPT Access Bit to clear so that we have to tear down EPT
	 * tables instead.  If we find this unacceptable, we can always
	 * add a parameter to kvm_age_hva so that it effectively doesn't
	 * do anything on clear_young.
	 *
	 * Also note that currently we never issue secondary TLB flushes
	 * from clear_young, leaving this job up to the regular system
	 * cadence.  If we find this inaccurate, we might come up with a
	 * more sophisticated heuristic later.
	 */
	young = kvm_age_hva(kvm, start, end);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return young;
}

static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	young = kvm_test_age_hva(kvm, address);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return young;
}

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	kvm_arch_flush_shadow_all(kvm);
	srcu_read_unlock(&kvm->srcu, idx);
}

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_range	= kvm_mmu_notifier_invalidate_range,
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
	.clear_young		= kvm_mmu_notifier_clear_young,
	.test_young		= kvm_mmu_notifier_test_young,
	.change_pte		= kvm_mmu_notifier_change_pte,
	.release		= kvm_mmu_notifier_release,
};

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}

#else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	return 0;
}

#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */

static struct kvm_memslots *kvm_alloc_memslots(void)
{
	int i;
	struct kvm_memslots *slots;

	slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
	if (!slots)
		return NULL;

	for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
		slots->id_to_index[i] = -1;

	return slots;
}

static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	if (!memslot->dirty_bitmap)
		return;

	kvfree(memslot->dirty_bitmap);
	memslot->dirty_bitmap = NULL;
}

static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	kvm_destroy_dirty_bitmap(slot);

	kvm_arch_free_memslot(kvm, slot);

	slot->flags = 0;
	slot->npages = 0;
}

static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
{
	struct kvm_memory_slot *memslot;

	if (!slots)
		return;

	kvm_for_each_memslot(memslot, slots)
		kvm_free_memslot(kvm, memslot);

	kvfree(slots);
}

static void kvm_destroy_vm_debugfs(struct kvm *kvm)
{
	int i;

	if (!kvm->debugfs_dentry)
		return;

	debugfs_remove_recursive(kvm->debugfs_dentry);

	if (kvm->debugfs_stat_data) {
		for (i = 0; i < kvm_debugfs_num_entries; i++)
			kfree(kvm->debugfs_stat_data[i]);
		kfree(kvm->debugfs_stat_data);
	}
}

static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
{
	char dir_name[ITOA_MAX_LEN * 2];
	struct kvm_stat_data *stat_data;
	struct kvm_stats_debugfs_item *p;

	if (!debugfs_initialized())
		return 0;

	snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
	kvm->debugfs_dentry = debugfs_create_dir(dir_name, kvm_debugfs_dir);

	kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
					 sizeof(*kvm->debugfs_stat_data),
					 GFP_KERNEL_ACCOUNT);
	if (!kvm->debugfs_stat_data)
		return -ENOMEM;

	for (p = debugfs_entries; p->name; p++) {
		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
		if (!stat_data)
			return -ENOMEM;

		stat_data->kvm = kvm;
		stat_data->dbgfs_item = p;
		kvm->debugfs_stat_data[p - debugfs_entries] = stat_data;
		debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
				    kvm->debugfs_dentry, stat_data,
				    &stat_fops_per_vm);
	}
	return 0;
}

/*
 * Called after the VM is otherwise initialized, but just before adding it to
 * the vm_list.
 */
int __weak kvm_arch_post_init_vm(struct kvm *kvm)
{
	return 0;
}

/*
 * Called just after removing the VM from the vm_list, but before doing any
 * other destruction.
 */
void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
{
}

static struct kvm *kvm_create_vm(unsigned long type)
{
	struct kvm *kvm = kvm_arch_alloc_vm();
	int r = -ENOMEM;
	int i;

	if (!kvm)
		return ERR_PTR(-ENOMEM);

	spin_lock_init(&kvm->mmu_lock);
	mmgrab(current->mm);
	kvm->mm = current->mm;
	kvm_eventfd_init(kvm);
	mutex_init(&kvm->lock);
	mutex_init(&kvm->irq_lock);
	mutex_init(&kvm->slots_lock);
	INIT_LIST_HEAD(&kvm->devices);

	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);

	if (init_srcu_struct(&kvm->srcu))
		goto out_err_no_srcu;
	if (init_srcu_struct(&kvm->irq_srcu))
		goto out_err_no_irq_srcu;

	refcount_set(&kvm->users_count, 1);
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		struct kvm_memslots *slots = kvm_alloc_memslots();

		if (!slots)
			goto out_err_no_arch_destroy_vm;
		/* Generations must be different for each address space. */
		slots->generation = i;
		rcu_assign_pointer(kvm->memslots[i], slots);
	}

	for (i = 0; i < KVM_NR_BUSES; i++) {
		rcu_assign_pointer(kvm->buses[i],
			kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
		if (!kvm->buses[i])
			goto out_err_no_arch_destroy_vm;
	}

	kvm->max_halt_poll_ns = halt_poll_ns;

	r = kvm_arch_init_vm(kvm, type);
	if (r)
		goto out_err_no_arch_destroy_vm;

	r = hardware_enable_all();
	if (r)
		goto out_err_no_disable;

#ifdef CONFIG_HAVE_KVM_IRQFD
	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif

	r = kvm_init_mmu_notifier(kvm);
	if (r)
		goto out_err_no_mmu_notifier;

	r = kvm_arch_post_init_vm(kvm);
	if (r)
		goto out_err;

	mutex_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	mutex_unlock(&kvm_lock);

	preempt_notifier_inc();

	return kvm;

out_err:
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	if (kvm->mmu_notifier.ops)
		mmu_notifier_unregister(&kvm->mmu_notifier, current->mm);
#endif
out_err_no_mmu_notifier:
	hardware_disable_all();
out_err_no_disable:
	kvm_arch_destroy_vm(kvm);
out_err_no_arch_destroy_vm:
	WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
	for (i = 0; i < KVM_NR_BUSES; i++)
		kfree(kvm_get_bus(kvm, i));
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
	cleanup_srcu_struct(&kvm->irq_srcu);
out_err_no_irq_srcu:
	cleanup_srcu_struct(&kvm->srcu);
out_err_no_srcu:
	kvm_arch_free_vm(kvm);
	mmdrop(current->mm);
	return ERR_PTR(r);
}

static void kvm_destroy_devices(struct kvm *kvm)
{
	struct kvm_device *dev, *tmp;

	/*
	 * We do not need to take the kvm->lock here, because nobody else
	 * has a reference to the struct kvm at this point and therefore
	 * cannot access the devices list anyhow.
	 */
	list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
		list_del(&dev->vm_node);
		dev->ops->destroy(dev);
	}
}

static void kvm_destroy_vm(struct kvm *kvm)
{
	int i;
	struct mm_struct *mm = kvm->mm;

	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
	kvm_destroy_vm_debugfs(kvm);
	kvm_arch_sync_events(kvm);
	mutex_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	mutex_unlock(&kvm_lock);
	kvm_arch_pre_destroy_vm(kvm);

	kvm_free_irq_routing(kvm);
	for (i = 0; i < KVM_NR_BUSES; i++) {
		struct kvm_io_bus *bus = kvm_get_bus(kvm, i);

		if (bus)
			kvm_io_bus_destroy(bus);
		kvm->buses[i] = NULL;
	}
	kvm_coalesced_mmio_free(kvm);
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
#else
	kvm_arch_flush_shadow_all(kvm);
#endif
	kvm_arch_destroy_vm(kvm);
	kvm_destroy_devices(kvm);
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		kvm_free_memslots(kvm, __kvm_memslots(kvm, i));
	cleanup_srcu_struct(&kvm->irq_srcu);
	cleanup_srcu_struct(&kvm->srcu);
	kvm_arch_free_vm(kvm);
	preempt_notifier_dec();
	hardware_disable_all();
	mmdrop(mm);
}

void kvm_get_kvm(struct kvm *kvm)
{
	refcount_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);

void kvm_put_kvm(struct kvm *kvm)
{
	if (refcount_dec_and_test(&kvm->users_count))
		kvm_destroy_vm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);

/*
 * Used to put a reference that was taken on behalf of an object associated
 * with a user-visible file descriptor, e.g. a vcpu or device, if installation
 * of the new file descriptor fails and the reference cannot be transferred to
 * its final owner.  In such cases, the caller is still actively using @kvm and
 * will fail miserably if the refcount unexpectedly hits zero.
 */
void kvm_put_kvm_no_destroy(struct kvm *kvm)
{
	WARN_ON(refcount_dec_and_test(&kvm->users_count));
}
EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);

static int kvm_vm_release(struct inode *inode, struct file *filp)
{
	struct kvm *kvm = filp->private_data;

	kvm_irqfd_release(kvm);

	kvm_put_kvm(kvm);
	return 0;
}

/*
 * Allocation size is twice as large as the actual dirty bitmap size.
 * See kvm_vm_ioctl_get_dirty_log() why this is needed.
 */
static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);

	memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT);
	if (!memslot->dirty_bitmap)
		return -ENOMEM;

	return 0;
}
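
/*
 * Memslot array layout, for the helpers below (see also the comment above
 * update_memslots()): slots->memslots[] holds the used_slots active slots
 * sorted by base_gfn in descending order, slots->id_to_index[] maps a slot
 * id to its current position in that array (-1 if the id is unused), and
 * the second half of each dirty_bitmap allocation is the scratch buffer
 * used by kvm_get_dirty_log_protect() and kvm_clear_dirty_log_protect().
 */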

/*
 * Delete a memslot by decrementing the number of used slots and shifting all
 * other entries in the array forward one spot.
 */
static inline void kvm_memslot_delete(struct kvm_memslots *slots,
				      struct kvm_memory_slot *memslot)
{
	struct kvm_memory_slot *mslots = slots->memslots;
	int i;

	if (WARN_ON(slots->id_to_index[memslot->id] == -1))
		return;

	slots->used_slots--;

	if (atomic_read(&slots->lru_slot) >= slots->used_slots)
		atomic_set(&slots->lru_slot, 0);

	for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) {
		mslots[i] = mslots[i + 1];
		slots->id_to_index[mslots[i].id] = i;
	}
	mslots[i] = *memslot;
	slots->id_to_index[memslot->id] = -1;
}

/*
 * "Insert" a new memslot by incrementing the number of used slots.  Returns
 * the new slot's initial index into the memslots array.
 */
static inline int kvm_memslot_insert_back(struct kvm_memslots *slots)
{
	return slots->used_slots++;
}

/*
 * Move a changed memslot backwards in the array by shifting existing slots
 * with a higher GFN toward the front of the array.  Note, the changed memslot
 * itself is not preserved in the array, i.e. not swapped at this time, only
 * its new index into the array is tracked.  Returns the changed memslot's
 * current index into the memslots array.
 */
static inline int kvm_memslot_move_backward(struct kvm_memslots *slots,
					    struct kvm_memory_slot *memslot)
{
	struct kvm_memory_slot *mslots = slots->memslots;
	int i;

	if (WARN_ON_ONCE(slots->id_to_index[memslot->id] == -1) ||
	    WARN_ON_ONCE(!slots->used_slots))
		return -1;

	/*
	 * Move the target memslot backward in the array by shifting existing
	 * memslots with a higher GFN (than the target memslot) towards the
	 * front of the array.
	 */
	for (i = slots->id_to_index[memslot->id]; i < slots->used_slots - 1; i++) {
		if (memslot->base_gfn > mslots[i + 1].base_gfn)
			break;

		WARN_ON_ONCE(memslot->base_gfn == mslots[i + 1].base_gfn);

		/* Shift the next memslot forward one and update its index. */
		mslots[i] = mslots[i + 1];
		slots->id_to_index[mslots[i].id] = i;
	}
	return i;
}

/*
 * Move a changed memslot forwards in the array by shifting existing slots with
 * a lower GFN toward the back of the array.  Note, the changed memslot itself
 * is not preserved in the array, i.e. not swapped at this time, only its new
 * index into the array is tracked.  Returns the changed memslot's final index
 * into the memslots array.
 */
static inline int kvm_memslot_move_forward(struct kvm_memslots *slots,
					   struct kvm_memory_slot *memslot,
					   int start)
{
	struct kvm_memory_slot *mslots = slots->memslots;
	int i;

	for (i = start; i > 0; i--) {
		if (memslot->base_gfn < mslots[i - 1].base_gfn)
			break;

		WARN_ON_ONCE(memslot->base_gfn == mslots[i - 1].base_gfn);

		/* Shift the next memslot back one and update its index. */
		mslots[i] = mslots[i - 1];
		slots->id_to_index[mslots[i].id] = i;
	}
	return i;
}

/*
 * Re-sort memslots based on their GFN to account for an added, deleted, or
 * moved memslot.  Sorting memslots by GFN allows using a binary search during
 * memslot lookup.
 *
 * IMPORTANT: Slots are sorted from highest GFN to lowest GFN!  I.e. the entry
 * at memslots[0] has the highest GFN.
 *
 * The sorting algorithm takes advantage of having initially sorted memslots
 * and knowing the position of the changed memslot.  Sorting is also optimized
 * by not swapping the updated memslot and instead only shifting other memslots
 * and tracking the new index for the updated memslot.  Only once its final
 * index is known is the updated memslot copied into its position in the array.
 *
 *  - When deleting a memslot, the deleted memslot simply needs to be moved to
 *    the end of the array.
 *
 *  - When creating a memslot, the algorithm "inserts" the new memslot at the
 *    end of the array and then moves it forward to its correct location.
 *
 *  - When moving a memslot, the algorithm first moves the updated memslot
 *    backward to handle the scenario where the memslot's GFN was changed to a
 *    lower value.  update_memslots() then falls through and runs the same flow
 *    as creating a memslot to move the memslot forward to handle the scenario
 *    where its GFN was changed to a higher value.
 *
 * Note, slots are sorted from highest->lowest instead of lowest->highest for
 * historical reasons.  Originally, invalid memslots were denoted by having
 * GFN=0, thus sorting from highest->lowest naturally sorted invalid memslots
 * to the end of the array.  The current algorithm uses dedicated logic to
 * delete a memslot and thus does not rely on invalid memslots having GFN=0.
 *
 * The other historical motivation for highest->lowest was to improve the
 * performance of memslot lookup.  KVM originally used a linear search starting
 * at memslots[0].  On x86, the largest memslot usually has one of the highest,
 * if not *the* highest, GFN, as the bulk of the guest's RAM is located in a
 * single memslot above the 4gb boundary.  As the largest memslot is also the
 * most likely to be referenced, sorting it to the front of the array was
 * advantageous.  The current binary search starts from the middle of the array
 * and uses an LRU pointer to improve performance for all memslots and GFNs.
 */
static void update_memslots(struct kvm_memslots *slots,
			    struct kvm_memory_slot *memslot,
			    enum kvm_mr_change change)
{
	int i;

	if (change == KVM_MR_DELETE) {
		kvm_memslot_delete(slots, memslot);
	} else {
		if (change == KVM_MR_CREATE)
			i = kvm_memslot_insert_back(slots);
		else
			i = kvm_memslot_move_backward(slots, memslot);
		i = kvm_memslot_move_forward(slots, memslot, i);

		/*
		 * Copy the memslot to its new position in memslots and update
		 * its index accordingly.
		 */
		slots->memslots[i] = *memslot;
		slots->id_to_index[memslot->id] = i;
	}
}
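
/*
 * Worked example (slots shown by base_gfn, descending order): starting from
 * [A:0x100000, B:0x1000, C:0x0], deleting B yields [A:0x100000, C:0x0] with
 * id_to_index[B] = -1; creating D with base_gfn 0x2000 first lands at the
 * tail and kvm_memslot_move_forward() shifts it past C and B to index 1,
 * yielding [A:0x100000, D:0x2000, B:0x1000, C:0x0].  The slot names are
 * purely illustrative.
 */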

static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
{
	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;

#ifdef __KVM_HAVE_READONLY_MEM
	valid_flags |= KVM_MEM_READONLY;
#endif

	if (mem->flags & ~valid_flags)
		return -EINVAL;

	return 0;
}

static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
		int as_id, struct kvm_memslots *slots)
{
	struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
	u64 gen = old_memslots->generation;

	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
	slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

	rcu_assign_pointer(kvm->memslots[as_id], slots);
	synchronize_srcu_expedited(&kvm->srcu);

	/*
	 * Increment the new memslot generation a second time, dropping the
	 * update in-progress flag and incrementing the generation based on
	 * the number of address spaces.  This provides a unique and easily
	 * identifiable generation number while the memslots are in flux.
	 */
	gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

	/*
	 * Generations must be unique even across address spaces.  We do not need
	 * a global counter for that, instead the generation space is evenly split
	 * across address spaces.  For example, with two address spaces, address
	 * space 0 will use generations 0, 2, 4, ... while address space 1 will
	 * use generations 1, 3, 5, ...
	 */
	gen += KVM_ADDRESS_SPACE_NUM;

	kvm_arch_memslots_updated(kvm, gen);

	slots->generation = gen;

	return old_memslots;
}

/*
 * Note, at a minimum, the current number of used slots must be allocated, even
 * when deleting a memslot, as we need a complete duplicate of the memslots for
 * use when invalidating a memslot prior to deleting/moving the memslot.
 */
static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old,
					     enum kvm_mr_change change)
{
	struct kvm_memslots *slots;
	size_t old_size, new_size;

	old_size = sizeof(struct kvm_memslots) +
		   (sizeof(struct kvm_memory_slot) * old->used_slots);

	if (change == KVM_MR_CREATE)
		new_size = old_size + sizeof(struct kvm_memory_slot);
	else
		new_size = old_size;

	slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT);
	if (likely(slots))
		memcpy(slots, old, old_size);

	return slots;
}

static int kvm_set_memslot(struct kvm *kvm,
			   const struct kvm_userspace_memory_region *mem,
			   struct kvm_memory_slot *old,
			   struct kvm_memory_slot *new, int as_id,
			   enum kvm_mr_change change)
{
	struct kvm_memory_slot *slot;
	struct kvm_memslots *slots;
	int r;

	slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change);
	if (!slots)
		return -ENOMEM;

	if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
		/*
		 * Note, the INVALID flag needs to be in the appropriate entry
		 * in the freshly allocated memslots, not in @old or @new.
		 */
		slot = id_to_memslot(slots, old->id);
		slot->flags |= KVM_MEMSLOT_INVALID;

		/*
		 * We can re-use the old memslots, the only difference from the
		 * newly installed memslots is the invalid flag, which will get
		 * dropped by update_memslots anyway.  We'll also revert to the
		 * old memslots if preparing the new memory region fails.
		 */
		slots = install_new_memslots(kvm, as_id, slots);

		/* From this point no new shadow pages pointing to a deleted,
		 * or moved, memslot will be created.
		 *
		 * validation of sp->gfn happens in:
		 *	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)
		 *	- kvm_is_visible_gfn (mmu_check_root)
		 */
		kvm_arch_flush_shadow_memslot(kvm, slot);
	}

	r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
	if (r)
		goto out_slots;

	update_memslots(slots, new, change);
	slots = install_new_memslots(kvm, as_id, slots);

	kvm_arch_commit_memory_region(kvm, mem, old, new, change);

	kvfree(slots);
	return 0;

out_slots:
	if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
		slots = install_new_memslots(kvm, as_id, slots);
	kvfree(slots);
	return r;
}

static int kvm_delete_memslot(struct kvm *kvm,
			      const struct kvm_userspace_memory_region *mem,
			      struct kvm_memory_slot *old, int as_id)
{
	struct kvm_memory_slot new;
	int r;

	if (!old->npages)
		return -EINVAL;

	memset(&new, 0, sizeof(new));
	new.id = old->id;

	r = kvm_set_memslot(kvm, mem, old, &new, as_id, KVM_MR_DELETE);
	if (r)
		return r;

	kvm_free_memslot(kvm, old);
	return 0;
}
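
/*
 * Userspace addresses a slot through kvm_userspace_memory_region.slot: the
 * low 16 bits are the slot id and the high bits select the address space
 * (always 0 unless the architecture defines extra address spaces, e.g. SMM
 * on x86).  For instance, slot = (1 << 16) | 3 targets id 3 in address
 * space 1, matching the "as_id = mem->slot >> 16; id = (u16)mem->slot"
 * decoding below.
 */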

/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
 *
 * Must be called holding kvm->slots_lock for write.
 */
int __kvm_set_memory_region(struct kvm *kvm,
			    const struct kvm_userspace_memory_region *mem)
{
	struct kvm_memory_slot old, new;
	struct kvm_memory_slot *tmp;
	enum kvm_mr_change change;
	int as_id, id;
	int r;

	r = check_memory_region_flags(mem);
	if (r)
		return r;

	as_id = mem->slot >> 16;
	id = (u16)mem->slot;

	/* General sanity checks */
	if (mem->memory_size & (PAGE_SIZE - 1))
		return -EINVAL;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		return -EINVAL;
	/* We can read the guest memory with __xxx_user() later on. */
	if ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
	     !access_ok((void __user *)(unsigned long)mem->userspace_addr,
			mem->memory_size))
		return -EINVAL;
	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
		return -EINVAL;
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
		return -EINVAL;

	/*
	 * Make a full copy of the old memslot, the pointer will become stale
	 * when the memslots are re-sorted by update_memslots(), and the old
	 * memslot needs to be referenced after calling update_memslots(), e.g.
	 * to free its resources and for arch specific behavior.
	 */
	tmp = id_to_memslot(__kvm_memslots(kvm, as_id), id);
	if (tmp) {
		old = *tmp;
		tmp = NULL;
	} else {
		memset(&old, 0, sizeof(old));
		old.id = id;
	}

	if (!mem->memory_size)
		return kvm_delete_memslot(kvm, mem, &old, as_id);

	new.id = id;
	new.base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
	new.npages = mem->memory_size >> PAGE_SHIFT;
	new.flags = mem->flags;
	new.userspace_addr = mem->userspace_addr;

	if (new.npages > KVM_MEM_MAX_NR_PAGES)
		return -EINVAL;

	if (!old.npages) {
		change = KVM_MR_CREATE;
		new.dirty_bitmap = NULL;
		memset(&new.arch, 0, sizeof(new.arch));
	} else { /* Modify an existing slot. */
		if ((new.userspace_addr != old.userspace_addr) ||
		    (new.npages != old.npages) ||
		    ((new.flags ^ old.flags) & KVM_MEM_READONLY))
			return -EINVAL;

		if (new.base_gfn != old.base_gfn)
			change = KVM_MR_MOVE;
		else if (new.flags != old.flags)
			change = KVM_MR_FLAGS_ONLY;
		else /* Nothing to change. */
			return 0;

		/* Copy dirty_bitmap and arch from the current memslot. */
		new.dirty_bitmap = old.dirty_bitmap;
		memcpy(&new.arch, &old.arch, sizeof(new.arch));
	}

	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
		/* Check for overlaps */
		kvm_for_each_memslot(tmp, __kvm_memslots(kvm, as_id)) {
			if (tmp->id == id)
				continue;
			if (!((new.base_gfn + new.npages <= tmp->base_gfn) ||
			      (new.base_gfn >= tmp->base_gfn + tmp->npages)))
				return -EEXIST;
		}
	}

	/* Allocate/free page dirty bitmap as needed */
	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
		new.dirty_bitmap = NULL;
	else if (!new.dirty_bitmap) {
		r = kvm_alloc_dirty_bitmap(&new);
		if (r)
			return r;

		if (kvm_dirty_log_manual_protect_and_init_set(kvm))
			bitmap_set(new.dirty_bitmap, 0, new.npages);
	}

	r = kvm_set_memslot(kvm, mem, &old, &new, as_id, change);
	if (r)
		goto out_bitmap;

	if (old.dirty_bitmap && !new.dirty_bitmap)
		kvm_destroy_dirty_bitmap(&old);
	return 0;

out_bitmap:
	if (new.dirty_bitmap && !old.dirty_bitmap)
		kvm_destroy_dirty_bitmap(&new);
	return r;
}
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);

int kvm_set_memory_region(struct kvm *kvm,
			  const struct kvm_userspace_memory_region *mem)
{
	int r;

	mutex_lock(&kvm->slots_lock);
	r = __kvm_set_memory_region(kvm, mem);
	mutex_unlock(&kvm->slots_lock);
	return r;
}
EXPORT_SYMBOL_GPL(kvm_set_memory_region);

static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
					  struct kvm_userspace_memory_region *mem)
{
	if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
		return -EINVAL;

	return kvm_set_memory_region(kvm, mem);
}

#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
/**
 * kvm_get_dirty_log - get a snapshot of dirty pages
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address to which we copy the log
 * @is_dirty:	set to '1' if any dirty pages were found
 * @memslot:	set to the associated memslot, always valid on success
 */
int kvm_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log,
		      int *is_dirty, struct kvm_memory_slot **memslot)
{
	struct kvm_memslots *slots;
	int i, as_id, id;
	unsigned long n;
	unsigned long any = 0;

	*memslot = NULL;
	*is_dirty = 0;

	as_id = log->slot >> 16;
	id = (u16)log->slot;
	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
		return -EINVAL;

	slots = __kvm_memslots(kvm, as_id);
	*memslot = id_to_memslot(slots, id);
	if (!(*memslot) || !(*memslot)->dirty_bitmap)
		return -ENOENT;

	kvm_arch_sync_dirty_log(kvm, *memslot);

	n = kvm_dirty_bitmap_bytes(*memslot);

	for (i = 0; !any && i < n/sizeof(long); ++i)
		any = (*memslot)->dirty_bitmap[i];

	if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
		return -EFAULT;

	if (any)
		*is_dirty = 1;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_dirty_log);

#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
/**
 * kvm_get_dirty_log_protect - get a snapshot of dirty pages
 *	and reenable dirty page tracking for the corresponding pages.
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address to which we copy the log
 *
 * We need to keep in mind that VCPU threads can write to the bitmap
 * concurrently.  So, to avoid losing track of dirty pages we keep the
 * following order:
 *
 *    1. Take a snapshot of the bit and clear it if needed.
 *    2. Write protect the corresponding page.
 *    3. Copy the snapshot to the userspace.
 *    4. Upon return caller flushes TLB's if needed.
 *
 * Between 2 and 4, the guest may write to the page using the remaining TLB
 * entry.  This is not a problem because the page is reported dirty using
 * the snapshot taken before and step 4 ensures that writes done after
 * exiting to userspace will be logged for the next call.
 *
 */
static int kvm_get_dirty_log_protect(struct kvm *kvm, struct kvm_dirty_log *log)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int i, as_id, id;
	unsigned long n;
	unsigned long *dirty_bitmap;
	unsigned long *dirty_bitmap_buffer;
	bool flush;

	as_id = log->slot >> 16;
	id = (u16)log->slot;
	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
		return -EINVAL;

	slots = __kvm_memslots(kvm, as_id);
	memslot = id_to_memslot(slots, id);
	if (!memslot || !memslot->dirty_bitmap)
		return -ENOENT;

	dirty_bitmap = memslot->dirty_bitmap;

	kvm_arch_sync_dirty_log(kvm, memslot);

	n = kvm_dirty_bitmap_bytes(memslot);
	flush = false;
	if (kvm->manual_dirty_log_protect) {
		/*
		 * Unlike kvm_get_dirty_log, we always return false in *flush,
		 * because no flush is needed until KVM_CLEAR_DIRTY_LOG.  There
		 * is some code duplication between this function and
		 * kvm_get_dirty_log, but hopefully all architectures will
		 * transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
		 * can be eliminated.
		 */
		dirty_bitmap_buffer = dirty_bitmap;
	} else {
		dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
		memset(dirty_bitmap_buffer, 0, n);

		spin_lock(&kvm->mmu_lock);
		for (i = 0; i < n / sizeof(long); i++) {
			unsigned long mask;
			gfn_t offset;

			if (!dirty_bitmap[i])
				continue;

			flush = true;
			mask = xchg(&dirty_bitmap[i], 0);
			dirty_bitmap_buffer[i] = mask;

			offset = i * BITS_PER_LONG;
			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
								offset, mask);
		}
		spin_unlock(&kvm->mmu_lock);
	}

	if (flush)
		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);

	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
		return -EFAULT;
	return 0;
}


/**
 * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
 * @kvm: kvm instance
 * @log: slot id and address to which we copy the log
 *
 * Steps 1-4 below provide a general overview of dirty page logging.  See
 * kvm_get_dirty_log_protect() function description for additional details.
 *
 * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
 * always flush the TLB (step 4) even if a previous step failed and the dirty
 * bitmap may be corrupt.  Regardless of the previous outcome the KVM logging
 * API does not preclude a subsequent dirty log read by user space.  Flushing
 * the TLB ensures writes will be marked dirty for the next log read.
 *
 *   1. Take a snapshot of the bit and clear it if needed.
 *   2. Write protect the corresponding page.
 *   3. Copy the snapshot to the userspace.
 *   4. Flush TLB's if needed.
 */
static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
				      struct kvm_dirty_log *log)
{
	int r;

	mutex_lock(&kvm->slots_lock);

	r = kvm_get_dirty_log_protect(kvm, log);

	mutex_unlock(&kvm->slots_lock);
	return r;
}
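
/*
 * With the manual protect capability (KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2)
 * enabled, the expected userspace flow is roughly the following sketch
 * (illustrative pseudo-code, error handling omitted):
 *
 *	ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);     // snapshot only
 *	... process log.dirty_bitmap ...
 *	ioctl(vm_fd, KVM_CLEAR_DIRTY_LOG, &clear); // re-protect those pages
 *
 * Without the capability, KVM_GET_DIRTY_LOG alone both fetches the bitmap
 * and re-enables dirty tracking, as implemented above.
 */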

/**
 * kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
 *	and reenable dirty page tracking for the corresponding pages.
 * @kvm:	pointer to kvm instance
 * @log:	slot id and address from which to fetch the bitmap of dirty pages
 */
static int kvm_clear_dirty_log_protect(struct kvm *kvm,
				       struct kvm_clear_dirty_log *log)
{
	struct kvm_memslots *slots;
	struct kvm_memory_slot *memslot;
	int as_id, id;
	gfn_t offset;
	unsigned long i, n;
	unsigned long *dirty_bitmap;
	unsigned long *dirty_bitmap_buffer;
	bool flush;

	as_id = log->slot >> 16;
	id = (u16)log->slot;
	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_USER_MEM_SLOTS)
		return -EINVAL;

	if (log->first_page & 63)
		return -EINVAL;

	slots = __kvm_memslots(kvm, as_id);
	memslot = id_to_memslot(slots, id);
	if (!memslot || !memslot->dirty_bitmap)
		return -ENOENT;

	dirty_bitmap = memslot->dirty_bitmap;

	n = ALIGN(log->num_pages, BITS_PER_LONG) / 8;

	if (log->first_page > memslot->npages ||
	    log->num_pages > memslot->npages - log->first_page ||
	    (log->num_pages < memslot->npages - log->first_page && (log->num_pages & 63)))
		return -EINVAL;

	kvm_arch_sync_dirty_log(kvm, memslot);

	flush = false;
	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
	if (copy_from_user(dirty_bitmap_buffer, log->dirty_bitmap, n))
		return -EFAULT;

	spin_lock(&kvm->mmu_lock);
	for (offset = log->first_page, i = offset / BITS_PER_LONG,
		 n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
	     i++, offset += BITS_PER_LONG) {
		unsigned long mask = *dirty_bitmap_buffer++;
		atomic_long_t *p = (atomic_long_t *) &dirty_bitmap[i];
		if (!mask)
			continue;

		mask &= atomic_long_fetch_andnot(mask, p);

		/*
		 * mask contains the bits that really have been cleared.  This
		 * never includes any bits beyond the length of the memslot (if
		 * the length is not aligned to 64 pages), therefore it is not
		 * a problem if userspace sets them in log->dirty_bitmap.
		 */
		if (mask) {
			flush = true;
			kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot,
								offset, mask);
		}
	}
	spin_unlock(&kvm->mmu_lock);

	if (flush)
		kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);

	return 0;
}

static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
					struct kvm_clear_dirty_log *log)
{
	int r;

	mutex_lock(&kvm->slots_lock);

	r = kvm_clear_dirty_log_protect(kvm, log);

	mutex_unlock(&kvm->slots_lock);
	return r;
}
#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */

struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	return __gfn_to_memslot(kvm_memslots(kvm), gfn);
}
EXPORT_SYMBOL_GPL(gfn_to_memslot);

struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);

bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);

	return kvm_is_visible_memslot(memslot);
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);

unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	struct vm_area_struct *vma;
	unsigned long addr, size;

	size = PAGE_SIZE;

	addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
	if (kvm_is_error_hva(addr))
		return PAGE_SIZE;

	mmap_read_lock(current->mm);
	vma = find_vma(current->mm, addr);
	if (!vma)
		goto out;

	size = vma_kernel_pagesize(vma);

out:
	mmap_read_unlock(current->mm);

	return size;
}

static bool memslot_is_readonly(struct kvm_memory_slot *slot)
{
	return slot->flags & KVM_MEM_READONLY;
}

static unsigned long __gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
				       gfn_t *nr_pages, bool write)
{
	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
		return KVM_HVA_ERR_BAD;

	if (memslot_is_readonly(slot) && write)
		return KVM_HVA_ERR_RO_BAD;

	if (nr_pages)
		*nr_pages = slot->npages - (gfn - slot->base_gfn);

	return __gfn_to_hva_memslot(slot, gfn);
}

static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
				     gfn_t *nr_pages)
{
	return __gfn_to_hva_many(slot, gfn, nr_pages, true);
}

unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
				 gfn_t gfn)
{
	return gfn_to_hva_many(slot, gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);

unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
	return gfn_to_hva_many(gfn_to_memslot(kvm, gfn), gfn, NULL);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);

unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	return gfn_to_hva_many(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
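
/*
 * For reference, __gfn_to_hva_memslot() (defined in kvm_host.h) resolves a
 * gfn within a slot as roughly:
 *
 *	hva = slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
 *
 * so the helpers above differ only in how they pick the slot and whether
 * they honor KVM_MEM_READONLY.
 */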

/*
 * Return the hva of a @gfn and the R/W attribute if possible.
 *
 * @slot: the kvm_memory_slot which contains @gfn
 * @gfn: the gfn to be translated
 * @writable: used to return the read/write attribute of the @slot if the hva
 * is valid and @writable is not NULL
 */
unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
				      gfn_t gfn, bool *writable)
{
	unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, false);

	if (!kvm_is_error_hva(hva) && writable)
		*writable = !memslot_is_readonly(slot);

	return hva;
}

unsigned long gfn_to_hva_prot(struct kvm *kvm, gfn_t gfn, bool *writable)
{
	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);

	return gfn_to_hva_memslot_prot(slot, gfn, writable);
}

unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu *vcpu, gfn_t gfn, bool *writable)
{
	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);

	return gfn_to_hva_memslot_prot(slot, gfn, writable);
}

static inline int check_user_page_hwpoison(unsigned long addr)
{
	int rc, flags = FOLL_HWPOISON | FOLL_WRITE;

	rc = get_user_pages(addr, 1, flags, NULL, NULL);
	return rc == -EHWPOISON;
}

/*
 * The fast path to get the writable pfn which will be stored in @pfn,
 * true indicates success, otherwise false is returned.  It's also the
 * only part that can run in atomic context.
 */
static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
			    bool *writable, kvm_pfn_t *pfn)
{
	struct page *page[1];

	/*
	 * Fast pin a writable pfn only if it is a write fault request
	 * or the caller allows to map a writable pfn for a read fault
	 * request.
	 */
	if (!(write_fault || writable))
		return false;

	if (get_user_page_fast_only(addr, FOLL_WRITE, page)) {
		*pfn = page_to_pfn(page[0]);

		if (writable)
			*writable = true;
		return true;
	}

	return false;
}

/*
 * The slow path to get the pfn of the specified host virtual address,
 * 1 indicates success, -errno is returned if error is detected.
 */
static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
			   bool *writable, kvm_pfn_t *pfn)
{
	unsigned int flags = FOLL_HWPOISON;
	struct page *page;
	int npages = 0;

	might_sleep();

	if (writable)
		*writable = write_fault;

	if (write_fault)
		flags |= FOLL_WRITE;
	if (async)
		flags |= FOLL_NOWAIT;

	npages = get_user_pages_unlocked(addr, 1, &page, flags);
	if (npages != 1)
		return npages;

	/* map read fault as writable if possible */
	if (unlikely(!write_fault) && writable) {
		struct page *wpage;

		if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) {
			*writable = true;
			put_page(page);
			page = wpage;
		}
	}
	*pfn = page_to_pfn(page);
	return npages;
}

static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
{
	if (unlikely(!(vma->vm_flags & VM_READ)))
		return false;

	if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
		return false;

	return true;
}

static int hva_to_pfn_remapped(struct vm_area_struct *vma,
			       unsigned long addr, bool *async,
			       bool write_fault, bool *writable,
			       kvm_pfn_t *p_pfn)
{
	unsigned long pfn;
	int r;

	r = follow_pfn(vma, addr, &pfn);
	if (r) {
		/*
		 * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
		 * not call the fault handler, so do it here.
		 */
		bool unlocked = false;
		r = fixup_user_fault(current, current->mm, addr,
				     (write_fault ? FAULT_FLAG_WRITE : 0),
				     &unlocked);
		if (unlocked)
			return -EAGAIN;
		if (r)
			return r;

		r = follow_pfn(vma, addr, &pfn);
		if (r)
			return r;

	}

	if (writable)
		*writable = true;

	/*
	 * Get a reference here because callers of *hva_to_pfn* and
	 * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
	 * returned pfn.  This is only needed if the VMA has VM_MIXEDMAP
	 * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
	 * simply do nothing for reserved pfns.
	 *
	 * Whoever called remap_pfn_range is also going to call e.g.
	 * unmap_mapping_range before the underlying pages are freed,
	 * causing a call to our MMU notifier.
	 */
	kvm_get_pfn(pfn);

	*p_pfn = pfn;
	return 0;
}

/*
 * Pin guest page in memory and return its pfn.
 * @addr: host virtual address which maps memory to the guest
 * @atomic: if true, the call must not sleep and only the fast path is tried
 * @async: whether this function needs to wait for IO to complete if the
 *	   host page is not in memory
 * @write_fault: whether we should get a writable host page
 * @writable: whether to allow mapping a writable host page for !@write_fault
 *
 * The function will map a writable host page for these two cases:
 * 1): @write_fault = true
 * 2): @write_fault = false && @writable, @writable will tell the caller
 *     whether the mapping is writable.
1880 */ 1881 static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async, 1882 bool write_fault, bool *writable) 1883 { 1884 struct vm_area_struct *vma; 1885 kvm_pfn_t pfn = 0; 1886 int npages, r; 1887 1888 /* we can do it either atomically or asynchronously, not both */ 1889 BUG_ON(atomic && async); 1890 1891 if (hva_to_pfn_fast(addr, write_fault, writable, &pfn)) 1892 return pfn; 1893 1894 if (atomic) 1895 return KVM_PFN_ERR_FAULT; 1896 1897 npages = hva_to_pfn_slow(addr, async, write_fault, writable, &pfn); 1898 if (npages == 1) 1899 return pfn; 1900 1901 mmap_read_lock(current->mm); 1902 if (npages == -EHWPOISON || 1903 (!async && check_user_page_hwpoison(addr))) { 1904 pfn = KVM_PFN_ERR_HWPOISON; 1905 goto exit; 1906 } 1907 1908 retry: 1909 vma = find_vma_intersection(current->mm, addr, addr + 1); 1910 1911 if (vma == NULL) 1912 pfn = KVM_PFN_ERR_FAULT; 1913 else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) { 1914 r = hva_to_pfn_remapped(vma, addr, async, write_fault, writable, &pfn); 1915 if (r == -EAGAIN) 1916 goto retry; 1917 if (r < 0) 1918 pfn = KVM_PFN_ERR_FAULT; 1919 } else { 1920 if (async && vma_is_valid(vma, write_fault)) 1921 *async = true; 1922 pfn = KVM_PFN_ERR_FAULT; 1923 } 1924 exit: 1925 mmap_read_unlock(current->mm); 1926 return pfn; 1927 } 1928 1929 kvm_pfn_t __gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn, 1930 bool atomic, bool *async, bool write_fault, 1931 bool *writable) 1932 { 1933 unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write_fault); 1934 1935 if (addr == KVM_HVA_ERR_RO_BAD) { 1936 if (writable) 1937 *writable = false; 1938 return KVM_PFN_ERR_RO_FAULT; 1939 } 1940 1941 if (kvm_is_error_hva(addr)) { 1942 if (writable) 1943 *writable = false; 1944 return KVM_PFN_NOSLOT; 1945 } 1946 1947 /* Do not map writable pfn in the readonly memslot. 
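 * The caller is told the mapping is read-only and @writable is passed
 * down as NULL so that hva_to_pfn() never upgrades the fault to a
 * writable mapping.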
*/ 1948 if (writable && memslot_is_readonly(slot)) { 1949 *writable = false; 1950 writable = NULL; 1951 } 1952 1953 return hva_to_pfn(addr, atomic, async, write_fault, 1954 writable); 1955 } 1956 EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot); 1957 1958 kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault, 1959 bool *writable) 1960 { 1961 return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, NULL, 1962 write_fault, writable); 1963 } 1964 EXPORT_SYMBOL_GPL(gfn_to_pfn_prot); 1965 1966 kvm_pfn_t gfn_to_pfn_memslot(struct kvm_memory_slot *slot, gfn_t gfn) 1967 { 1968 return __gfn_to_pfn_memslot(slot, gfn, false, NULL, true, NULL); 1969 } 1970 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot); 1971 1972 kvm_pfn_t gfn_to_pfn_memslot_atomic(struct kvm_memory_slot *slot, gfn_t gfn) 1973 { 1974 return __gfn_to_pfn_memslot(slot, gfn, true, NULL, true, NULL); 1975 } 1976 EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic); 1977 1978 kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn) 1979 { 1980 return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 1981 } 1982 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic); 1983 1984 kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 1985 { 1986 return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn); 1987 } 1988 EXPORT_SYMBOL_GPL(gfn_to_pfn); 1989 1990 kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn) 1991 { 1992 return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn); 1993 } 1994 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn); 1995 1996 int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 1997 struct page **pages, int nr_pages) 1998 { 1999 unsigned long addr; 2000 gfn_t entry = 0; 2001 2002 addr = gfn_to_hva_many(slot, gfn, &entry); 2003 if (kvm_is_error_hva(addr)) 2004 return -1; 2005 2006 if (entry < nr_pages) 2007 return 0; 2008 2009 return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages); 2010 } 2011 EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); 2012 2013 static struct page *kvm_pfn_to_page(kvm_pfn_t pfn) 2014 { 2015 if (is_error_noslot_pfn(pfn)) 2016 return KVM_ERR_PTR_BAD_PAGE; 2017 2018 if (kvm_is_reserved_pfn(pfn)) { 2019 WARN_ON(1); 2020 return KVM_ERR_PTR_BAD_PAGE; 2021 } 2022 2023 return pfn_to_page(pfn); 2024 } 2025 2026 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 2027 { 2028 kvm_pfn_t pfn; 2029 2030 pfn = gfn_to_pfn(kvm, gfn); 2031 2032 return kvm_pfn_to_page(pfn); 2033 } 2034 EXPORT_SYMBOL_GPL(gfn_to_page); 2035 2036 void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache) 2037 { 2038 if (pfn == 0) 2039 return; 2040 2041 if (cache) 2042 cache->pfn = cache->gfn = 0; 2043 2044 if (dirty) 2045 kvm_release_pfn_dirty(pfn); 2046 else 2047 kvm_release_pfn_clean(pfn); 2048 } 2049 2050 static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn, 2051 struct gfn_to_pfn_cache *cache, u64 gen) 2052 { 2053 kvm_release_pfn(cache->pfn, cache->dirty, cache); 2054 2055 cache->pfn = gfn_to_pfn_memslot(slot, gfn); 2056 cache->gfn = gfn; 2057 cache->dirty = false; 2058 cache->generation = gen; 2059 } 2060 2061 static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn, 2062 struct kvm_host_map *map, 2063 struct gfn_to_pfn_cache *cache, 2064 bool atomic) 2065 { 2066 kvm_pfn_t pfn; 2067 void *hva = NULL; 2068 struct page *page = KVM_UNMAPPED_PAGE; 2069 struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn); 2070 u64 gen = slots->generation; 2071 2072 if (!map) 2073 return -EINVAL; 2074 2075 if (cache) { 2076 if (!cache->pfn || cache->gfn 
!= gfn || 2077 cache->generation != gen) { 2078 if (atomic) 2079 return -EAGAIN; 2080 kvm_cache_gfn_to_pfn(slot, gfn, cache, gen); 2081 } 2082 pfn = cache->pfn; 2083 } else { 2084 if (atomic) 2085 return -EAGAIN; 2086 pfn = gfn_to_pfn_memslot(slot, gfn); 2087 } 2088 if (is_error_noslot_pfn(pfn)) 2089 return -EINVAL; 2090 2091 if (pfn_valid(pfn)) { 2092 page = pfn_to_page(pfn); 2093 if (atomic) 2094 hva = kmap_atomic(page); 2095 else 2096 hva = kmap(page); 2097 #ifdef CONFIG_HAS_IOMEM 2098 } else if (!atomic) { 2099 hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB); 2100 } else { 2101 return -EINVAL; 2102 #endif 2103 } 2104 2105 if (!hva) 2106 return -EFAULT; 2107 2108 map->page = page; 2109 map->hva = hva; 2110 map->pfn = pfn; 2111 map->gfn = gfn; 2112 2113 return 0; 2114 } 2115 2116 int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map, 2117 struct gfn_to_pfn_cache *cache, bool atomic) 2118 { 2119 return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map, 2120 cache, atomic); 2121 } 2122 EXPORT_SYMBOL_GPL(kvm_map_gfn); 2123 2124 int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map) 2125 { 2126 return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map, 2127 NULL, false); 2128 } 2129 EXPORT_SYMBOL_GPL(kvm_vcpu_map); 2130 2131 static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot, 2132 struct kvm_host_map *map, 2133 struct gfn_to_pfn_cache *cache, 2134 bool dirty, bool atomic) 2135 { 2136 if (!map) 2137 return; 2138 2139 if (!map->hva) 2140 return; 2141 2142 if (map->page != KVM_UNMAPPED_PAGE) { 2143 if (atomic) 2144 kunmap_atomic(map->hva); 2145 else 2146 kunmap(map->page); 2147 } 2148 #ifdef CONFIG_HAS_IOMEM 2149 else if (!atomic) 2150 memunmap(map->hva); 2151 else 2152 WARN_ONCE(1, "Unexpected unmapping in atomic context"); 2153 #endif 2154 2155 if (dirty) 2156 mark_page_dirty_in_slot(memslot, map->gfn); 2157 2158 if (cache) 2159 cache->dirty |= dirty; 2160 else 2161 kvm_release_pfn(map->pfn, dirty, NULL); 2162 2163 map->hva = NULL; 2164 map->page = NULL; 2165 } 2166 2167 int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map, 2168 struct gfn_to_pfn_cache *cache, bool dirty, bool atomic) 2169 { 2170 __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map, 2171 cache, dirty, atomic); 2172 return 0; 2173 } 2174 EXPORT_SYMBOL_GPL(kvm_unmap_gfn); 2175 2176 void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty) 2177 { 2178 __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, NULL, 2179 dirty, false); 2180 } 2181 EXPORT_SYMBOL_GPL(kvm_vcpu_unmap); 2182 2183 struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn) 2184 { 2185 kvm_pfn_t pfn; 2186 2187 pfn = kvm_vcpu_gfn_to_pfn(vcpu, gfn); 2188 2189 return kvm_pfn_to_page(pfn); 2190 } 2191 EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_page); 2192 2193 void kvm_release_page_clean(struct page *page) 2194 { 2195 WARN_ON(is_error_page(page)); 2196 2197 kvm_release_pfn_clean(page_to_pfn(page)); 2198 } 2199 EXPORT_SYMBOL_GPL(kvm_release_page_clean); 2200 2201 void kvm_release_pfn_clean(kvm_pfn_t pfn) 2202 { 2203 if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn)) 2204 put_page(pfn_to_page(pfn)); 2205 } 2206 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 2207 2208 void kvm_release_page_dirty(struct page *page) 2209 { 2210 WARN_ON(is_error_page(page)); 2211 2212 kvm_release_pfn_dirty(page_to_pfn(page)); 2213 } 2214 EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 2215 2216 void kvm_release_pfn_dirty(kvm_pfn_t pfn) 2217 { 2218 kvm_set_pfn_dirty(pfn); 2219 
kvm_release_pfn_clean(pfn); 2220 } 2221 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); 2222 2223 void kvm_set_pfn_dirty(kvm_pfn_t pfn) 2224 { 2225 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) 2226 SetPageDirty(pfn_to_page(pfn)); 2227 } 2228 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 2229 2230 void kvm_set_pfn_accessed(kvm_pfn_t pfn) 2231 { 2232 if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) 2233 mark_page_accessed(pfn_to_page(pfn)); 2234 } 2235 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 2236 2237 void kvm_get_pfn(kvm_pfn_t pfn) 2238 { 2239 if (!kvm_is_reserved_pfn(pfn)) 2240 get_page(pfn_to_page(pfn)); 2241 } 2242 EXPORT_SYMBOL_GPL(kvm_get_pfn); 2243 2244 static int next_segment(unsigned long len, int offset) 2245 { 2246 if (len > PAGE_SIZE - offset) 2247 return PAGE_SIZE - offset; 2248 else 2249 return len; 2250 } 2251 2252 static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn, 2253 void *data, int offset, int len) 2254 { 2255 int r; 2256 unsigned long addr; 2257 2258 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 2259 if (kvm_is_error_hva(addr)) 2260 return -EFAULT; 2261 r = __copy_from_user(data, (void __user *)addr + offset, len); 2262 if (r) 2263 return -EFAULT; 2264 return 0; 2265 } 2266 2267 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 2268 int len) 2269 { 2270 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 2271 2272 return __kvm_read_guest_page(slot, gfn, data, offset, len); 2273 } 2274 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 2275 2276 int kvm_vcpu_read_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, void *data, 2277 int offset, int len) 2278 { 2279 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2280 2281 return __kvm_read_guest_page(slot, gfn, data, offset, len); 2282 } 2283 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page); 2284 2285 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) 2286 { 2287 gfn_t gfn = gpa >> PAGE_SHIFT; 2288 int seg; 2289 int offset = offset_in_page(gpa); 2290 int ret; 2291 2292 while ((seg = next_segment(len, offset)) != 0) { 2293 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 2294 if (ret < 0) 2295 return ret; 2296 offset = 0; 2297 len -= seg; 2298 data += seg; 2299 ++gfn; 2300 } 2301 return 0; 2302 } 2303 EXPORT_SYMBOL_GPL(kvm_read_guest); 2304 2305 int kvm_vcpu_read_guest(struct kvm_vcpu *vcpu, gpa_t gpa, void *data, unsigned long len) 2306 { 2307 gfn_t gfn = gpa >> PAGE_SHIFT; 2308 int seg; 2309 int offset = offset_in_page(gpa); 2310 int ret; 2311 2312 while ((seg = next_segment(len, offset)) != 0) { 2313 ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg); 2314 if (ret < 0) 2315 return ret; 2316 offset = 0; 2317 len -= seg; 2318 data += seg; 2319 ++gfn; 2320 } 2321 return 0; 2322 } 2323 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest); 2324 2325 static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn, 2326 void *data, int offset, unsigned long len) 2327 { 2328 int r; 2329 unsigned long addr; 2330 2331 addr = gfn_to_hva_memslot_prot(slot, gfn, NULL); 2332 if (kvm_is_error_hva(addr)) 2333 return -EFAULT; 2334 pagefault_disable(); 2335 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 2336 pagefault_enable(); 2337 if (r) 2338 return -EFAULT; 2339 return 0; 2340 } 2341 2342 int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa, 2343 void *data, unsigned long len) 2344 { 2345 gfn_t gfn = gpa >> PAGE_SHIFT; 2346 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 
2347 int offset = offset_in_page(gpa); 2348 2349 return __kvm_read_guest_atomic(slot, gfn, data, offset, len); 2350 } 2351 EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic); 2352 2353 static int __kvm_write_guest_page(struct kvm_memory_slot *memslot, gfn_t gfn, 2354 const void *data, int offset, int len) 2355 { 2356 int r; 2357 unsigned long addr; 2358 2359 addr = gfn_to_hva_memslot(memslot, gfn); 2360 if (kvm_is_error_hva(addr)) 2361 return -EFAULT; 2362 r = __copy_to_user((void __user *)addr + offset, data, len); 2363 if (r) 2364 return -EFAULT; 2365 mark_page_dirty_in_slot(memslot, gfn); 2366 return 0; 2367 } 2368 2369 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, 2370 const void *data, int offset, int len) 2371 { 2372 struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); 2373 2374 return __kvm_write_guest_page(slot, gfn, data, offset, len); 2375 } 2376 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 2377 2378 int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn, 2379 const void *data, int offset, int len) 2380 { 2381 struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2382 2383 return __kvm_write_guest_page(slot, gfn, data, offset, len); 2384 } 2385 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page); 2386 2387 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 2388 unsigned long len) 2389 { 2390 gfn_t gfn = gpa >> PAGE_SHIFT; 2391 int seg; 2392 int offset = offset_in_page(gpa); 2393 int ret; 2394 2395 while ((seg = next_segment(len, offset)) != 0) { 2396 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 2397 if (ret < 0) 2398 return ret; 2399 offset = 0; 2400 len -= seg; 2401 data += seg; 2402 ++gfn; 2403 } 2404 return 0; 2405 } 2406 EXPORT_SYMBOL_GPL(kvm_write_guest); 2407 2408 int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data, 2409 unsigned long len) 2410 { 2411 gfn_t gfn = gpa >> PAGE_SHIFT; 2412 int seg; 2413 int offset = offset_in_page(gpa); 2414 int ret; 2415 2416 while ((seg = next_segment(len, offset)) != 0) { 2417 ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg); 2418 if (ret < 0) 2419 return ret; 2420 offset = 0; 2421 len -= seg; 2422 data += seg; 2423 ++gfn; 2424 } 2425 return 0; 2426 } 2427 EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest); 2428 2429 static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, 2430 struct gfn_to_hva_cache *ghc, 2431 gpa_t gpa, unsigned long len) 2432 { 2433 int offset = offset_in_page(gpa); 2434 gfn_t start_gfn = gpa >> PAGE_SHIFT; 2435 gfn_t end_gfn = (gpa + len - 1) >> PAGE_SHIFT; 2436 gfn_t nr_pages_needed = end_gfn - start_gfn + 1; 2437 gfn_t nr_pages_avail; 2438 2439 /* Update ghc->generation before performing any error checks. */ 2440 ghc->generation = slots->generation; 2441 2442 if (start_gfn > end_gfn) { 2443 ghc->hva = KVM_HVA_ERR_BAD; 2444 return -EINVAL; 2445 } 2446 2447 /* 2448 * If the requested region crosses two memslots, we still 2449 * verify that the entire region is valid here. 2450 */ 2451 for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) { 2452 ghc->memslot = __gfn_to_memslot(slots, start_gfn); 2453 ghc->hva = gfn_to_hva_many(ghc->memslot, start_gfn, 2454 &nr_pages_avail); 2455 if (kvm_is_error_hva(ghc->hva)) 2456 return -EFAULT; 2457 } 2458 2459 /* Use the slow path for cross page reads and writes. 
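 * A region that spans more than one page leaves ghc->memslot set to NULL
 * below, and kvm_read_guest_offset_cached()/kvm_write_guest_offset_cached()
 * fall back to the uncached kvm_read_guest()/kvm_write_guest() helpers in
 * that case.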
*/ 2460 if (nr_pages_needed == 1) 2461 ghc->hva += offset; 2462 else 2463 ghc->memslot = NULL; 2464 2465 ghc->gpa = gpa; 2466 ghc->len = len; 2467 return 0; 2468 } 2469 2470 int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2471 gpa_t gpa, unsigned long len) 2472 { 2473 struct kvm_memslots *slots = kvm_memslots(kvm); 2474 return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); 2475 } 2476 EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); 2477 2478 int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2479 void *data, unsigned int offset, 2480 unsigned long len) 2481 { 2482 struct kvm_memslots *slots = kvm_memslots(kvm); 2483 int r; 2484 gpa_t gpa = ghc->gpa + offset; 2485 2486 BUG_ON(len + offset > ghc->len); 2487 2488 if (slots->generation != ghc->generation) { 2489 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) 2490 return -EFAULT; 2491 } 2492 2493 if (kvm_is_error_hva(ghc->hva)) 2494 return -EFAULT; 2495 2496 if (unlikely(!ghc->memslot)) 2497 return kvm_write_guest(kvm, gpa, data, len); 2498 2499 r = __copy_to_user((void __user *)ghc->hva + offset, data, len); 2500 if (r) 2501 return -EFAULT; 2502 mark_page_dirty_in_slot(ghc->memslot, gpa >> PAGE_SHIFT); 2503 2504 return 0; 2505 } 2506 EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); 2507 2508 int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2509 void *data, unsigned long len) 2510 { 2511 return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); 2512 } 2513 EXPORT_SYMBOL_GPL(kvm_write_guest_cached); 2514 2515 int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2516 void *data, unsigned int offset, 2517 unsigned long len) 2518 { 2519 struct kvm_memslots *slots = kvm_memslots(kvm); 2520 int r; 2521 gpa_t gpa = ghc->gpa + offset; 2522 2523 BUG_ON(len + offset > ghc->len); 2524 2525 if (slots->generation != ghc->generation) { 2526 if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len)) 2527 return -EFAULT; 2528 } 2529 2530 if (kvm_is_error_hva(ghc->hva)) 2531 return -EFAULT; 2532 2533 if (unlikely(!ghc->memslot)) 2534 return kvm_read_guest(kvm, gpa, data, len); 2535 2536 r = __copy_from_user(data, (void __user *)ghc->hva + offset, len); 2537 if (r) 2538 return -EFAULT; 2539 2540 return 0; 2541 } 2542 EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached); 2543 2544 int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, 2545 void *data, unsigned long len) 2546 { 2547 return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len); 2548 } 2549 EXPORT_SYMBOL_GPL(kvm_read_guest_cached); 2550 2551 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 2552 { 2553 const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0))); 2554 2555 return kvm_write_guest_page(kvm, gfn, zero_page, offset, len); 2556 } 2557 EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 2558 2559 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 2560 { 2561 gfn_t gfn = gpa >> PAGE_SHIFT; 2562 int seg; 2563 int offset = offset_in_page(gpa); 2564 int ret; 2565 2566 while ((seg = next_segment(len, offset)) != 0) { 2567 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 2568 if (ret < 0) 2569 return ret; 2570 offset = 0; 2571 len -= seg; 2572 ++gfn; 2573 } 2574 return 0; 2575 } 2576 EXPORT_SYMBOL_GPL(kvm_clear_guest); 2577 2578 static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, 2579 gfn_t gfn) 2580 { 2581 if (memslot && memslot->dirty_bitmap) { 2582 unsigned long rel_gfn = gfn - 
memslot->base_gfn; 2583 2584 set_bit_le(rel_gfn, memslot->dirty_bitmap); 2585 } 2586 } 2587
2588 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 2589 { 2590 struct kvm_memory_slot *memslot; 2591 2592 memslot = gfn_to_memslot(kvm, gfn); 2593 mark_page_dirty_in_slot(memslot, gfn); 2594 } 2595 EXPORT_SYMBOL_GPL(mark_page_dirty); 2596
2597 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn) 2598 { 2599 struct kvm_memory_slot *memslot; 2600 2601 memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 2602 mark_page_dirty_in_slot(memslot, gfn); 2603 } 2604 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty); 2605
2606 void kvm_sigset_activate(struct kvm_vcpu *vcpu) 2607 { 2608 if (!vcpu->sigset_active) 2609 return; 2610 2611 /* 2612 * This does a lockless modification of ->real_blocked, which is fine 2613 * because only current can change ->real_blocked and all readers of 2614 * ->real_blocked don't care as long as ->real_blocked is always a subset 2615 * of ->blocked. 2616 */ 2617 sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked); 2618 } 2619
2620 void kvm_sigset_deactivate(struct kvm_vcpu *vcpu) 2621 { 2622 if (!vcpu->sigset_active) 2623 return; 2624 2625 sigprocmask(SIG_SETMASK, &current->real_blocked, NULL); 2626 sigemptyset(&current->real_blocked); 2627 } 2628
2629 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu) 2630 { 2631 unsigned int old, val, grow, grow_start; 2632 2633 old = val = vcpu->halt_poll_ns; 2634 grow_start = READ_ONCE(halt_poll_ns_grow_start); 2635 grow = READ_ONCE(halt_poll_ns_grow); 2636 if (!grow) 2637 goto out; 2638 2639 val *= grow; 2640 if (val < grow_start) 2641 val = grow_start; 2642 2643 if (val > halt_poll_ns) 2644 val = halt_poll_ns; 2645 2646 vcpu->halt_poll_ns = val; 2647 out: 2648 trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old); 2649 } 2650
2651 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu) 2652 { 2653 unsigned int old, val, shrink; 2654 2655 old = val = vcpu->halt_poll_ns; 2656 shrink = READ_ONCE(halt_poll_ns_shrink); 2657 if (shrink == 0) 2658 val = 0; 2659 else 2660 val /= shrink; 2661 2662 vcpu->halt_poll_ns = val; 2663 trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old); 2664 } 2665
2666 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu) 2667 { 2668 int ret = -EINTR; 2669 int idx = srcu_read_lock(&vcpu->kvm->srcu); 2670 2671 if (kvm_arch_vcpu_runnable(vcpu)) { 2672 kvm_make_request(KVM_REQ_UNHALT, vcpu); 2673 goto out; 2674 } 2675 if (kvm_cpu_has_pending_timer(vcpu)) 2676 goto out; 2677 if (signal_pending(current)) 2678 goto out; 2679 2680 ret = 0; 2681 out: 2682 srcu_read_unlock(&vcpu->kvm->srcu, idx); 2683 return ret; 2684 } 2685
2686 static inline void 2687 update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited) 2688 { 2689 if (waited) 2690 vcpu->stat.halt_poll_fail_ns += poll_ns; 2691 else 2692 vcpu->stat.halt_poll_success_ns += poll_ns; 2693 } 2694
2695 /* 2696 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 2697 */ 2698 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 2699 { 2700 ktime_t start, cur, poll_end; 2701 bool waited = false; 2702 u64 block_ns; 2703 2704 kvm_arch_vcpu_blocking(vcpu); 2705 2706 start = cur = poll_end = ktime_get(); 2707 if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) { 2708 ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns); 2709 2710 ++vcpu->stat.halt_attempted_poll; 2711 do { 2712 /* 2713 * This sets KVM_REQ_UNHALT if an interrupt 2714 * arrives.
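 * A negative return from kvm_vcpu_check_block() terminates the
 * polling loop and is accounted as a successful poll.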
2715 */ 2716 if (kvm_vcpu_check_block(vcpu) < 0) { 2717 ++vcpu->stat.halt_successful_poll; 2718 if (!vcpu_valid_wakeup(vcpu)) 2719 ++vcpu->stat.halt_poll_invalid; 2720 goto out; 2721 } 2722 poll_end = cur = ktime_get(); 2723 } while (single_task_running() && ktime_before(cur, stop)); 2724 } 2725 2726 prepare_to_rcuwait(&vcpu->wait); 2727 for (;;) { 2728 set_current_state(TASK_INTERRUPTIBLE); 2729 2730 if (kvm_vcpu_check_block(vcpu) < 0) 2731 break; 2732 2733 waited = true; 2734 schedule(); 2735 } 2736 finish_rcuwait(&vcpu->wait); 2737 cur = ktime_get(); 2738 out: 2739 kvm_arch_vcpu_unblocking(vcpu); 2740 block_ns = ktime_to_ns(cur) - ktime_to_ns(start); 2741 2742 update_halt_poll_stats( 2743 vcpu, ktime_to_ns(ktime_sub(poll_end, start)), waited); 2744 2745 if (!kvm_arch_no_poll(vcpu)) { 2746 if (!vcpu_valid_wakeup(vcpu)) { 2747 shrink_halt_poll_ns(vcpu); 2748 } else if (vcpu->kvm->max_halt_poll_ns) { 2749 if (block_ns <= vcpu->halt_poll_ns) 2750 ; 2751 /* we had a long block, shrink polling */ 2752 else if (vcpu->halt_poll_ns && 2753 block_ns > vcpu->kvm->max_halt_poll_ns) 2754 shrink_halt_poll_ns(vcpu); 2755 /* we had a short halt and our poll time is too small */ 2756 else if (vcpu->halt_poll_ns < vcpu->kvm->max_halt_poll_ns && 2757 block_ns < vcpu->kvm->max_halt_poll_ns) 2758 grow_halt_poll_ns(vcpu); 2759 } else { 2760 vcpu->halt_poll_ns = 0; 2761 } 2762 } 2763 2764 trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu)); 2765 kvm_arch_vcpu_block_finish(vcpu); 2766 } 2767 EXPORT_SYMBOL_GPL(kvm_vcpu_block); 2768 2769 bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) 2770 { 2771 struct rcuwait *waitp; 2772 2773 waitp = kvm_arch_vcpu_get_wait(vcpu); 2774 if (rcuwait_wake_up(waitp)) { 2775 WRITE_ONCE(vcpu->ready, true); 2776 ++vcpu->stat.halt_wakeup; 2777 return true; 2778 } 2779 2780 return false; 2781 } 2782 EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); 2783 2784 #ifndef CONFIG_S390 2785 /* 2786 * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. 2787 */ 2788 void kvm_vcpu_kick(struct kvm_vcpu *vcpu) 2789 { 2790 int me; 2791 int cpu = vcpu->cpu; 2792 2793 if (kvm_vcpu_wake_up(vcpu)) 2794 return; 2795 2796 me = get_cpu(); 2797 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 2798 if (kvm_arch_vcpu_should_kick(vcpu)) 2799 smp_send_reschedule(cpu); 2800 put_cpu(); 2801 } 2802 EXPORT_SYMBOL_GPL(kvm_vcpu_kick); 2803 #endif /* !CONFIG_S390 */ 2804 2805 int kvm_vcpu_yield_to(struct kvm_vcpu *target) 2806 { 2807 struct pid *pid; 2808 struct task_struct *task = NULL; 2809 int ret = 0; 2810 2811 rcu_read_lock(); 2812 pid = rcu_dereference(target->pid); 2813 if (pid) 2814 task = get_pid_task(pid, PIDTYPE_PID); 2815 rcu_read_unlock(); 2816 if (!task) 2817 return ret; 2818 ret = yield_to(task, 1); 2819 put_task_struct(task); 2820 2821 return ret; 2822 } 2823 EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to); 2824 2825 /* 2826 * Helper that checks whether a VCPU is eligible for directed yield. 2827 * Most eligible candidate to yield is decided by following heuristics: 2828 * 2829 * (a) VCPU which has not done pl-exit or cpu relax intercepted recently 2830 * (preempted lock holder), indicated by @in_spin_loop. 2831 * Set at the beginning and cleared at the end of interception/PLE handler. 2832 * 2833 * (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get 2834 * chance last time (mostly it has become eligible now since we have probably 2835 * yielded to lockholder in last iteration. 
This is done by toggling 2836 * @dy_eligible each time a VCPU is checked for eligibility.) 2837 * 2838 * Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding 2839 * to a preempted lock-holder could result in wrong VCPU selection and CPU 2840 * burning. Giving priority to a potential lock-holder increases lock 2841 * progress. 2842 * 2843 * Since the algorithm is based on heuristics, accessing another VCPU's data 2844 * without locking does no harm. At worst we try to yield to the same VCPU, 2845 * fail, and continue with the next VCPU, and so on. 2846 */
2847 static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu) 2848 { 2849 #ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT 2850 bool eligible; 2851 2852 eligible = !vcpu->spin_loop.in_spin_loop || 2853 vcpu->spin_loop.dy_eligible; 2854 2855 if (vcpu->spin_loop.in_spin_loop) 2856 kvm_vcpu_set_dy_eligible(vcpu, !vcpu->spin_loop.dy_eligible); 2857 2858 return eligible; 2859 #else 2860 return true; 2861 #endif 2862 } 2863
2864 /* 2865 * Unlike kvm_arch_vcpu_runnable, this function is called outside 2866 * a vcpu_load/vcpu_put pair. However, for most architectures 2867 * kvm_arch_vcpu_runnable does not require vcpu_load. 2868 */ 2869 bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu) 2870 { 2871 return kvm_arch_vcpu_runnable(vcpu); 2872 } 2873
2874 static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu) 2875 { 2876 if (kvm_arch_dy_runnable(vcpu)) 2877 return true; 2878 2879 #ifdef CONFIG_KVM_ASYNC_PF 2880 if (!list_empty_careful(&vcpu->async_pf.done)) 2881 return true; 2882 #endif 2883 2884 return false; 2885 } 2886
2887 void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode) 2888 { 2889 struct kvm *kvm = me->kvm; 2890 struct kvm_vcpu *vcpu; 2891 int last_boosted_vcpu = me->kvm->last_boosted_vcpu; 2892 int yielded = 0; 2893 int try = 3; 2894 int pass; 2895 int i; 2896 2897 kvm_vcpu_set_in_spin_loop(me, true); 2898 /* 2899 * We boost the priority of a VCPU that is runnable but not 2900 * currently running, because it got preempted by something 2901 * else and called schedule in __vcpu_run. Hopefully that 2902 * VCPU is holding the lock that we need and will release it. 2903 * We approximate round-robin by starting at the last boosted VCPU.
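 * Candidates that are sleeping without being dy-runnable, that were
 * preempted in user mode when a kernel-mode yield was requested, or
 * that fail the directed-yield eligibility check are skipped.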
2904 */ 2905 for (pass = 0; pass < 2 && !yielded && try; pass++) { 2906 kvm_for_each_vcpu(i, vcpu, kvm) { 2907 if (!pass && i <= last_boosted_vcpu) { 2908 i = last_boosted_vcpu; 2909 continue; 2910 } else if (pass && i > last_boosted_vcpu) 2911 break; 2912 if (!READ_ONCE(vcpu->ready)) 2913 continue; 2914 if (vcpu == me) 2915 continue; 2916 if (rcuwait_active(&vcpu->wait) && 2917 !vcpu_dy_runnable(vcpu)) 2918 continue; 2919 if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode && 2920 !kvm_arch_vcpu_in_kernel(vcpu)) 2921 continue; 2922 if (!kvm_vcpu_eligible_for_directed_yield(vcpu)) 2923 continue; 2924 2925 yielded = kvm_vcpu_yield_to(vcpu); 2926 if (yielded > 0) { 2927 kvm->last_boosted_vcpu = i; 2928 break; 2929 } else if (yielded < 0) { 2930 try--; 2931 if (!try) 2932 break; 2933 } 2934 } 2935 } 2936 kvm_vcpu_set_in_spin_loop(me, false); 2937 2938 /* Ensure vcpu is not eligible during next spinloop */ 2939 kvm_vcpu_set_dy_eligible(me, false); 2940 } 2941 EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); 2942 2943 static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf) 2944 { 2945 struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data; 2946 struct page *page; 2947 2948 if (vmf->pgoff == 0) 2949 page = virt_to_page(vcpu->run); 2950 #ifdef CONFIG_X86 2951 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 2952 page = virt_to_page(vcpu->arch.pio_data); 2953 #endif 2954 #ifdef CONFIG_KVM_MMIO 2955 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 2956 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 2957 #endif 2958 else 2959 return kvm_arch_vcpu_fault(vcpu, vmf); 2960 get_page(page); 2961 vmf->page = page; 2962 return 0; 2963 } 2964 2965 static const struct vm_operations_struct kvm_vcpu_vm_ops = { 2966 .fault = kvm_vcpu_fault, 2967 }; 2968 2969 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 2970 { 2971 vma->vm_ops = &kvm_vcpu_vm_ops; 2972 return 0; 2973 } 2974 2975 static int kvm_vcpu_release(struct inode *inode, struct file *filp) 2976 { 2977 struct kvm_vcpu *vcpu = filp->private_data; 2978 2979 kvm_put_kvm(vcpu->kvm); 2980 return 0; 2981 } 2982 2983 static struct file_operations kvm_vcpu_fops = { 2984 .release = kvm_vcpu_release, 2985 .unlocked_ioctl = kvm_vcpu_ioctl, 2986 .mmap = kvm_vcpu_mmap, 2987 .llseek = noop_llseek, 2988 KVM_COMPAT(kvm_vcpu_compat_ioctl), 2989 }; 2990 2991 /* 2992 * Allocates an inode for the vcpu. 2993 */ 2994 static int create_vcpu_fd(struct kvm_vcpu *vcpu) 2995 { 2996 char name[8 + 1 + ITOA_MAX_LEN + 1]; 2997 2998 snprintf(name, sizeof(name), "kvm-vcpu:%d", vcpu->vcpu_id); 2999 return anon_inode_getfd(name, &kvm_vcpu_fops, vcpu, O_RDWR | O_CLOEXEC); 3000 } 3001 3002 static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu) 3003 { 3004 #ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS 3005 struct dentry *debugfs_dentry; 3006 char dir_name[ITOA_MAX_LEN * 2]; 3007 3008 if (!debugfs_initialized()) 3009 return; 3010 3011 snprintf(dir_name, sizeof(dir_name), "vcpu%d", vcpu->vcpu_id); 3012 debugfs_dentry = debugfs_create_dir(dir_name, 3013 vcpu->kvm->debugfs_dentry); 3014 3015 kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry); 3016 #endif 3017 } 3018 3019 /* 3020 * Creates some virtual cpus. Good luck creating more than one. 
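 *
 * Userspace reaches this through the KVM_CREATE_VCPU ioctl on a VM fd;
 * an illustrative call (error handling omitted) looks like:
 *
 *	int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
 *
 * The returned fd comes from create_vcpu_fd() above and provides the
 * vcpu ioctls as well as the mmap'able kvm_run structure.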
3021 */ 3022 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) 3023 { 3024 int r; 3025 struct kvm_vcpu *vcpu; 3026 struct page *page; 3027 3028 if (id >= KVM_MAX_VCPU_ID) 3029 return -EINVAL; 3030 3031 mutex_lock(&kvm->lock); 3032 if (kvm->created_vcpus == KVM_MAX_VCPUS) { 3033 mutex_unlock(&kvm->lock); 3034 return -EINVAL; 3035 } 3036 3037 kvm->created_vcpus++; 3038 mutex_unlock(&kvm->lock); 3039 3040 r = kvm_arch_vcpu_precreate(kvm, id); 3041 if (r) 3042 goto vcpu_decrement; 3043 3044 vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); 3045 if (!vcpu) { 3046 r = -ENOMEM; 3047 goto vcpu_decrement; 3048 } 3049 3050 BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE); 3051 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 3052 if (!page) { 3053 r = -ENOMEM; 3054 goto vcpu_free; 3055 } 3056 vcpu->run = page_address(page); 3057 3058 kvm_vcpu_init(vcpu, kvm, id); 3059 3060 r = kvm_arch_vcpu_create(vcpu); 3061 if (r) 3062 goto vcpu_free_run_page; 3063 3064 mutex_lock(&kvm->lock); 3065 if (kvm_get_vcpu_by_id(kvm, id)) { 3066 r = -EEXIST; 3067 goto unlock_vcpu_destroy; 3068 } 3069 3070 vcpu->vcpu_idx = atomic_read(&kvm->online_vcpus); 3071 BUG_ON(kvm->vcpus[vcpu->vcpu_idx]); 3072 3073 /* Now it's all set up, let userspace reach it */ 3074 kvm_get_kvm(kvm); 3075 r = create_vcpu_fd(vcpu); 3076 if (r < 0) { 3077 kvm_put_kvm_no_destroy(kvm); 3078 goto unlock_vcpu_destroy; 3079 } 3080 3081 kvm->vcpus[vcpu->vcpu_idx] = vcpu; 3082 3083 /* 3084 * Pairs with smp_rmb() in kvm_get_vcpu. Write kvm->vcpus 3085 * before kvm->online_vcpu's incremented value. 3086 */ 3087 smp_wmb(); 3088 atomic_inc(&kvm->online_vcpus); 3089 3090 mutex_unlock(&kvm->lock); 3091 kvm_arch_vcpu_postcreate(vcpu); 3092 kvm_create_vcpu_debugfs(vcpu); 3093 return r; 3094 3095 unlock_vcpu_destroy: 3096 mutex_unlock(&kvm->lock); 3097 kvm_arch_vcpu_destroy(vcpu); 3098 vcpu_free_run_page: 3099 free_page((unsigned long)vcpu->run); 3100 vcpu_free: 3101 kmem_cache_free(kvm_vcpu_cache, vcpu); 3102 vcpu_decrement: 3103 mutex_lock(&kvm->lock); 3104 kvm->created_vcpus--; 3105 mutex_unlock(&kvm->lock); 3106 return r; 3107 } 3108 3109 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 3110 { 3111 if (sigset) { 3112 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 3113 vcpu->sigset_active = 1; 3114 vcpu->sigset = *sigset; 3115 } else 3116 vcpu->sigset_active = 0; 3117 return 0; 3118 } 3119 3120 static long kvm_vcpu_ioctl(struct file *filp, 3121 unsigned int ioctl, unsigned long arg) 3122 { 3123 struct kvm_vcpu *vcpu = filp->private_data; 3124 void __user *argp = (void __user *)arg; 3125 int r; 3126 struct kvm_fpu *fpu = NULL; 3127 struct kvm_sregs *kvm_sregs = NULL; 3128 3129 if (vcpu->kvm->mm != current->mm) 3130 return -EIO; 3131 3132 if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) 3133 return -EINVAL; 3134 3135 /* 3136 * Some architectures have vcpu ioctls that are asynchronous to vcpu 3137 * execution; mutex_lock() would break them. 3138 */ 3139 r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg); 3140 if (r != -ENOIOCTLCMD) 3141 return r; 3142 3143 if (mutex_lock_killable(&vcpu->mutex)) 3144 return -EINTR; 3145 switch (ioctl) { 3146 case KVM_RUN: { 3147 struct pid *oldpid; 3148 r = -EINVAL; 3149 if (arg) 3150 goto out; 3151 oldpid = rcu_access_pointer(vcpu->pid); 3152 if (unlikely(oldpid != task_pid(current))) { 3153 /* The thread running this VCPU changed. 
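 * vcpu->pid is re-latched under vcpu->mutex; readers that dereference
 * it under RCU (e.g. kvm_vcpu_yield_to()) see either the old or the
 * new pid, and the synchronize_rcu() below makes it safe to drop the
 * old reference.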
*/ 3154 struct pid *newpid; 3155 3156 r = kvm_arch_vcpu_run_pid_change(vcpu); 3157 if (r) 3158 break; 3159 3160 newpid = get_task_pid(current, PIDTYPE_PID); 3161 rcu_assign_pointer(vcpu->pid, newpid); 3162 if (oldpid) 3163 synchronize_rcu(); 3164 put_pid(oldpid); 3165 } 3166 r = kvm_arch_vcpu_ioctl_run(vcpu); 3167 trace_kvm_userspace_exit(vcpu->run->exit_reason, r); 3168 break; 3169 } 3170 case KVM_GET_REGS: { 3171 struct kvm_regs *kvm_regs; 3172 3173 r = -ENOMEM; 3174 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT); 3175 if (!kvm_regs) 3176 goto out; 3177 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 3178 if (r) 3179 goto out_free1; 3180 r = -EFAULT; 3181 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 3182 goto out_free1; 3183 r = 0; 3184 out_free1: 3185 kfree(kvm_regs); 3186 break; 3187 } 3188 case KVM_SET_REGS: { 3189 struct kvm_regs *kvm_regs; 3190 3191 kvm_regs = memdup_user(argp, sizeof(*kvm_regs)); 3192 if (IS_ERR(kvm_regs)) { 3193 r = PTR_ERR(kvm_regs); 3194 goto out; 3195 } 3196 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 3197 kfree(kvm_regs); 3198 break; 3199 } 3200 case KVM_GET_SREGS: { 3201 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), 3202 GFP_KERNEL_ACCOUNT); 3203 r = -ENOMEM; 3204 if (!kvm_sregs) 3205 goto out; 3206 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 3207 if (r) 3208 goto out; 3209 r = -EFAULT; 3210 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 3211 goto out; 3212 r = 0; 3213 break; 3214 } 3215 case KVM_SET_SREGS: { 3216 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); 3217 if (IS_ERR(kvm_sregs)) { 3218 r = PTR_ERR(kvm_sregs); 3219 kvm_sregs = NULL; 3220 goto out; 3221 } 3222 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 3223 break; 3224 } 3225 case KVM_GET_MP_STATE: { 3226 struct kvm_mp_state mp_state; 3227 3228 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 3229 if (r) 3230 goto out; 3231 r = -EFAULT; 3232 if (copy_to_user(argp, &mp_state, sizeof(mp_state))) 3233 goto out; 3234 r = 0; 3235 break; 3236 } 3237 case KVM_SET_MP_STATE: { 3238 struct kvm_mp_state mp_state; 3239 3240 r = -EFAULT; 3241 if (copy_from_user(&mp_state, argp, sizeof(mp_state))) 3242 goto out; 3243 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 3244 break; 3245 } 3246 case KVM_TRANSLATE: { 3247 struct kvm_translation tr; 3248 3249 r = -EFAULT; 3250 if (copy_from_user(&tr, argp, sizeof(tr))) 3251 goto out; 3252 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 3253 if (r) 3254 goto out; 3255 r = -EFAULT; 3256 if (copy_to_user(argp, &tr, sizeof(tr))) 3257 goto out; 3258 r = 0; 3259 break; 3260 } 3261 case KVM_SET_GUEST_DEBUG: { 3262 struct kvm_guest_debug dbg; 3263 3264 r = -EFAULT; 3265 if (copy_from_user(&dbg, argp, sizeof(dbg))) 3266 goto out; 3267 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 3268 break; 3269 } 3270 case KVM_SET_SIGNAL_MASK: { 3271 struct kvm_signal_mask __user *sigmask_arg = argp; 3272 struct kvm_signal_mask kvm_sigmask; 3273 sigset_t sigset, *p; 3274 3275 p = NULL; 3276 if (argp) { 3277 r = -EFAULT; 3278 if (copy_from_user(&kvm_sigmask, argp, 3279 sizeof(kvm_sigmask))) 3280 goto out; 3281 r = -EINVAL; 3282 if (kvm_sigmask.len != sizeof(sigset)) 3283 goto out; 3284 r = -EFAULT; 3285 if (copy_from_user(&sigset, sigmask_arg->sigset, 3286 sizeof(sigset))) 3287 goto out; 3288 p = &sigset; 3289 } 3290 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); 3291 break; 3292 } 3293 case KVM_GET_FPU: { 3294 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT); 3295 r = -ENOMEM; 3296 if (!fpu) 3297 goto out; 
3298 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); 3299 if (r) 3300 goto out; 3301 r = -EFAULT; 3302 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) 3303 goto out; 3304 r = 0; 3305 break; 3306 } 3307 case KVM_SET_FPU: { 3308 fpu = memdup_user(argp, sizeof(*fpu)); 3309 if (IS_ERR(fpu)) { 3310 r = PTR_ERR(fpu); 3311 fpu = NULL; 3312 goto out; 3313 } 3314 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 3315 break; 3316 } 3317 default: 3318 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 3319 } 3320 out: 3321 mutex_unlock(&vcpu->mutex); 3322 kfree(fpu); 3323 kfree(kvm_sregs); 3324 return r; 3325 } 3326 3327 #ifdef CONFIG_KVM_COMPAT 3328 static long kvm_vcpu_compat_ioctl(struct file *filp, 3329 unsigned int ioctl, unsigned long arg) 3330 { 3331 struct kvm_vcpu *vcpu = filp->private_data; 3332 void __user *argp = compat_ptr(arg); 3333 int r; 3334 3335 if (vcpu->kvm->mm != current->mm) 3336 return -EIO; 3337 3338 switch (ioctl) { 3339 case KVM_SET_SIGNAL_MASK: { 3340 struct kvm_signal_mask __user *sigmask_arg = argp; 3341 struct kvm_signal_mask kvm_sigmask; 3342 sigset_t sigset; 3343 3344 if (argp) { 3345 r = -EFAULT; 3346 if (copy_from_user(&kvm_sigmask, argp, 3347 sizeof(kvm_sigmask))) 3348 goto out; 3349 r = -EINVAL; 3350 if (kvm_sigmask.len != sizeof(compat_sigset_t)) 3351 goto out; 3352 r = -EFAULT; 3353 if (get_compat_sigset(&sigset, (void *)sigmask_arg->sigset)) 3354 goto out; 3355 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 3356 } else 3357 r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL); 3358 break; 3359 } 3360 default: 3361 r = kvm_vcpu_ioctl(filp, ioctl, arg); 3362 } 3363 3364 out: 3365 return r; 3366 } 3367 #endif 3368 3369 static int kvm_device_mmap(struct file *filp, struct vm_area_struct *vma) 3370 { 3371 struct kvm_device *dev = filp->private_data; 3372 3373 if (dev->ops->mmap) 3374 return dev->ops->mmap(dev, vma); 3375 3376 return -ENODEV; 3377 } 3378 3379 static int kvm_device_ioctl_attr(struct kvm_device *dev, 3380 int (*accessor)(struct kvm_device *dev, 3381 struct kvm_device_attr *attr), 3382 unsigned long arg) 3383 { 3384 struct kvm_device_attr attr; 3385 3386 if (!accessor) 3387 return -EPERM; 3388 3389 if (copy_from_user(&attr, (void __user *)arg, sizeof(attr))) 3390 return -EFAULT; 3391 3392 return accessor(dev, &attr); 3393 } 3394 3395 static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, 3396 unsigned long arg) 3397 { 3398 struct kvm_device *dev = filp->private_data; 3399 3400 if (dev->kvm->mm != current->mm) 3401 return -EIO; 3402 3403 switch (ioctl) { 3404 case KVM_SET_DEVICE_ATTR: 3405 return kvm_device_ioctl_attr(dev, dev->ops->set_attr, arg); 3406 case KVM_GET_DEVICE_ATTR: 3407 return kvm_device_ioctl_attr(dev, dev->ops->get_attr, arg); 3408 case KVM_HAS_DEVICE_ATTR: 3409 return kvm_device_ioctl_attr(dev, dev->ops->has_attr, arg); 3410 default: 3411 if (dev->ops->ioctl) 3412 return dev->ops->ioctl(dev, ioctl, arg); 3413 3414 return -ENOTTY; 3415 } 3416 } 3417 3418 static int kvm_device_release(struct inode *inode, struct file *filp) 3419 { 3420 struct kvm_device *dev = filp->private_data; 3421 struct kvm *kvm = dev->kvm; 3422 3423 if (dev->ops->release) { 3424 mutex_lock(&kvm->lock); 3425 list_del(&dev->vm_node); 3426 dev->ops->release(dev); 3427 mutex_unlock(&kvm->lock); 3428 } 3429 3430 kvm_put_kvm(kvm); 3431 return 0; 3432 } 3433 3434 static const struct file_operations kvm_device_fops = { 3435 .unlocked_ioctl = kvm_device_ioctl, 3436 .release = kvm_device_release, 3437 KVM_COMPAT(kvm_device_ioctl), 3438 .mmap = kvm_device_mmap, 3439 }; 3440 3441 struct 
kvm_device *kvm_device_from_filp(struct file *filp) 3442 { 3443 if (filp->f_op != &kvm_device_fops) 3444 return NULL; 3445 3446 return filp->private_data; 3447 } 3448 3449 static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = { 3450 #ifdef CONFIG_KVM_MPIC 3451 [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops, 3452 [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops, 3453 #endif 3454 }; 3455 3456 int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type) 3457 { 3458 if (type >= ARRAY_SIZE(kvm_device_ops_table)) 3459 return -ENOSPC; 3460 3461 if (kvm_device_ops_table[type] != NULL) 3462 return -EEXIST; 3463 3464 kvm_device_ops_table[type] = ops; 3465 return 0; 3466 } 3467 3468 void kvm_unregister_device_ops(u32 type) 3469 { 3470 if (kvm_device_ops_table[type] != NULL) 3471 kvm_device_ops_table[type] = NULL; 3472 } 3473 3474 static int kvm_ioctl_create_device(struct kvm *kvm, 3475 struct kvm_create_device *cd) 3476 { 3477 const struct kvm_device_ops *ops = NULL; 3478 struct kvm_device *dev; 3479 bool test = cd->flags & KVM_CREATE_DEVICE_TEST; 3480 int type; 3481 int ret; 3482 3483 if (cd->type >= ARRAY_SIZE(kvm_device_ops_table)) 3484 return -ENODEV; 3485 3486 type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table)); 3487 ops = kvm_device_ops_table[type]; 3488 if (ops == NULL) 3489 return -ENODEV; 3490 3491 if (test) 3492 return 0; 3493 3494 dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT); 3495 if (!dev) 3496 return -ENOMEM; 3497 3498 dev->ops = ops; 3499 dev->kvm = kvm; 3500 3501 mutex_lock(&kvm->lock); 3502 ret = ops->create(dev, type); 3503 if (ret < 0) { 3504 mutex_unlock(&kvm->lock); 3505 kfree(dev); 3506 return ret; 3507 } 3508 list_add(&dev->vm_node, &kvm->devices); 3509 mutex_unlock(&kvm->lock); 3510 3511 if (ops->init) 3512 ops->init(dev); 3513 3514 kvm_get_kvm(kvm); 3515 ret = anon_inode_getfd(ops->name, &kvm_device_fops, dev, O_RDWR | O_CLOEXEC); 3516 if (ret < 0) { 3517 kvm_put_kvm_no_destroy(kvm); 3518 mutex_lock(&kvm->lock); 3519 list_del(&dev->vm_node); 3520 mutex_unlock(&kvm->lock); 3521 ops->destroy(dev); 3522 return ret; 3523 } 3524 3525 cd->fd = ret; 3526 return 0; 3527 } 3528 3529 static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) 3530 { 3531 switch (arg) { 3532 case KVM_CAP_USER_MEMORY: 3533 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 3534 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 3535 case KVM_CAP_INTERNAL_ERROR_DATA: 3536 #ifdef CONFIG_HAVE_KVM_MSI 3537 case KVM_CAP_SIGNAL_MSI: 3538 #endif 3539 #ifdef CONFIG_HAVE_KVM_IRQFD 3540 case KVM_CAP_IRQFD: 3541 case KVM_CAP_IRQFD_RESAMPLE: 3542 #endif 3543 case KVM_CAP_IOEVENTFD_ANY_LENGTH: 3544 case KVM_CAP_CHECK_EXTENSION_VM: 3545 case KVM_CAP_ENABLE_CAP_VM: 3546 case KVM_CAP_HALT_POLL: 3547 return 1; 3548 #ifdef CONFIG_KVM_MMIO 3549 case KVM_CAP_COALESCED_MMIO: 3550 return KVM_COALESCED_MMIO_PAGE_OFFSET; 3551 case KVM_CAP_COALESCED_PIO: 3552 return 1; 3553 #endif 3554 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 3555 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: 3556 return KVM_DIRTY_LOG_MANUAL_CAPS; 3557 #endif 3558 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 3559 case KVM_CAP_IRQ_ROUTING: 3560 return KVM_MAX_IRQ_ROUTES; 3561 #endif 3562 #if KVM_ADDRESS_SPACE_NUM > 1 3563 case KVM_CAP_MULTI_ADDRESS_SPACE: 3564 return KVM_ADDRESS_SPACE_NUM; 3565 #endif 3566 case KVM_CAP_NR_MEMSLOTS: 3567 return KVM_USER_MEM_SLOTS; 3568 default: 3569 break; 3570 } 3571 return kvm_vm_ioctl_check_extension(kvm, arg); 3572 } 3573 3574 int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm, 
3575 struct kvm_enable_cap *cap) 3576 { 3577 return -EINVAL; 3578 } 3579 3580 static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, 3581 struct kvm_enable_cap *cap) 3582 { 3583 switch (cap->cap) { 3584 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 3585 case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: { 3586 u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE; 3587 3588 if (cap->args[0] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE) 3589 allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS; 3590 3591 if (cap->flags || (cap->args[0] & ~allowed_options)) 3592 return -EINVAL; 3593 kvm->manual_dirty_log_protect = cap->args[0]; 3594 return 0; 3595 } 3596 #endif 3597 case KVM_CAP_HALT_POLL: { 3598 if (cap->flags || cap->args[0] != (unsigned int)cap->args[0]) 3599 return -EINVAL; 3600 3601 kvm->max_halt_poll_ns = cap->args[0]; 3602 return 0; 3603 } 3604 default: 3605 return kvm_vm_ioctl_enable_cap(kvm, cap); 3606 } 3607 } 3608 3609 static long kvm_vm_ioctl(struct file *filp, 3610 unsigned int ioctl, unsigned long arg) 3611 { 3612 struct kvm *kvm = filp->private_data; 3613 void __user *argp = (void __user *)arg; 3614 int r; 3615 3616 if (kvm->mm != current->mm) 3617 return -EIO; 3618 switch (ioctl) { 3619 case KVM_CREATE_VCPU: 3620 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 3621 break; 3622 case KVM_ENABLE_CAP: { 3623 struct kvm_enable_cap cap; 3624 3625 r = -EFAULT; 3626 if (copy_from_user(&cap, argp, sizeof(cap))) 3627 goto out; 3628 r = kvm_vm_ioctl_enable_cap_generic(kvm, &cap); 3629 break; 3630 } 3631 case KVM_SET_USER_MEMORY_REGION: { 3632 struct kvm_userspace_memory_region kvm_userspace_mem; 3633 3634 r = -EFAULT; 3635 if (copy_from_user(&kvm_userspace_mem, argp, 3636 sizeof(kvm_userspace_mem))) 3637 goto out; 3638 3639 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); 3640 break; 3641 } 3642 case KVM_GET_DIRTY_LOG: { 3643 struct kvm_dirty_log log; 3644 3645 r = -EFAULT; 3646 if (copy_from_user(&log, argp, sizeof(log))) 3647 goto out; 3648 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 3649 break; 3650 } 3651 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT 3652 case KVM_CLEAR_DIRTY_LOG: { 3653 struct kvm_clear_dirty_log log; 3654 3655 r = -EFAULT; 3656 if (copy_from_user(&log, argp, sizeof(log))) 3657 goto out; 3658 r = kvm_vm_ioctl_clear_dirty_log(kvm, &log); 3659 break; 3660 } 3661 #endif 3662 #ifdef CONFIG_KVM_MMIO 3663 case KVM_REGISTER_COALESCED_MMIO: { 3664 struct kvm_coalesced_mmio_zone zone; 3665 3666 r = -EFAULT; 3667 if (copy_from_user(&zone, argp, sizeof(zone))) 3668 goto out; 3669 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 3670 break; 3671 } 3672 case KVM_UNREGISTER_COALESCED_MMIO: { 3673 struct kvm_coalesced_mmio_zone zone; 3674 3675 r = -EFAULT; 3676 if (copy_from_user(&zone, argp, sizeof(zone))) 3677 goto out; 3678 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 3679 break; 3680 } 3681 #endif 3682 case KVM_IRQFD: { 3683 struct kvm_irqfd data; 3684 3685 r = -EFAULT; 3686 if (copy_from_user(&data, argp, sizeof(data))) 3687 goto out; 3688 r = kvm_irqfd(kvm, &data); 3689 break; 3690 } 3691 case KVM_IOEVENTFD: { 3692 struct kvm_ioeventfd data; 3693 3694 r = -EFAULT; 3695 if (copy_from_user(&data, argp, sizeof(data))) 3696 goto out; 3697 r = kvm_ioeventfd(kvm, &data); 3698 break; 3699 } 3700 #ifdef CONFIG_HAVE_KVM_MSI 3701 case KVM_SIGNAL_MSI: { 3702 struct kvm_msi msi; 3703 3704 r = -EFAULT; 3705 if (copy_from_user(&msi, argp, sizeof(msi))) 3706 goto out; 3707 r = kvm_send_userspace_msi(kvm, &msi); 3708 break; 3709 } 3710 #endif 3711 #ifdef __KVM_HAVE_IRQ_LINE 3712 case 
KVM_IRQ_LINE_STATUS: 3713 case KVM_IRQ_LINE: { 3714 struct kvm_irq_level irq_event; 3715 3716 r = -EFAULT; 3717 if (copy_from_user(&irq_event, argp, sizeof(irq_event))) 3718 goto out; 3719 3720 r = kvm_vm_ioctl_irq_line(kvm, &irq_event, 3721 ioctl == KVM_IRQ_LINE_STATUS); 3722 if (r) 3723 goto out; 3724 3725 r = -EFAULT; 3726 if (ioctl == KVM_IRQ_LINE_STATUS) { 3727 if (copy_to_user(argp, &irq_event, sizeof(irq_event))) 3728 goto out; 3729 } 3730 3731 r = 0; 3732 break; 3733 } 3734 #endif 3735 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING 3736 case KVM_SET_GSI_ROUTING: { 3737 struct kvm_irq_routing routing; 3738 struct kvm_irq_routing __user *urouting; 3739 struct kvm_irq_routing_entry *entries = NULL; 3740 3741 r = -EFAULT; 3742 if (copy_from_user(&routing, argp, sizeof(routing))) 3743 goto out; 3744 r = -EINVAL; 3745 if (!kvm_arch_can_set_irq_routing(kvm)) 3746 goto out; 3747 if (routing.nr > KVM_MAX_IRQ_ROUTES) 3748 goto out; 3749 if (routing.flags) 3750 goto out; 3751 if (routing.nr) { 3752 urouting = argp; 3753 entries = vmemdup_user(urouting->entries, 3754 array_size(sizeof(*entries), 3755 routing.nr)); 3756 if (IS_ERR(entries)) { 3757 r = PTR_ERR(entries); 3758 goto out; 3759 } 3760 } 3761 r = kvm_set_irq_routing(kvm, entries, routing.nr, 3762 routing.flags); 3763 kvfree(entries); 3764 break; 3765 } 3766 #endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */ 3767 case KVM_CREATE_DEVICE: { 3768 struct kvm_create_device cd; 3769 3770 r = -EFAULT; 3771 if (copy_from_user(&cd, argp, sizeof(cd))) 3772 goto out; 3773 3774 r = kvm_ioctl_create_device(kvm, &cd); 3775 if (r) 3776 goto out; 3777 3778 r = -EFAULT; 3779 if (copy_to_user(argp, &cd, sizeof(cd))) 3780 goto out; 3781 3782 r = 0; 3783 break; 3784 } 3785 case KVM_CHECK_EXTENSION: 3786 r = kvm_vm_ioctl_check_extension_generic(kvm, arg); 3787 break; 3788 default: 3789 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 3790 } 3791 out: 3792 return r; 3793 } 3794 3795 #ifdef CONFIG_KVM_COMPAT 3796 struct compat_kvm_dirty_log { 3797 __u32 slot; 3798 __u32 padding1; 3799 union { 3800 compat_uptr_t dirty_bitmap; /* one bit per page */ 3801 __u64 padding2; 3802 }; 3803 }; 3804 3805 static long kvm_vm_compat_ioctl(struct file *filp, 3806 unsigned int ioctl, unsigned long arg) 3807 { 3808 struct kvm *kvm = filp->private_data; 3809 int r; 3810 3811 if (kvm->mm != current->mm) 3812 return -EIO; 3813 switch (ioctl) { 3814 case KVM_GET_DIRTY_LOG: { 3815 struct compat_kvm_dirty_log compat_log; 3816 struct kvm_dirty_log log; 3817 3818 if (copy_from_user(&compat_log, (void __user *)arg, 3819 sizeof(compat_log))) 3820 return -EFAULT; 3821 log.slot = compat_log.slot; 3822 log.padding1 = compat_log.padding1; 3823 log.padding2 = compat_log.padding2; 3824 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); 3825 3826 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 3827 break; 3828 } 3829 default: 3830 r = kvm_vm_ioctl(filp, ioctl, arg); 3831 } 3832 return r; 3833 } 3834 #endif 3835 3836 static struct file_operations kvm_vm_fops = { 3837 .release = kvm_vm_release, 3838 .unlocked_ioctl = kvm_vm_ioctl, 3839 .llseek = noop_llseek, 3840 KVM_COMPAT(kvm_vm_compat_ioctl), 3841 }; 3842 3843 static int kvm_dev_ioctl_create_vm(unsigned long type) 3844 { 3845 int r; 3846 struct kvm *kvm; 3847 struct file *file; 3848 3849 kvm = kvm_create_vm(type); 3850 if (IS_ERR(kvm)) 3851 return PTR_ERR(kvm); 3852 #ifdef CONFIG_KVM_MMIO 3853 r = kvm_coalesced_mmio_init(kvm); 3854 if (r < 0) 3855 goto put_kvm; 3856 #endif 3857 r = get_unused_fd_flags(O_CLOEXEC); 3858 if (r < 0) 3859 goto put_kvm; 3860 3861 file = 
anon_inode_getfile("kvm-vm", &kvm_vm_fops, kvm, O_RDWR); 3862 if (IS_ERR(file)) { 3863 put_unused_fd(r); 3864 r = PTR_ERR(file); 3865 goto put_kvm; 3866 } 3867 3868 /* 3869 * Don't call kvm_put_kvm anymore at this point; file->f_op is 3870 * already set, with ->release() being kvm_vm_release(). In error 3871 * cases it will be called by the final fput(file) and will take 3872 * care of doing kvm_put_kvm(kvm). 3873 */ 3874 if (kvm_create_vm_debugfs(kvm, r) < 0) { 3875 put_unused_fd(r); 3876 fput(file); 3877 return -ENOMEM; 3878 } 3879 kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm); 3880 3881 fd_install(r, file); 3882 return r; 3883 3884 put_kvm: 3885 kvm_put_kvm(kvm); 3886 return r; 3887 } 3888 3889 static long kvm_dev_ioctl(struct file *filp, 3890 unsigned int ioctl, unsigned long arg) 3891 { 3892 long r = -EINVAL; 3893 3894 switch (ioctl) { 3895 case KVM_GET_API_VERSION: 3896 if (arg) 3897 goto out; 3898 r = KVM_API_VERSION; 3899 break; 3900 case KVM_CREATE_VM: 3901 r = kvm_dev_ioctl_create_vm(arg); 3902 break; 3903 case KVM_CHECK_EXTENSION: 3904 r = kvm_vm_ioctl_check_extension_generic(NULL, arg); 3905 break; 3906 case KVM_GET_VCPU_MMAP_SIZE: 3907 if (arg) 3908 goto out; 3909 r = PAGE_SIZE; /* struct kvm_run */ 3910 #ifdef CONFIG_X86 3911 r += PAGE_SIZE; /* pio data page */ 3912 #endif 3913 #ifdef CONFIG_KVM_MMIO 3914 r += PAGE_SIZE; /* coalesced mmio ring page */ 3915 #endif 3916 break; 3917 case KVM_TRACE_ENABLE: 3918 case KVM_TRACE_PAUSE: 3919 case KVM_TRACE_DISABLE: 3920 r = -EOPNOTSUPP; 3921 break; 3922 default: 3923 return kvm_arch_dev_ioctl(filp, ioctl, arg); 3924 } 3925 out: 3926 return r; 3927 } 3928 3929 static struct file_operations kvm_chardev_ops = { 3930 .unlocked_ioctl = kvm_dev_ioctl, 3931 .llseek = noop_llseek, 3932 KVM_COMPAT(kvm_dev_ioctl), 3933 }; 3934 3935 static struct miscdevice kvm_dev = { 3936 KVM_MINOR, 3937 "kvm", 3938 &kvm_chardev_ops, 3939 }; 3940 3941 static void hardware_enable_nolock(void *junk) 3942 { 3943 int cpu = raw_smp_processor_id(); 3944 int r; 3945 3946 if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) 3947 return; 3948 3949 cpumask_set_cpu(cpu, cpus_hardware_enabled); 3950 3951 r = kvm_arch_hardware_enable(); 3952 3953 if (r) { 3954 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 3955 atomic_inc(&hardware_enable_failed); 3956 pr_info("kvm: enabling virtualization on CPU%d failed\n", cpu); 3957 } 3958 } 3959 3960 static int kvm_starting_cpu(unsigned int cpu) 3961 { 3962 raw_spin_lock(&kvm_count_lock); 3963 if (kvm_usage_count) 3964 hardware_enable_nolock(NULL); 3965 raw_spin_unlock(&kvm_count_lock); 3966 return 0; 3967 } 3968 3969 static void hardware_disable_nolock(void *junk) 3970 { 3971 int cpu = raw_smp_processor_id(); 3972 3973 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) 3974 return; 3975 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 3976 kvm_arch_hardware_disable(); 3977 } 3978 3979 static int kvm_dying_cpu(unsigned int cpu) 3980 { 3981 raw_spin_lock(&kvm_count_lock); 3982 if (kvm_usage_count) 3983 hardware_disable_nolock(NULL); 3984 raw_spin_unlock(&kvm_count_lock); 3985 return 0; 3986 } 3987 3988 static void hardware_disable_all_nolock(void) 3989 { 3990 BUG_ON(!kvm_usage_count); 3991 3992 kvm_usage_count--; 3993 if (!kvm_usage_count) 3994 on_each_cpu(hardware_disable_nolock, NULL, 1); 3995 } 3996 3997 static void hardware_disable_all(void) 3998 { 3999 raw_spin_lock(&kvm_count_lock); 4000 hardware_disable_all_nolock(); 4001 raw_spin_unlock(&kvm_count_lock); 4002 } 4003 4004 static int hardware_enable_all(void) 4005 { 4006 int r = 
0; 4007 4008 raw_spin_lock(&kvm_count_lock); 4009 4010 kvm_usage_count++; 4011 if (kvm_usage_count == 1) { 4012 atomic_set(&hardware_enable_failed, 0); 4013 on_each_cpu(hardware_enable_nolock, NULL, 1); 4014 4015 if (atomic_read(&hardware_enable_failed)) { 4016 hardware_disable_all_nolock(); 4017 r = -EBUSY; 4018 } 4019 } 4020 4021 raw_spin_unlock(&kvm_count_lock); 4022 4023 return r; 4024 } 4025 4026 static int kvm_reboot(struct notifier_block *notifier, unsigned long val, 4027 void *v) 4028 { 4029 /* 4030 * Some (well, at least mine) BIOSes hang on reboot if 4031 * in vmx root mode. 4032 * 4033 * And Intel TXT required VMX off for all cpu when system shutdown. 4034 */ 4035 pr_info("kvm: exiting hardware virtualization\n"); 4036 kvm_rebooting = true; 4037 on_each_cpu(hardware_disable_nolock, NULL, 1); 4038 return NOTIFY_OK; 4039 } 4040 4041 static struct notifier_block kvm_reboot_notifier = { 4042 .notifier_call = kvm_reboot, 4043 .priority = 0, 4044 }; 4045 4046 static void kvm_io_bus_destroy(struct kvm_io_bus *bus) 4047 { 4048 int i; 4049 4050 for (i = 0; i < bus->dev_count; i++) { 4051 struct kvm_io_device *pos = bus->range[i].dev; 4052 4053 kvm_iodevice_destructor(pos); 4054 } 4055 kfree(bus); 4056 } 4057 4058 static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1, 4059 const struct kvm_io_range *r2) 4060 { 4061 gpa_t addr1 = r1->addr; 4062 gpa_t addr2 = r2->addr; 4063 4064 if (addr1 < addr2) 4065 return -1; 4066 4067 /* If r2->len == 0, match the exact address. If r2->len != 0, 4068 * accept any overlapping write. Any order is acceptable for 4069 * overlapping ranges, because kvm_io_bus_get_first_dev ensures 4070 * we process all of them. 4071 */ 4072 if (r2->len) { 4073 addr1 += r1->len; 4074 addr2 += r2->len; 4075 } 4076 4077 if (addr1 > addr2) 4078 return 1; 4079 4080 return 0; 4081 } 4082 4083 static int kvm_io_bus_sort_cmp(const void *p1, const void *p2) 4084 { 4085 return kvm_io_bus_cmp(p1, p2); 4086 } 4087 4088 static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus, 4089 gpa_t addr, int len) 4090 { 4091 struct kvm_io_range *range, key; 4092 int off; 4093 4094 key = (struct kvm_io_range) { 4095 .addr = addr, 4096 .len = len, 4097 }; 4098 4099 range = bsearch(&key, bus->range, bus->dev_count, 4100 sizeof(struct kvm_io_range), kvm_io_bus_sort_cmp); 4101 if (range == NULL) 4102 return -ENOENT; 4103 4104 off = range - bus->range; 4105 4106 while (off > 0 && kvm_io_bus_cmp(&key, &bus->range[off-1]) == 0) 4107 off--; 4108 4109 return off; 4110 } 4111 4112 static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus, 4113 struct kvm_io_range *range, const void *val) 4114 { 4115 int idx; 4116 4117 idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len); 4118 if (idx < 0) 4119 return -EOPNOTSUPP; 4120 4121 while (idx < bus->dev_count && 4122 kvm_io_bus_cmp(range, &bus->range[idx]) == 0) { 4123 if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr, 4124 range->len, val)) 4125 return idx; 4126 idx++; 4127 } 4128 4129 return -EOPNOTSUPP; 4130 } 4131 4132 /* kvm_io_bus_write - called under kvm->slots_lock */ 4133 int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr, 4134 int len, const void *val) 4135 { 4136 struct kvm_io_bus *bus; 4137 struct kvm_io_range range; 4138 int r; 4139 4140 range = (struct kvm_io_range) { 4141 .addr = addr, 4142 .len = len, 4143 }; 4144 4145 bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu); 4146 if (!bus) 4147 return -ENOMEM; 4148 r = __kvm_io_bus_write(vcpu, bus, &range, val); 
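	/*
	 * __kvm_io_bus_write() returns the index of the device that accepted
	 * the write, or -EOPNOTSUPP if no device claimed the range; callers
	 * of kvm_io_bus_write() only care about success or failure.
	 */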
/* kvm_io_bus_write - called under kvm->slots_lock */
int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
		     int len, const void *val)
{
	struct kvm_io_bus *bus;
	struct kvm_io_range range;
	int r;

	range = (struct kvm_io_range) {
		.addr = addr,
		.len = len,
	};

	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
	if (!bus)
		return -ENOMEM;
	r = __kvm_io_bus_write(vcpu, bus, &range, val);
	return r < 0 ? r : 0;
}
EXPORT_SYMBOL_GPL(kvm_io_bus_write);

/* kvm_io_bus_write_cookie - called under kvm->slots_lock */
int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
			    gpa_t addr, int len, const void *val, long cookie)
{
	struct kvm_io_bus *bus;
	struct kvm_io_range range;

	range = (struct kvm_io_range) {
		.addr = addr,
		.len = len,
	};

	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
	if (!bus)
		return -ENOMEM;

	/* First try the device referenced by cookie. */
	if ((cookie >= 0) && (cookie < bus->dev_count) &&
	    (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
		if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
					val))
			return cookie;

	/*
	 * cookie contained garbage; fall back to search and return the
	 * correct cookie value.
	 */
	return __kvm_io_bus_write(vcpu, bus, &range, val);
}

static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
			     struct kvm_io_range *range, void *val)
{
	int idx;

	idx = kvm_io_bus_get_first_dev(bus, range->addr, range->len);
	if (idx < 0)
		return -EOPNOTSUPP;

	while (idx < bus->dev_count &&
		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
		if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
				       range->len, val))
			return idx;
		idx++;
	}

	return -EOPNOTSUPP;
}

/* kvm_io_bus_read - called under kvm->slots_lock */
int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
		    int len, void *val)
{
	struct kvm_io_bus *bus;
	struct kvm_io_range range;
	int r;

	range = (struct kvm_io_range) {
		.addr = addr,
		.len = len,
	};

	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
	if (!bus)
		return -ENOMEM;
	r = __kvm_io_bus_read(vcpu, bus, &range, val);
	return r < 0 ? r : 0;
}

/* Caller must hold slots_lock. */
int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
			    int len, struct kvm_io_device *dev)
{
	int i;
	struct kvm_io_bus *new_bus, *bus;
	struct kvm_io_range range;

	bus = kvm_get_bus(kvm, bus_idx);
	if (!bus)
		return -ENOMEM;

	/* exclude ioeventfd which is limited by maximum fd */
	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
		return -ENOSPC;

	new_bus = kmalloc(struct_size(bus, range, bus->dev_count + 1),
			  GFP_KERNEL_ACCOUNT);
	if (!new_bus)
		return -ENOMEM;

	range = (struct kvm_io_range) {
		.addr = addr,
		.len = len,
		.dev = dev,
	};

	for (i = 0; i < bus->dev_count; i++)
		if (kvm_io_bus_cmp(&bus->range[i], &range) > 0)
			break;

	memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
	new_bus->dev_count++;
	new_bus->range[i] = range;
	memcpy(new_bus->range + i + 1, bus->range + i,
	       (bus->dev_count - i) * sizeof(struct kvm_io_range));
	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
	synchronize_srcu_expedited(&kvm->srcu);
	kfree(bus);

	return 0;
}
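/*
 * Registration sketch for an in-kernel device (hypothetical caller and
 * "mydev"/"my_io_ops" naming; the real callers are e.g. the ioeventfd and
 * coalesced MMIO code elsewhere in KVM):
 *
 *	kvm_iodevice_init(&mydev->dev, &my_io_ops);
 *
 *	mutex_lock(&kvm->slots_lock);
 *	ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, gpa, len,
 *				      &mydev->dev);
 *	mutex_unlock(&kvm->slots_lock);
 *
 * Readers never take slots_lock; they rely on the RCU publish and the
 * expedited SRCU synchronization done above before the old bus is freed.
 */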
/* Caller must hold slots_lock. */
void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
			       struct kvm_io_device *dev)
{
	int i;
	struct kvm_io_bus *new_bus, *bus;

	bus = kvm_get_bus(kvm, bus_idx);
	if (!bus)
		return;

	for (i = 0; i < bus->dev_count; i++)
		if (bus->range[i].dev == dev) {
			break;
		}

	if (i == bus->dev_count)
		return;

	new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
			  GFP_KERNEL_ACCOUNT);
	if (!new_bus) {
		pr_err("kvm: failed to shrink bus, removing it completely\n");
		goto broken;
	}

	memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
	new_bus->dev_count--;
	memcpy(new_bus->range + i, bus->range + i + 1,
	       (new_bus->dev_count - i) * sizeof(struct kvm_io_range));

broken:
	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
	synchronize_srcu_expedited(&kvm->srcu);
	kfree(bus);
	return;
}

struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
					 gpa_t addr)
{
	struct kvm_io_bus *bus;
	int dev_idx, srcu_idx;
	struct kvm_io_device *iodev = NULL;

	srcu_idx = srcu_read_lock(&kvm->srcu);

	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
	if (!bus)
		goto out_unlock;

	dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
	if (dev_idx < 0)
		goto out_unlock;

	iodev = bus->range[dev_idx].dev;

out_unlock:
	srcu_read_unlock(&kvm->srcu, srcu_idx);

	return iodev;
}
EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
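/*
 * Read-side note (with an assumed caller for illustration): the write/read
 * paths above expect to run inside an existing SRCU read-side critical
 * section on kvm->srcu (the vcpu ioctl path holds it), whereas
 * kvm_io_bus_get_dev() takes the read lock itself, so a lookup is simply:
 *
 *	struct kvm_io_device *iodev;
 *
 *	iodev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, gpa);
 *	if (!iodev)
 *		return -ENODEV;
 *
 * Lifetime of the returned device remains the owner's responsibility.
 */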
static int kvm_debugfs_open(struct inode *inode, struct file *file,
			    int (*get)(void *, u64 *), int (*set)(void *, u64),
			    const char *fmt)
{
	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
					  inode->i_private;

	/* The debugfs files are a reference to the kvm struct which
	 * is still valid when kvm_destroy_vm is called.
	 * To avoid the race between open and the removal of the debugfs
	 * directory we test against the users count.
	 */
	if (!refcount_inc_not_zero(&stat_data->kvm->users_count))
		return -ENOENT;

	if (simple_attr_open(inode, file, get,
			     KVM_DBGFS_GET_MODE(stat_data->dbgfs_item) & 0222
			     ? set : NULL,
			     fmt)) {
		kvm_put_kvm(stat_data->kvm);
		return -ENOMEM;
	}

	return 0;
}

static int kvm_debugfs_release(struct inode *inode, struct file *file)
{
	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)
					  inode->i_private;

	simple_attr_release(inode, file);
	kvm_put_kvm(stat_data->kvm);

	return 0;
}

static int kvm_get_stat_per_vm(struct kvm *kvm, size_t offset, u64 *val)
{
	*val = *(ulong *)((void *)kvm + offset);

	return 0;
}

static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
{
	*(ulong *)((void *)kvm + offset) = 0;

	return 0;
}

static int kvm_get_stat_per_vcpu(struct kvm *kvm, size_t offset, u64 *val)
{
	int i;
	struct kvm_vcpu *vcpu;

	*val = 0;

	kvm_for_each_vcpu(i, vcpu, kvm)
		*val += *(u64 *)((void *)vcpu + offset);

	return 0;
}

static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
{
	int i;
	struct kvm_vcpu *vcpu;

	kvm_for_each_vcpu(i, vcpu, kvm)
		*(u64 *)((void *)vcpu + offset) = 0;

	return 0;
}

static int kvm_stat_data_get(void *data, u64 *val)
{
	int r = -EFAULT;
	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;

	switch (stat_data->dbgfs_item->kind) {
	case KVM_STAT_VM:
		r = kvm_get_stat_per_vm(stat_data->kvm,
					stat_data->dbgfs_item->offset, val);
		break;
	case KVM_STAT_VCPU:
		r = kvm_get_stat_per_vcpu(stat_data->kvm,
					  stat_data->dbgfs_item->offset, val);
		break;
	}

	return r;
}

static int kvm_stat_data_clear(void *data, u64 val)
{
	int r = -EFAULT;
	struct kvm_stat_data *stat_data = (struct kvm_stat_data *)data;

	if (val)
		return -EINVAL;

	switch (stat_data->dbgfs_item->kind) {
	case KVM_STAT_VM:
		r = kvm_clear_stat_per_vm(stat_data->kvm,
					  stat_data->dbgfs_item->offset);
		break;
	case KVM_STAT_VCPU:
		r = kvm_clear_stat_per_vcpu(stat_data->kvm,
					    stat_data->dbgfs_item->offset);
		break;
	}

	return r;
}

static int kvm_stat_data_open(struct inode *inode, struct file *file)
{
	__simple_attr_check_format("%llu\n", 0ull);
	return kvm_debugfs_open(inode, file, kvm_stat_data_get,
				kvm_stat_data_clear, "%llu\n");
}

static const struct file_operations stat_fops_per_vm = {
	.owner = THIS_MODULE,
	.open = kvm_stat_data_open,
	.release = kvm_debugfs_release,
	.read = simple_attr_read,
	.write = simple_attr_write,
	.llseek = no_llseek,
};

static int vm_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;
	u64 tmp_val;

	*val = 0;
	mutex_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list) {
		kvm_get_stat_per_vm(kvm, offset, &tmp_val);
		*val += tmp_val;
	}
	mutex_unlock(&kvm_lock);
	return 0;
}

static int vm_stat_clear(void *_offset, u64 val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;

	if (val)
		return -EINVAL;

	mutex_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list) {
		kvm_clear_stat_per_vm(kvm, offset);
	}
	mutex_unlock(&kvm_lock);

	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
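/*
 * Usage note for the aggregate statistics: vm_stat_fops above and
 * vcpu_stat_fops below back the files created by kvm_init_debug() directly
 * under the "kvm" debugfs directory (typically
 * /sys/kernel/debug/kvm/<stat-name>, assuming the default debugfs mount).
 * Reading such a file sums the statistic over all VMs on vm_list; writing
 * the value 0 to a writable entry clears it for every VM, and any other
 * value is rejected with -EINVAL.
 */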
static int vcpu_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;
	u64 tmp_val;

	*val = 0;
	mutex_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list) {
		kvm_get_stat_per_vcpu(kvm, offset, &tmp_val);
		*val += tmp_val;
	}
	mutex_unlock(&kvm_lock);
	return 0;
}

static int vcpu_stat_clear(void *_offset, u64 val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;

	if (val)
		return -EINVAL;

	mutex_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list) {
		kvm_clear_stat_per_vcpu(kvm, offset);
	}
	mutex_unlock(&kvm_lock);

	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
			"%llu\n");

static const struct file_operations *stat_fops[] = {
	[KVM_STAT_VCPU] = &vcpu_stat_fops,
	[KVM_STAT_VM]   = &vm_stat_fops,
};

static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
{
	struct kobj_uevent_env *env;
	unsigned long long created, active;

	if (!kvm_dev.this_device || !kvm)
		return;

	mutex_lock(&kvm_lock);
	if (type == KVM_EVENT_CREATE_VM) {
		kvm_createvm_count++;
		kvm_active_vms++;
	} else if (type == KVM_EVENT_DESTROY_VM) {
		kvm_active_vms--;
	}
	created = kvm_createvm_count;
	active = kvm_active_vms;
	mutex_unlock(&kvm_lock);

	env = kzalloc(sizeof(*env), GFP_KERNEL_ACCOUNT);
	if (!env)
		return;

	add_uevent_var(env, "CREATED=%llu", created);
	add_uevent_var(env, "COUNT=%llu", active);

	if (type == KVM_EVENT_CREATE_VM) {
		add_uevent_var(env, "EVENT=create");
		kvm->userspace_pid = task_pid_nr(current);
	} else if (type == KVM_EVENT_DESTROY_VM) {
		add_uevent_var(env, "EVENT=destroy");
	}
	add_uevent_var(env, "PID=%d", kvm->userspace_pid);

	if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) {
		char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);

		if (p) {
			tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
			if (!IS_ERR(tmp))
				add_uevent_var(env, "STATS_PATH=%s", tmp);
			kfree(p);
		}
	}
	/* no need for checks, since we are adding at most only 5 keys */
	env->envp[env->envp_idx++] = NULL;
	kobject_uevent_env(&kvm_dev.this_device->kobj, KOBJ_CHANGE, env->envp);
	kfree(env);
}

static void kvm_init_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);

	kvm_debugfs_num_entries = 0;
	for (p = debugfs_entries; p->name; ++p, kvm_debugfs_num_entries++) {
		debugfs_create_file(p->name, KVM_DBGFS_GET_MODE(p),
				    kvm_debugfs_dir, (void *)(long)p->offset,
				    stat_fops[p->kind]);
	}
}

static int kvm_suspend(void)
{
	if (kvm_usage_count)
		hardware_disable_nolock(NULL);
	return 0;
}

static void kvm_resume(void)
{
	if (kvm_usage_count) {
#ifdef CONFIG_LOCKDEP
		WARN_ON(lockdep_is_held(&kvm_count_lock));
#endif
		hardware_enable_nolock(NULL);
	}
}

static struct syscore_ops kvm_syscore_ops = {
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
	return container_of(pn, struct kvm_vcpu, preempt_notifier);
}
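/*
 * The sched_in/sched_out callbacks below are wired up as preempt notifiers
 * for each vCPU task. A rough sketch of the registration done earlier in
 * this file (kvm_vcpu_init() and vcpu_load(); shown here only for
 * orientation, not verbatim):
 *
 *	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
 *	...
 *	int cpu = get_cpu();
 *	preempt_notifier_register(&vcpu->preempt_notifier);
 *	kvm_arch_vcpu_load(vcpu, cpu);
 *	put_cpu();
 *
 * so the vCPU state is saved and restored whenever the vCPU thread is
 * scheduled out and back in.
 */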
static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	WRITE_ONCE(vcpu->preempted, false);
	WRITE_ONCE(vcpu->ready, false);

	__this_cpu_write(kvm_running_vcpu, vcpu);
	kvm_arch_sched_in(vcpu, cpu);
	kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
			  struct task_struct *next)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	if (current->state == TASK_RUNNING) {
		WRITE_ONCE(vcpu->preempted, true);
		WRITE_ONCE(vcpu->ready, true);
	}
	kvm_arch_vcpu_put(vcpu);
	__this_cpu_write(kvm_running_vcpu, NULL);
}

/**
 * kvm_get_running_vcpu - get the vcpu running on the current CPU.
 *
 * It is safe to disable preemption only around the access to the per-CPU
 * variable and keep using the resolved vcpu pointer afterwards: even if
 * the current thread is migrated to another CPU, the preempt notifier
 * handlers keep the per-CPU variable in sync, so reading it again later
 * would return the same value.
 */
struct kvm_vcpu *kvm_get_running_vcpu(void)
{
	struct kvm_vcpu *vcpu;

	preempt_disable();
	vcpu = __this_cpu_read(kvm_running_vcpu);
	preempt_enable();

	return vcpu;
}
EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);

/**
 * kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
 */
struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void)
{
	return &kvm_running_vcpu;
}

struct kvm_cpu_compat_check {
	void *opaque;
	int *ret;
};

static void check_processor_compat(void *data)
{
	struct kvm_cpu_compat_check *c = data;

	*c->ret = kvm_arch_check_processor_compat(c->opaque);
}

int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
	     struct module *module)
{
	struct kvm_cpu_compat_check c;
	int r;
	int cpu;

	r = kvm_arch_init(opaque);
	if (r)
		goto out_fail;

	/*
	 * kvm_arch_init makes sure there's at most one caller
	 * for architectures that support multiple implementations,
	 * like Intel and AMD on x86.
	 * kvm_arch_init must be called before kvm_irqfd_init to avoid creating
	 * conflicts in case kvm is already set up for another implementation.
	 */
	r = kvm_irqfd_init();
	if (r)
		goto out_irqfd;

	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
		r = -ENOMEM;
		goto out_free_0;
	}

	r = kvm_arch_hardware_setup(opaque);
	if (r < 0)
		goto out_free_1;

	c.ret = &r;
	c.opaque = opaque;
	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu, check_processor_compat, &c, 1);
		if (r < 0)
			goto out_free_2;
	}

	r = cpuhp_setup_state_nocalls(CPUHP_AP_KVM_STARTING, "kvm/cpu:starting",
				      kvm_starting_cpu, kvm_dying_cpu);
	if (r)
		goto out_free_2;
	register_reboot_notifier(&kvm_reboot_notifier);
	/* A kmem cache lets us meet the alignment requirements of fx_save. */
	if (!vcpu_align)
		vcpu_align = __alignof__(struct kvm_vcpu);
	kvm_vcpu_cache =
		kmem_cache_create_usercopy("kvm_vcpu", vcpu_size, vcpu_align,
					   SLAB_ACCOUNT,
					   offsetof(struct kvm_vcpu, arch),
					   sizeof_field(struct kvm_vcpu, arch),
					   NULL);
	if (!kvm_vcpu_cache) {
		r = -ENOMEM;
		goto out_free_3;
	}

	r = kvm_async_pf_init();
	if (r)
		goto out_free;

	kvm_chardev_ops.owner = module;
	kvm_vm_fops.owner = module;
	kvm_vcpu_fops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
		pr_err("kvm: misc device register failed\n");
		goto out_unreg;
	}

	register_syscore_ops(&kvm_syscore_ops);

	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

	kvm_init_debug();

	r = kvm_vfio_ops_init();
	WARN_ON(r);

	return 0;

out_unreg:
	kvm_async_pf_deinit();
out_free:
	kmem_cache_destroy(kvm_vcpu_cache);
out_free_3:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
out_free_2:
	kvm_arch_hardware_unsetup();
out_free_1:
	free_cpumask_var(cpus_hardware_enabled);
out_free_0:
	kvm_irqfd_exit();
out_irqfd:
	kvm_arch_exit();
out_fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

void kvm_exit(void)
{
	debugfs_remove_recursive(kvm_debugfs_dir);
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	kvm_async_pf_deinit();
	unregister_syscore_ops(&kvm_syscore_ops);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	cpuhp_remove_state_nocalls(CPUHP_AP_KVM_STARTING);
	on_each_cpu(hardware_disable_nolock, NULL, 1);
	kvm_arch_hardware_unsetup();
	kvm_arch_exit();
	kvm_irqfd_exit();
	free_cpumask_var(cpus_hardware_enabled);
	kvm_vfio_ops_exit();
}
EXPORT_SYMBOL_GPL(kvm_exit);

struct kvm_vm_worker_thread_context {
	struct kvm *kvm;
	struct task_struct *parent;
	struct completion init_done;
	kvm_vm_thread_fn_t thread_fn;
	uintptr_t data;
	int err;
};

static int kvm_vm_worker_thread(void *context)
{
	/*
	 * The init_context is allocated on the stack of the parent thread, so
	 * we have to locally copy anything that is needed beyond initialization
	 */
	struct kvm_vm_worker_thread_context *init_context = context;
	struct kvm *kvm = init_context->kvm;
	kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
	uintptr_t data = init_context->data;
	int err;

	err = kthread_park(current);
	/* kthread_park(current) is never supposed to return an error */
	WARN_ON(err != 0);
	if (err)
		goto init_complete;

	err = cgroup_attach_task_all(init_context->parent, current);
	if (err) {
		kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
			__func__, err);
		goto init_complete;
	}

	set_user_nice(current, task_nice(init_context->parent));

init_complete:
	init_context->err = err;
	complete(&init_context->init_done);
	init_context = NULL;

	if (err)
		return err;
	/* Wait to be woken up by the spawner before proceeding. */
	kthread_parkme();

	if (!kthread_should_stop())
		err = thread_fn(kvm, data);

	return err;
}

int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
				uintptr_t data, const char *name,
				struct task_struct **thread_ptr)
{
	struct kvm_vm_worker_thread_context init_context = {};
	struct task_struct *thread;

	*thread_ptr = NULL;
	init_context.kvm = kvm;
	init_context.parent = current;
	init_context.thread_fn = thread_fn;
	init_context.data = data;
	init_completion(&init_context.init_done);

	thread = kthread_run(kvm_vm_worker_thread, &init_context,
			     "%s-%d", name, task_pid_nr(current));
	if (IS_ERR(thread))
		return PTR_ERR(thread);

	/* kthread_run is never supposed to return NULL */
	WARN_ON(thread == NULL);

	wait_for_completion(&init_context.init_done);

	if (!init_context.err)
		*thread_ptr = thread;

	return init_context.err;
}
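/*
 * Caller sketch for kvm_vm_create_worker_thread() (the worker function,
 * thread name and storage field below are hypothetical): the kthread
 * starts out parked, so it only enters thread_fn once the caller unparks
 * it, and it is reaped with kthread_stop().
 *
 *	static int my_worker(struct kvm *kvm, uintptr_t data)
 *	{
 *		while (!kthread_should_stop()) {
 *			// periodic per-VM work, then sleep/wait
 *		}
 *		return 0;
 *	}
 *
 *	err = kvm_vm_create_worker_thread(kvm, my_worker, 0, "my-worker",
 *					  &kvm->arch.my_worker_thread);
 *	if (!err)
 *		kthread_unpark(kvm->arch.my_worker_thread);
 *	...
 *	kthread_stop(kvm->arch.my_worker_thread);
 */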