1 /* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * This module enables machines with Intel VT-x extensions to run virtual 5 * machines without emulation or binary translation. 6 * 7 * Copyright (C) 2006 Qumranet, Inc. 8 * 9 * Authors: 10 * Avi Kivity <avi@qumranet.com> 11 * Yaniv Kamay <yaniv@qumranet.com> 12 * 13 * This work is licensed under the terms of the GNU GPL, version 2. See 14 * the COPYING file in the top-level directory. 15 * 16 */ 17 18 #include "iodev.h" 19 20 #include <linux/kvm_host.h> 21 #include <linux/kvm.h> 22 #include <linux/module.h> 23 #include <linux/errno.h> 24 #include <linux/percpu.h> 25 #include <linux/gfp.h> 26 #include <linux/mm.h> 27 #include <linux/miscdevice.h> 28 #include <linux/vmalloc.h> 29 #include <linux/reboot.h> 30 #include <linux/debugfs.h> 31 #include <linux/highmem.h> 32 #include <linux/file.h> 33 #include <linux/sysdev.h> 34 #include <linux/cpu.h> 35 #include <linux/sched.h> 36 #include <linux/cpumask.h> 37 #include <linux/smp.h> 38 #include <linux/anon_inodes.h> 39 #include <linux/profile.h> 40 #include <linux/kvm_para.h> 41 #include <linux/pagemap.h> 42 #include <linux/mman.h> 43 #include <linux/swap.h> 44 45 #include <asm/processor.h> 46 #include <asm/io.h> 47 #include <asm/uaccess.h> 48 #include <asm/pgtable.h> 49 50 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 51 #include "coalesced_mmio.h" 52 #endif 53 54 #ifdef KVM_CAP_DEVICE_ASSIGNMENT 55 #include <linux/pci.h> 56 #include <linux/interrupt.h> 57 #include "irq.h" 58 #endif 59 60 MODULE_AUTHOR("Qumranet"); 61 MODULE_LICENSE("GPL"); 62 63 static int msi2intx = 1; 64 module_param(msi2intx, bool, 0); 65 66 DEFINE_SPINLOCK(kvm_lock); 67 LIST_HEAD(vm_list); 68 69 static cpumask_var_t cpus_hardware_enabled; 70 71 struct kmem_cache *kvm_vcpu_cache; 72 EXPORT_SYMBOL_GPL(kvm_vcpu_cache); 73 74 static __read_mostly struct preempt_ops kvm_preempt_ops; 75 76 struct dentry *kvm_debugfs_dir; 77 78 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 79 unsigned long arg); 80 81 static bool kvm_rebooting; 82 83 #ifdef KVM_CAP_DEVICE_ASSIGNMENT 84 static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, 85 int assigned_dev_id) 86 { 87 struct list_head *ptr; 88 struct kvm_assigned_dev_kernel *match; 89 90 list_for_each(ptr, head) { 91 match = list_entry(ptr, struct kvm_assigned_dev_kernel, list); 92 if (match->assigned_dev_id == assigned_dev_id) 93 return match; 94 } 95 return NULL; 96 } 97 98 static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) 99 { 100 struct kvm_assigned_dev_kernel *assigned_dev; 101 102 assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, 103 interrupt_work); 104 105 /* This is taken to safely inject irq inside the guest. 
When
	 * the interrupt injection (or the ioapic code) uses a
	 * finer-grained lock, update this
	 */
	mutex_lock(&assigned_dev->kvm->lock);
	kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
		    assigned_dev->guest_irq, 1);

	if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_MSI) {
		enable_irq(assigned_dev->host_irq);
		assigned_dev->host_irq_disabled = false;
	}
	mutex_unlock(&assigned_dev->kvm->lock);
}

static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
{
	struct kvm_assigned_dev_kernel *assigned_dev =
		(struct kvm_assigned_dev_kernel *) dev_id;

	schedule_work(&assigned_dev->interrupt_work);

	disable_irq_nosync(irq);
	assigned_dev->host_irq_disabled = true;

	return IRQ_HANDLED;
}

/* Ack the irq line for an assigned device */
static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
{
	struct kvm_assigned_dev_kernel *dev;

	if (kian->gsi == -1)
		return;

	dev = container_of(kian, struct kvm_assigned_dev_kernel,
			   ack_notifier);

	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);

	/* The guest irq may be shared so this ack may be
	 * from another device.
	 */
	if (dev->host_irq_disabled) {
		enable_irq(dev->host_irq);
		dev->host_irq_disabled = false;
	}
}

/* This function implicitly takes the kvm->lock mutex via cancel_work_sync() */
static void kvm_free_assigned_irq(struct kvm *kvm,
				  struct kvm_assigned_dev_kernel *assigned_dev)
{
	if (!irqchip_in_kernel(kvm))
		return;

	kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier);

	if (assigned_dev->irq_source_id != -1)
		kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
	assigned_dev->irq_source_id = -1;

	if (!assigned_dev->irq_requested_type)
		return;

	/*
	 * In kvm_free_assigned_irq(), cancel_work_sync() returns true if:
	 * 1. the work was scheduled and has been cancelled, or
	 * 2. the work callback has already been executed.
	 *
	 * The first case guarantees that the irq is disabled and no more
	 * events will arrive.  In the second case the irq may have been
	 * re-enabled (e.g. for MSI), so we disable it here to prevent
	 * further events.
	 *
	 * Note that this may result in a nested disable if the interrupt
	 * type is INTx, but that is fine since we are about to free it.
	 *
	 * If this function is called as part of VM destruction, make sure
	 * the kvm state is still valid at this point, because we may also
	 * have to wait for interrupt_work to finish.
	 */
	disable_irq_nosync(assigned_dev->host_irq);
	cancel_work_sync(&assigned_dev->interrupt_work);

	free_irq(assigned_dev->host_irq, (void *)assigned_dev);

	if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)
		pci_disable_msi(assigned_dev->dev);

	assigned_dev->irq_requested_type = 0;
}


static void kvm_free_assigned_device(struct kvm *kvm,
				     struct kvm_assigned_dev_kernel
				     *assigned_dev)
{
	kvm_free_assigned_irq(kvm, assigned_dev);

	pci_reset_function(assigned_dev->dev);

	pci_release_regions(assigned_dev->dev);
	pci_disable_device(assigned_dev->dev);
	pci_dev_put(assigned_dev->dev);

	list_del(&assigned_dev->list);
	kfree(assigned_dev);
}

void kvm_free_all_assigned_devices(struct kvm *kvm)
{
	struct list_head *ptr, *ptr2;
	struct kvm_assigned_dev_kernel *assigned_dev;

	list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
		assigned_dev = list_entry(ptr,
					  struct kvm_assigned_dev_kernel,
					  list);

		kvm_free_assigned_device(kvm, assigned_dev);
	}
}

static int assigned_device_update_intx(struct kvm *kvm,
			struct kvm_assigned_dev_kernel *adev,
			struct kvm_assigned_irq *airq)
{
	adev->guest_irq = airq->guest_irq;
	adev->ack_notifier.gsi = airq->guest_irq;

	if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_INTX)
		return 0;

	if (irqchip_in_kernel(kvm)) {
		if (!msi2intx &&
		    (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)) {
			free_irq(adev->host_irq, (void *)adev);
			pci_disable_msi(adev->dev);
		}

		if (!capable(CAP_SYS_RAWIO))
			return -EPERM;

		if (airq->host_irq)
			adev->host_irq = airq->host_irq;
		else
			adev->host_irq = adev->dev->irq;

		/* Even though this is PCI, we don't want to use shared
		 * interrupts. Sharing host devices with guest-assigned devices
		 * on the same interrupt line is not a happy situation: there
		 * are going to be long delays in accepting, acking, etc.
		 */
		if (request_irq(adev->host_irq, kvm_assigned_dev_intr,
				0, "kvm_assigned_intx_device", (void *)adev))
			return -EIO;
	}

	adev->irq_requested_type = KVM_ASSIGNED_DEV_GUEST_INTX |
				   KVM_ASSIGNED_DEV_HOST_INTX;
	return 0;
}

#ifdef CONFIG_X86
static int assigned_device_update_msi(struct kvm *kvm,
			struct kvm_assigned_dev_kernel *adev,
			struct kvm_assigned_irq *airq)
{
	int r;

	adev->guest_irq = airq->guest_irq;
	if (airq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSI) {
		/* x86 does not care about the upper address of the guest
		 * MSI message */
		adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_MSI;
		adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_INTX;
		adev->ack_notifier.gsi = -1;
	} else if (msi2intx) {
		adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_INTX;
		adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_MSI;
		adev->ack_notifier.gsi = airq->guest_irq;
	} else {
		/*
		 * The guest wants device MSI disabled, so disable MSI and
		 * fall back to INTx by default.  Note that this path is
		 * only taken when msi2intx is off.
		 */
		assigned_device_update_intx(kvm, adev, airq);
		return 0;
	}

	if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)
		return 0;

	if (irqchip_in_kernel(kvm)) {
		if (!msi2intx) {
			if (adev->irq_requested_type &
					KVM_ASSIGNED_DEV_HOST_INTX)
				free_irq(adev->host_irq, (void *)adev);

			r = pci_enable_msi(adev->dev);
			if (r)
				return r;
		}

		adev->host_irq = adev->dev->irq;
		if (request_irq(adev->host_irq, kvm_assigned_dev_intr, 0,
				"kvm_assigned_msi_device", (void *)adev))
			return -EIO;
	}

	if (!msi2intx)
		adev->irq_requested_type = KVM_ASSIGNED_DEV_GUEST_MSI;

	adev->irq_requested_type |= KVM_ASSIGNED_DEV_HOST_MSI;
	return 0;
}
#endif

static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
				   struct kvm_assigned_irq
				   *assigned_irq)
{
	int r = 0;
	struct kvm_assigned_dev_kernel *match;
	u32 current_flags = 0, changed_flags;

	mutex_lock(&kvm->lock);

	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      assigned_irq->assigned_dev_id);
	if (!match) {
		mutex_unlock(&kvm->lock);
		return -EINVAL;
	}

	if (!match->irq_requested_type) {
		INIT_WORK(&match->interrupt_work,
			  kvm_assigned_dev_interrupt_work_handler);
		if (irqchip_in_kernel(kvm)) {
			/* Register ack notifier */
			match->ack_notifier.gsi = -1;
			match->ack_notifier.irq_acked =
					kvm_assigned_dev_ack_irq;
			kvm_register_irq_ack_notifier(kvm,
					&match->ack_notifier);

			/* Request IRQ source ID */
			r = kvm_request_irq_source_id(kvm);
			if (r < 0)
				goto out_release;
			else
				match->irq_source_id = r;

#ifdef CONFIG_X86
			/* Determine the host device irq type; the result can
			 * be read back from dev->msi_enabled */
			if (msi2intx)
				pci_enable_msi(match->dev);
#endif
		}
	}

	if ((match->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) &&
	    (match->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_MSI))
		current_flags |= KVM_DEV_IRQ_ASSIGN_ENABLE_MSI;

	changed_flags = assigned_irq->flags ^ current_flags;

	if ((changed_flags & KVM_DEV_IRQ_ASSIGN_MSI_ACTION) ||
	    (msi2intx && match->dev->msi_enabled)) {
#ifdef CONFIG_X86
		r = assigned_device_update_msi(kvm, match, assigned_irq);
		if (r) {
			printk(KERN_WARNING "kvm: failed to enable "
					"MSI device!\n");
			goto out_release;
		}
#else
		r = -ENOTTY;
#endif
	} else if (assigned_irq->host_irq == 0 && match->dev->irq == 0) {
		/* A host device IRQ of 0 means INTx is not supported */
		if (!msi2intx) {
			printk(KERN_WARNING
			       "kvm: waiting for device to enable MSI\n");
			r = 0;
		} else {
			printk(KERN_WARNING
			       "kvm: failed to enable MSI device!\n");
			r = -ENOTTY;
			goto out_release;
		}
	} else {
		/* Non-sharing INTx mode */
		r = assigned_device_update_intx(kvm, match, assigned_irq);
		if (r) {
			printk(KERN_WARNING "kvm: failed to enable "
					"INTx device!\n");
			goto out_release;
		}
	}

	mutex_unlock(&kvm->lock);
	return r;
out_release:
	mutex_unlock(&kvm->lock);
	kvm_free_assigned_device(kvm, match);
	return r;
}

static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
				      struct kvm_assigned_pci_dev *assigned_dev)
{
	int r = 0;
	struct kvm_assigned_dev_kernel *match;
	struct pci_dev *dev;

	down_read(&kvm->slots_lock);
	mutex_lock(&kvm->lock);

	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
assigned_dev->assigned_dev_id); 428 if (match) { 429 /* device already assigned */ 430 r = -EINVAL; 431 goto out; 432 } 433 434 match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); 435 if (match == NULL) { 436 printk(KERN_INFO "%s: Couldn't allocate memory\n", 437 __func__); 438 r = -ENOMEM; 439 goto out; 440 } 441 dev = pci_get_bus_and_slot(assigned_dev->busnr, 442 assigned_dev->devfn); 443 if (!dev) { 444 printk(KERN_INFO "%s: host device not found\n", __func__); 445 r = -EINVAL; 446 goto out_free; 447 } 448 if (pci_enable_device(dev)) { 449 printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); 450 r = -EBUSY; 451 goto out_put; 452 } 453 r = pci_request_regions(dev, "kvm_assigned_device"); 454 if (r) { 455 printk(KERN_INFO "%s: Could not get access to device regions\n", 456 __func__); 457 goto out_disable; 458 } 459 460 pci_reset_function(dev); 461 462 match->assigned_dev_id = assigned_dev->assigned_dev_id; 463 match->host_busnr = assigned_dev->busnr; 464 match->host_devfn = assigned_dev->devfn; 465 match->flags = assigned_dev->flags; 466 match->dev = dev; 467 match->irq_source_id = -1; 468 match->kvm = kvm; 469 470 list_add(&match->list, &kvm->arch.assigned_dev_head); 471 472 if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { 473 if (!kvm->arch.iommu_domain) { 474 r = kvm_iommu_map_guest(kvm); 475 if (r) 476 goto out_list_del; 477 } 478 r = kvm_assign_device(kvm, match); 479 if (r) 480 goto out_list_del; 481 } 482 483 out: 484 mutex_unlock(&kvm->lock); 485 up_read(&kvm->slots_lock); 486 return r; 487 out_list_del: 488 list_del(&match->list); 489 pci_release_regions(dev); 490 out_disable: 491 pci_disable_device(dev); 492 out_put: 493 pci_dev_put(dev); 494 out_free: 495 kfree(match); 496 mutex_unlock(&kvm->lock); 497 up_read(&kvm->slots_lock); 498 return r; 499 } 500 #endif 501 502 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT 503 static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, 504 struct kvm_assigned_pci_dev *assigned_dev) 505 { 506 int r = 0; 507 struct kvm_assigned_dev_kernel *match; 508 509 mutex_lock(&kvm->lock); 510 511 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 512 assigned_dev->assigned_dev_id); 513 if (!match) { 514 printk(KERN_INFO "%s: device hasn't been assigned before, " 515 "so cannot be deassigned\n", __func__); 516 r = -EINVAL; 517 goto out; 518 } 519 520 if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) 521 kvm_deassign_device(kvm, match); 522 523 kvm_free_assigned_device(kvm, match); 524 525 out: 526 mutex_unlock(&kvm->lock); 527 return r; 528 } 529 #endif 530 531 static inline int valid_vcpu(int n) 532 { 533 return likely(n >= 0 && n < KVM_MAX_VCPUS); 534 } 535 536 inline int kvm_is_mmio_pfn(pfn_t pfn) 537 { 538 if (pfn_valid(pfn)) { 539 struct page *page = compound_head(pfn_to_page(pfn)); 540 return PageReserved(page); 541 } 542 543 return true; 544 } 545 546 /* 547 * Switches to specified vcpu, until a matching vcpu_put() 548 */ 549 void vcpu_load(struct kvm_vcpu *vcpu) 550 { 551 int cpu; 552 553 mutex_lock(&vcpu->mutex); 554 cpu = get_cpu(); 555 preempt_notifier_register(&vcpu->preempt_notifier); 556 kvm_arch_vcpu_load(vcpu, cpu); 557 put_cpu(); 558 } 559 560 void vcpu_put(struct kvm_vcpu *vcpu) 561 { 562 preempt_disable(); 563 kvm_arch_vcpu_put(vcpu); 564 preempt_notifier_unregister(&vcpu->preempt_notifier); 565 preempt_enable(); 566 mutex_unlock(&vcpu->mutex); 567 } 568 569 static void ack_flush(void *_completed) 570 { 571 } 572 573 static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) 574 { 575 int i, 
cpu, me; 576 cpumask_var_t cpus; 577 bool called = true; 578 struct kvm_vcpu *vcpu; 579 580 if (alloc_cpumask_var(&cpus, GFP_ATOMIC)) 581 cpumask_clear(cpus); 582 583 me = get_cpu(); 584 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 585 vcpu = kvm->vcpus[i]; 586 if (!vcpu) 587 continue; 588 if (test_and_set_bit(req, &vcpu->requests)) 589 continue; 590 cpu = vcpu->cpu; 591 if (cpus != NULL && cpu != -1 && cpu != me) 592 cpumask_set_cpu(cpu, cpus); 593 } 594 if (unlikely(cpus == NULL)) 595 smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1); 596 else if (!cpumask_empty(cpus)) 597 smp_call_function_many(cpus, ack_flush, NULL, 1); 598 else 599 called = false; 600 put_cpu(); 601 free_cpumask_var(cpus); 602 return called; 603 } 604 605 void kvm_flush_remote_tlbs(struct kvm *kvm) 606 { 607 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 608 ++kvm->stat.remote_tlb_flush; 609 } 610 611 void kvm_reload_remote_mmus(struct kvm *kvm) 612 { 613 make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 614 } 615 616 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 617 { 618 struct page *page; 619 int r; 620 621 mutex_init(&vcpu->mutex); 622 vcpu->cpu = -1; 623 vcpu->kvm = kvm; 624 vcpu->vcpu_id = id; 625 init_waitqueue_head(&vcpu->wq); 626 627 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 628 if (!page) { 629 r = -ENOMEM; 630 goto fail; 631 } 632 vcpu->run = page_address(page); 633 634 r = kvm_arch_vcpu_init(vcpu); 635 if (r < 0) 636 goto fail_free_run; 637 return 0; 638 639 fail_free_run: 640 free_page((unsigned long)vcpu->run); 641 fail: 642 return r; 643 } 644 EXPORT_SYMBOL_GPL(kvm_vcpu_init); 645 646 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) 647 { 648 kvm_arch_vcpu_uninit(vcpu); 649 free_page((unsigned long)vcpu->run); 650 } 651 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); 652 653 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 654 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) 655 { 656 return container_of(mn, struct kvm, mmu_notifier); 657 } 658 659 static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, 660 struct mm_struct *mm, 661 unsigned long address) 662 { 663 struct kvm *kvm = mmu_notifier_to_kvm(mn); 664 int need_tlb_flush; 665 666 /* 667 * When ->invalidate_page runs, the linux pte has been zapped 668 * already but the page is still allocated until 669 * ->invalidate_page returns. So if we increase the sequence 670 * here the kvm page fault will notice if the spte can't be 671 * established because the page is going to be freed. If 672 * instead the kvm page fault establishes the spte before 673 * ->invalidate_page runs, kvm_unmap_hva will release it 674 * before returning. 675 * 676 * The sequence increase only need to be seen at spin_unlock 677 * time, and not at spin_lock time. 678 * 679 * Increasing the sequence after the spin_unlock would be 680 * unsafe because the kvm page fault could then establish the 681 * pte after kvm_unmap_hva returned, without noticing the page 682 * is going to be freed. 
 */
	spin_lock(&kvm->mmu_lock);
	kvm->mmu_notifier_seq++;
	need_tlb_flush = kvm_unmap_hva(kvm, address);
	spin_unlock(&kvm->mmu_lock);

	/* we have to flush the TLB before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);

}

static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
						    struct mm_struct *mm,
						    unsigned long start,
						    unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush = 0;

	spin_lock(&kvm->mmu_lock);
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
	kvm->mmu_notifier_count++;
	for (; start < end; start += PAGE_SIZE)
		need_tlb_flush |= kvm_unmap_hva(kvm, start);
	spin_unlock(&kvm->mmu_lock);

	/* we have to flush the TLB before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);
}

static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
						  struct mm_struct *mm,
						  unsigned long start,
						  unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);

	spin_lock(&kvm->mmu_lock);
	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
	kvm->mmu_notifier_seq++;
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease, but both values are read by the kvm
	 * page fault under the mmu_lock spinlock, so we don't need to
	 * add an smp_wmb() here in between the two.
	 */
	kvm->mmu_notifier_count--;
	spin_unlock(&kvm->mmu_lock);

	BUG_ON(kvm->mmu_notifier_count < 0);
}

static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young;

	spin_lock(&kvm->mmu_lock);
	young = kvm_age_hva(kvm, address);
	spin_unlock(&kvm->mmu_lock);

	if (young)
		kvm_flush_remote_tlbs(kvm);

	return young;
}

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	kvm_arch_flush_shadow(kvm);
}

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
	.release		= kvm_mmu_notifier_release,
};
#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */

static struct kvm *kvm_create_vm(void)
{
	struct kvm *kvm = kvm_arch_create_vm();
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	struct page *page;
#endif

	if (IS_ERR(kvm))
		goto out;
#ifdef CONFIG_HAVE_KVM_IRQCHIP
	INIT_LIST_HEAD(&kvm->irq_routing);
	INIT_HLIST_HEAD(&kvm->mask_notifier_list);
#endif

#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		kfree(kvm);
		return ERR_PTR(-ENOMEM);
	}
	kvm->coalesced_mmio_ring =
			(struct kvm_coalesced_mmio_ring *)page_address(page);
#endif

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	{
804 int err; 805 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; 806 err = mmu_notifier_register(&kvm->mmu_notifier, current->mm); 807 if (err) { 808 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 809 put_page(page); 810 #endif 811 kfree(kvm); 812 return ERR_PTR(err); 813 } 814 } 815 #endif 816 817 kvm->mm = current->mm; 818 atomic_inc(&kvm->mm->mm_count); 819 spin_lock_init(&kvm->mmu_lock); 820 kvm_io_bus_init(&kvm->pio_bus); 821 mutex_init(&kvm->lock); 822 kvm_io_bus_init(&kvm->mmio_bus); 823 init_rwsem(&kvm->slots_lock); 824 atomic_set(&kvm->users_count, 1); 825 spin_lock(&kvm_lock); 826 list_add(&kvm->vm_list, &vm_list); 827 spin_unlock(&kvm_lock); 828 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 829 kvm_coalesced_mmio_init(kvm); 830 #endif 831 out: 832 return kvm; 833 } 834 835 /* 836 * Free any memory in @free but not in @dont. 837 */ 838 static void kvm_free_physmem_slot(struct kvm_memory_slot *free, 839 struct kvm_memory_slot *dont) 840 { 841 if (!dont || free->rmap != dont->rmap) 842 vfree(free->rmap); 843 844 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 845 vfree(free->dirty_bitmap); 846 847 if (!dont || free->lpage_info != dont->lpage_info) 848 vfree(free->lpage_info); 849 850 free->npages = 0; 851 free->dirty_bitmap = NULL; 852 free->rmap = NULL; 853 free->lpage_info = NULL; 854 } 855 856 void kvm_free_physmem(struct kvm *kvm) 857 { 858 int i; 859 860 for (i = 0; i < kvm->nmemslots; ++i) 861 kvm_free_physmem_slot(&kvm->memslots[i], NULL); 862 } 863 864 static void kvm_destroy_vm(struct kvm *kvm) 865 { 866 struct mm_struct *mm = kvm->mm; 867 868 kvm_arch_sync_events(kvm); 869 spin_lock(&kvm_lock); 870 list_del(&kvm->vm_list); 871 spin_unlock(&kvm_lock); 872 kvm_free_irq_routing(kvm); 873 kvm_io_bus_destroy(&kvm->pio_bus); 874 kvm_io_bus_destroy(&kvm->mmio_bus); 875 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 876 if (kvm->coalesced_mmio_ring != NULL) 877 free_page((unsigned long)kvm->coalesced_mmio_ring); 878 #endif 879 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 880 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 881 #endif 882 kvm_arch_destroy_vm(kvm); 883 mmdrop(mm); 884 } 885 886 void kvm_get_kvm(struct kvm *kvm) 887 { 888 atomic_inc(&kvm->users_count); 889 } 890 EXPORT_SYMBOL_GPL(kvm_get_kvm); 891 892 void kvm_put_kvm(struct kvm *kvm) 893 { 894 if (atomic_dec_and_test(&kvm->users_count)) 895 kvm_destroy_vm(kvm); 896 } 897 EXPORT_SYMBOL_GPL(kvm_put_kvm); 898 899 900 static int kvm_vm_release(struct inode *inode, struct file *filp) 901 { 902 struct kvm *kvm = filp->private_data; 903 904 kvm_put_kvm(kvm); 905 return 0; 906 } 907 908 /* 909 * Allocate some memory and give it an address in the guest physical address 910 * space. 911 * 912 * Discontiguous memory is allowed, mostly for framebuffers. 913 * 914 * Must be called holding mmap_sem for write. 
 */
int __kvm_set_memory_region(struct kvm *kvm,
			    struct kvm_userspace_memory_region *mem,
			    int user_alloc)
{
	int r;
	gfn_t base_gfn;
	unsigned long npages;
	int largepages;
	unsigned long i;
	struct kvm_memory_slot *memslot;
	struct kvm_memory_slot old, new;

	r = -EINVAL;
	/* General sanity checks */
	if (mem->memory_size & (PAGE_SIZE - 1))
		goto out;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		goto out;
	if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1)))
		goto out;
	if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
		goto out;
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
		goto out;

	memslot = &kvm->memslots[mem->slot];
	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
	npages = mem->memory_size >> PAGE_SHIFT;

	if (!npages)
		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;

	new = old = *memslot;

	new.base_gfn = base_gfn;
	new.npages = npages;
	new.flags = mem->flags;

	/* Disallow changing a memory slot's size. */
	r = -EINVAL;
	if (npages && old.npages && npages != old.npages)
		goto out_free;

	/* Check for overlaps */
	r = -EEXIST;
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *s = &kvm->memslots[i];

		if (s == memslot || !s->npages)
			continue;
		if (!((base_gfn + npages <= s->base_gfn) ||
		      (base_gfn >= s->base_gfn + s->npages)))
			goto out_free;
	}

	/* Free page dirty bitmap if unneeded */
	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
		new.dirty_bitmap = NULL;

	r = -ENOMEM;

	/* Allocate if a slot is being created */
#ifndef CONFIG_S390
	if (npages && !new.rmap) {
		new.rmap = vmalloc(npages * sizeof(struct page *));

		if (!new.rmap)
			goto out_free;

		memset(new.rmap, 0, npages * sizeof(*new.rmap));

		new.user_alloc = user_alloc;
		/*
		 * hva_to_rmmap() serializes with the mmu_lock and to be
		 * safe it has to ignore memslots with !user_alloc &&
		 * !userspace_addr.
992 */ 993 if (user_alloc) 994 new.userspace_addr = mem->userspace_addr; 995 else 996 new.userspace_addr = 0; 997 } 998 if (npages && !new.lpage_info) { 999 largepages = 1 + (base_gfn + npages - 1) / KVM_PAGES_PER_HPAGE; 1000 largepages -= base_gfn / KVM_PAGES_PER_HPAGE; 1001 1002 new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info)); 1003 1004 if (!new.lpage_info) 1005 goto out_free; 1006 1007 memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info)); 1008 1009 if (base_gfn % KVM_PAGES_PER_HPAGE) 1010 new.lpage_info[0].write_count = 1; 1011 if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE) 1012 new.lpage_info[largepages-1].write_count = 1; 1013 } 1014 1015 /* Allocate page dirty bitmap if needed */ 1016 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 1017 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; 1018 1019 new.dirty_bitmap = vmalloc(dirty_bytes); 1020 if (!new.dirty_bitmap) 1021 goto out_free; 1022 memset(new.dirty_bitmap, 0, dirty_bytes); 1023 } 1024 #endif /* not defined CONFIG_S390 */ 1025 1026 if (!npages) 1027 kvm_arch_flush_shadow(kvm); 1028 1029 spin_lock(&kvm->mmu_lock); 1030 if (mem->slot >= kvm->nmemslots) 1031 kvm->nmemslots = mem->slot + 1; 1032 1033 *memslot = new; 1034 spin_unlock(&kvm->mmu_lock); 1035 1036 r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc); 1037 if (r) { 1038 spin_lock(&kvm->mmu_lock); 1039 *memslot = old; 1040 spin_unlock(&kvm->mmu_lock); 1041 goto out_free; 1042 } 1043 1044 kvm_free_physmem_slot(&old, npages ? &new : NULL); 1045 /* Slot deletion case: we have to update the current slot */ 1046 if (!npages) 1047 *memslot = old; 1048 #ifdef CONFIG_DMAR 1049 /* map the pages in iommu page table */ 1050 r = kvm_iommu_map_pages(kvm, base_gfn, npages); 1051 if (r) 1052 goto out; 1053 #endif 1054 return 0; 1055 1056 out_free: 1057 kvm_free_physmem_slot(&new, &old); 1058 out: 1059 return r; 1060 1061 } 1062 EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 1063 1064 int kvm_set_memory_region(struct kvm *kvm, 1065 struct kvm_userspace_memory_region *mem, 1066 int user_alloc) 1067 { 1068 int r; 1069 1070 down_write(&kvm->slots_lock); 1071 r = __kvm_set_memory_region(kvm, mem, user_alloc); 1072 up_write(&kvm->slots_lock); 1073 return r; 1074 } 1075 EXPORT_SYMBOL_GPL(kvm_set_memory_region); 1076 1077 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 1078 struct 1079 kvm_userspace_memory_region *mem, 1080 int user_alloc) 1081 { 1082 if (mem->slot >= KVM_MEMORY_SLOTS) 1083 return -EINVAL; 1084 return kvm_set_memory_region(kvm, mem, user_alloc); 1085 } 1086 1087 int kvm_get_dirty_log(struct kvm *kvm, 1088 struct kvm_dirty_log *log, int *is_dirty) 1089 { 1090 struct kvm_memory_slot *memslot; 1091 int r, i; 1092 int n; 1093 unsigned long any = 0; 1094 1095 r = -EINVAL; 1096 if (log->slot >= KVM_MEMORY_SLOTS) 1097 goto out; 1098 1099 memslot = &kvm->memslots[log->slot]; 1100 r = -ENOENT; 1101 if (!memslot->dirty_bitmap) 1102 goto out; 1103 1104 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 1105 1106 for (i = 0; !any && i < n/sizeof(long); ++i) 1107 any = memslot->dirty_bitmap[i]; 1108 1109 r = -EFAULT; 1110 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 1111 goto out; 1112 1113 if (any) 1114 *is_dirty = 1; 1115 1116 r = 0; 1117 out: 1118 return r; 1119 } 1120 1121 int is_error_page(struct page *page) 1122 { 1123 return page == bad_page; 1124 } 1125 EXPORT_SYMBOL_GPL(is_error_page); 1126 1127 int is_error_pfn(pfn_t pfn) 1128 { 1129 return pfn == bad_pfn; 1130 } 1131 EXPORT_SYMBOL_GPL(is_error_pfn); 1132 1133 
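/*
 * Illustrative sketch, not part of the driver: the gfn_to_page()/
 * gfn_to_pfn() helpers defined below take a reference on the returned
 * page, so callers are expected to pair them with is_error_page()/
 * is_error_pfn() checks and kvm_release_page_clean()/_dirty().  The
 * helper name and the one-byte write here are made up for the example.
 */
#if 0
static int example_touch_guest_frame(struct kvm *kvm, gfn_t gfn)
{
	struct page *page;
	void *va;

	page = gfn_to_page(kvm, gfn);		/* takes a page reference */
	if (is_error_page(page)) {
		kvm_release_page_clean(page);	/* drop the bad_page reference */
		return -EFAULT;
	}

	va = kmap(page);
	*(u8 *)va = 0;				/* arbitrary guest memory access */
	kunmap(page);

	kvm_release_page_dirty(page);		/* mark dirty and drop the reference */
	return 0;
}
#endif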
static inline unsigned long bad_hva(void)
{
	return PAGE_OFFSET;
}

int kvm_is_error_hva(unsigned long addr)
{
	return addr == bad_hva();
}
EXPORT_SYMBOL_GPL(kvm_is_error_hva);

struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
{
	int i;

	for (i = 0; i < kvm->nmemslots; ++i) {
		struct kvm_memory_slot *memslot = &kvm->memslots[i];

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages)
			return memslot;
	}
	return NULL;
}
EXPORT_SYMBOL_GPL(gfn_to_memslot_unaliased);

struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	gfn = unalias_gfn(kvm, gfn);
	return gfn_to_memslot_unaliased(kvm, gfn);
}

int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
	int i;

	gfn = unalias_gfn(kvm, gfn);
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *memslot = &kvm->memslots[i];

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages)
			return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);

unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;

	gfn = unalias_gfn(kvm, gfn);
	slot = gfn_to_memslot_unaliased(kvm, gfn);
	if (!slot)
		return bad_hva();
	return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
}
EXPORT_SYMBOL_GPL(gfn_to_hva);

pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
{
	struct page *page[1];
	unsigned long addr;
	int npages;
	pfn_t pfn;

	might_sleep();

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr)) {
		get_page(bad_page);
		return page_to_pfn(bad_page);
	}

	npages = get_user_pages_fast(addr, 1, 1, page);

	if (unlikely(npages != 1)) {
		struct vm_area_struct *vma;

		down_read(&current->mm->mmap_sem);
		vma = find_vma(current->mm, addr);

		if (vma == NULL || addr < vma->vm_start ||
		    !(vma->vm_flags & VM_PFNMAP)) {
			up_read(&current->mm->mmap_sem);
			get_page(bad_page);
			return page_to_pfn(bad_page);
		}

		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
		up_read(&current->mm->mmap_sem);
		BUG_ON(!kvm_is_mmio_pfn(pfn));
	} else
		pfn = page_to_pfn(page[0]);

	return pfn;
}

EXPORT_SYMBOL_GPL(gfn_to_pfn);

struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
	pfn_t pfn;

	pfn = gfn_to_pfn(kvm, gfn);
	if (!kvm_is_mmio_pfn(pfn))
		return pfn_to_page(pfn);

	WARN_ON(kvm_is_mmio_pfn(pfn));

	get_page(bad_page);
	return bad_page;
}

EXPORT_SYMBOL_GPL(gfn_to_page);

void kvm_release_page_clean(struct page *page)
{
	kvm_release_pfn_clean(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);

void kvm_release_pfn_clean(pfn_t pfn)
{
	if (!kvm_is_mmio_pfn(pfn))
		put_page(pfn_to_page(pfn));
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);

void kvm_release_page_dirty(struct page *page)
{
	kvm_release_pfn_dirty(page_to_pfn(page));
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);

void kvm_release_pfn_dirty(pfn_t pfn)
{
	kvm_set_pfn_dirty(pfn);
	kvm_release_pfn_clean(pfn);
}
EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);

void
kvm_set_page_dirty(struct page *page) 1277 { 1278 kvm_set_pfn_dirty(page_to_pfn(page)); 1279 } 1280 EXPORT_SYMBOL_GPL(kvm_set_page_dirty); 1281 1282 void kvm_set_pfn_dirty(pfn_t pfn) 1283 { 1284 if (!kvm_is_mmio_pfn(pfn)) { 1285 struct page *page = pfn_to_page(pfn); 1286 if (!PageReserved(page)) 1287 SetPageDirty(page); 1288 } 1289 } 1290 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 1291 1292 void kvm_set_pfn_accessed(pfn_t pfn) 1293 { 1294 if (!kvm_is_mmio_pfn(pfn)) 1295 mark_page_accessed(pfn_to_page(pfn)); 1296 } 1297 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 1298 1299 void kvm_get_pfn(pfn_t pfn) 1300 { 1301 if (!kvm_is_mmio_pfn(pfn)) 1302 get_page(pfn_to_page(pfn)); 1303 } 1304 EXPORT_SYMBOL_GPL(kvm_get_pfn); 1305 1306 static int next_segment(unsigned long len, int offset) 1307 { 1308 if (len > PAGE_SIZE - offset) 1309 return PAGE_SIZE - offset; 1310 else 1311 return len; 1312 } 1313 1314 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 1315 int len) 1316 { 1317 int r; 1318 unsigned long addr; 1319 1320 addr = gfn_to_hva(kvm, gfn); 1321 if (kvm_is_error_hva(addr)) 1322 return -EFAULT; 1323 r = copy_from_user(data, (void __user *)addr + offset, len); 1324 if (r) 1325 return -EFAULT; 1326 return 0; 1327 } 1328 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 1329 1330 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) 1331 { 1332 gfn_t gfn = gpa >> PAGE_SHIFT; 1333 int seg; 1334 int offset = offset_in_page(gpa); 1335 int ret; 1336 1337 while ((seg = next_segment(len, offset)) != 0) { 1338 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 1339 if (ret < 0) 1340 return ret; 1341 offset = 0; 1342 len -= seg; 1343 data += seg; 1344 ++gfn; 1345 } 1346 return 0; 1347 } 1348 EXPORT_SYMBOL_GPL(kvm_read_guest); 1349 1350 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 1351 unsigned long len) 1352 { 1353 int r; 1354 unsigned long addr; 1355 gfn_t gfn = gpa >> PAGE_SHIFT; 1356 int offset = offset_in_page(gpa); 1357 1358 addr = gfn_to_hva(kvm, gfn); 1359 if (kvm_is_error_hva(addr)) 1360 return -EFAULT; 1361 pagefault_disable(); 1362 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 1363 pagefault_enable(); 1364 if (r) 1365 return -EFAULT; 1366 return 0; 1367 } 1368 EXPORT_SYMBOL(kvm_read_guest_atomic); 1369 1370 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, 1371 int offset, int len) 1372 { 1373 int r; 1374 unsigned long addr; 1375 1376 addr = gfn_to_hva(kvm, gfn); 1377 if (kvm_is_error_hva(addr)) 1378 return -EFAULT; 1379 r = copy_to_user((void __user *)addr + offset, data, len); 1380 if (r) 1381 return -EFAULT; 1382 mark_page_dirty(kvm, gfn); 1383 return 0; 1384 } 1385 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 1386 1387 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 1388 unsigned long len) 1389 { 1390 gfn_t gfn = gpa >> PAGE_SHIFT; 1391 int seg; 1392 int offset = offset_in_page(gpa); 1393 int ret; 1394 1395 while ((seg = next_segment(len, offset)) != 0) { 1396 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 1397 if (ret < 0) 1398 return ret; 1399 offset = 0; 1400 len -= seg; 1401 data += seg; 1402 ++gfn; 1403 } 1404 return 0; 1405 } 1406 1407 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 1408 { 1409 return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); 1410 } 1411 EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 1412 1413 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 1414 { 1415 gfn_t gfn = gpa >> PAGE_SHIFT; 
1416 int seg; 1417 int offset = offset_in_page(gpa); 1418 int ret; 1419 1420 while ((seg = next_segment(len, offset)) != 0) { 1421 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 1422 if (ret < 0) 1423 return ret; 1424 offset = 0; 1425 len -= seg; 1426 ++gfn; 1427 } 1428 return 0; 1429 } 1430 EXPORT_SYMBOL_GPL(kvm_clear_guest); 1431 1432 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 1433 { 1434 struct kvm_memory_slot *memslot; 1435 1436 gfn = unalias_gfn(kvm, gfn); 1437 memslot = gfn_to_memslot_unaliased(kvm, gfn); 1438 if (memslot && memslot->dirty_bitmap) { 1439 unsigned long rel_gfn = gfn - memslot->base_gfn; 1440 1441 /* avoid RMW */ 1442 if (!test_bit(rel_gfn, memslot->dirty_bitmap)) 1443 set_bit(rel_gfn, memslot->dirty_bitmap); 1444 } 1445 } 1446 1447 /* 1448 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 1449 */ 1450 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 1451 { 1452 DEFINE_WAIT(wait); 1453 1454 for (;;) { 1455 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 1456 1457 if (kvm_cpu_has_interrupt(vcpu) || 1458 kvm_cpu_has_pending_timer(vcpu) || 1459 kvm_arch_vcpu_runnable(vcpu)) { 1460 set_bit(KVM_REQ_UNHALT, &vcpu->requests); 1461 break; 1462 } 1463 if (signal_pending(current)) 1464 break; 1465 1466 vcpu_put(vcpu); 1467 schedule(); 1468 vcpu_load(vcpu); 1469 } 1470 1471 finish_wait(&vcpu->wq, &wait); 1472 } 1473 1474 void kvm_resched(struct kvm_vcpu *vcpu) 1475 { 1476 if (!need_resched()) 1477 return; 1478 cond_resched(); 1479 } 1480 EXPORT_SYMBOL_GPL(kvm_resched); 1481 1482 static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1483 { 1484 struct kvm_vcpu *vcpu = vma->vm_file->private_data; 1485 struct page *page; 1486 1487 if (vmf->pgoff == 0) 1488 page = virt_to_page(vcpu->run); 1489 #ifdef CONFIG_X86 1490 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 1491 page = virt_to_page(vcpu->arch.pio_data); 1492 #endif 1493 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1494 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 1495 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 1496 #endif 1497 else 1498 return VM_FAULT_SIGBUS; 1499 get_page(page); 1500 vmf->page = page; 1501 return 0; 1502 } 1503 1504 static struct vm_operations_struct kvm_vcpu_vm_ops = { 1505 .fault = kvm_vcpu_fault, 1506 }; 1507 1508 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 1509 { 1510 vma->vm_ops = &kvm_vcpu_vm_ops; 1511 return 0; 1512 } 1513 1514 static int kvm_vcpu_release(struct inode *inode, struct file *filp) 1515 { 1516 struct kvm_vcpu *vcpu = filp->private_data; 1517 1518 kvm_put_kvm(vcpu->kvm); 1519 return 0; 1520 } 1521 1522 static struct file_operations kvm_vcpu_fops = { 1523 .release = kvm_vcpu_release, 1524 .unlocked_ioctl = kvm_vcpu_ioctl, 1525 .compat_ioctl = kvm_vcpu_ioctl, 1526 .mmap = kvm_vcpu_mmap, 1527 }; 1528 1529 /* 1530 * Allocates an inode for the vcpu. 1531 */ 1532 static int create_vcpu_fd(struct kvm_vcpu *vcpu) 1533 { 1534 int fd = anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0); 1535 if (fd < 0) 1536 kvm_put_kvm(vcpu->kvm); 1537 return fd; 1538 } 1539 1540 /* 1541 * Creates some virtual cpus. Good luck creating more than one. 
1542 */ 1543 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) 1544 { 1545 int r; 1546 struct kvm_vcpu *vcpu; 1547 1548 if (!valid_vcpu(n)) 1549 return -EINVAL; 1550 1551 vcpu = kvm_arch_vcpu_create(kvm, n); 1552 if (IS_ERR(vcpu)) 1553 return PTR_ERR(vcpu); 1554 1555 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 1556 1557 r = kvm_arch_vcpu_setup(vcpu); 1558 if (r) 1559 return r; 1560 1561 mutex_lock(&kvm->lock); 1562 if (kvm->vcpus[n]) { 1563 r = -EEXIST; 1564 goto vcpu_destroy; 1565 } 1566 kvm->vcpus[n] = vcpu; 1567 mutex_unlock(&kvm->lock); 1568 1569 /* Now it's all set up, let userspace reach it */ 1570 kvm_get_kvm(kvm); 1571 r = create_vcpu_fd(vcpu); 1572 if (r < 0) 1573 goto unlink; 1574 return r; 1575 1576 unlink: 1577 mutex_lock(&kvm->lock); 1578 kvm->vcpus[n] = NULL; 1579 vcpu_destroy: 1580 mutex_unlock(&kvm->lock); 1581 kvm_arch_vcpu_destroy(vcpu); 1582 return r; 1583 } 1584 1585 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 1586 { 1587 if (sigset) { 1588 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 1589 vcpu->sigset_active = 1; 1590 vcpu->sigset = *sigset; 1591 } else 1592 vcpu->sigset_active = 0; 1593 return 0; 1594 } 1595 1596 static long kvm_vcpu_ioctl(struct file *filp, 1597 unsigned int ioctl, unsigned long arg) 1598 { 1599 struct kvm_vcpu *vcpu = filp->private_data; 1600 void __user *argp = (void __user *)arg; 1601 int r; 1602 struct kvm_fpu *fpu = NULL; 1603 struct kvm_sregs *kvm_sregs = NULL; 1604 1605 if (vcpu->kvm->mm != current->mm) 1606 return -EIO; 1607 switch (ioctl) { 1608 case KVM_RUN: 1609 r = -EINVAL; 1610 if (arg) 1611 goto out; 1612 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 1613 break; 1614 case KVM_GET_REGS: { 1615 struct kvm_regs *kvm_regs; 1616 1617 r = -ENOMEM; 1618 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 1619 if (!kvm_regs) 1620 goto out; 1621 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 1622 if (r) 1623 goto out_free1; 1624 r = -EFAULT; 1625 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 1626 goto out_free1; 1627 r = 0; 1628 out_free1: 1629 kfree(kvm_regs); 1630 break; 1631 } 1632 case KVM_SET_REGS: { 1633 struct kvm_regs *kvm_regs; 1634 1635 r = -ENOMEM; 1636 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 1637 if (!kvm_regs) 1638 goto out; 1639 r = -EFAULT; 1640 if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs))) 1641 goto out_free2; 1642 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 1643 if (r) 1644 goto out_free2; 1645 r = 0; 1646 out_free2: 1647 kfree(kvm_regs); 1648 break; 1649 } 1650 case KVM_GET_SREGS: { 1651 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 1652 r = -ENOMEM; 1653 if (!kvm_sregs) 1654 goto out; 1655 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 1656 if (r) 1657 goto out; 1658 r = -EFAULT; 1659 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 1660 goto out; 1661 r = 0; 1662 break; 1663 } 1664 case KVM_SET_SREGS: { 1665 kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 1666 r = -ENOMEM; 1667 if (!kvm_sregs) 1668 goto out; 1669 r = -EFAULT; 1670 if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs))) 1671 goto out; 1672 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 1673 if (r) 1674 goto out; 1675 r = 0; 1676 break; 1677 } 1678 case KVM_GET_MP_STATE: { 1679 struct kvm_mp_state mp_state; 1680 1681 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 1682 if (r) 1683 goto out; 1684 r = -EFAULT; 1685 if (copy_to_user(argp, &mp_state, sizeof mp_state)) 
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_MP_STATE: {
		struct kvm_mp_state mp_state;

		r = -EFAULT;
		if (copy_from_user(&mp_state, argp, sizeof mp_state))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_TRANSLATE: {
		struct kvm_translation tr;

		r = -EFAULT;
		if (copy_from_user(&tr, argp, sizeof tr))
			goto out;
		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &tr, sizeof tr))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_GUEST_DEBUG: {
		struct kvm_guest_debug dbg;

		r = -EFAULT;
		if (copy_from_user(&dbg, argp, sizeof dbg))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_SIGNAL_MASK: {
		struct kvm_signal_mask __user *sigmask_arg = argp;
		struct kvm_signal_mask kvm_sigmask;
		sigset_t sigset, *p;

		p = NULL;
		if (argp) {
			r = -EFAULT;
			if (copy_from_user(&kvm_sigmask, argp,
					   sizeof kvm_sigmask))
				goto out;
			r = -EINVAL;
			if (kvm_sigmask.len != sizeof sigset)
				goto out;
			r = -EFAULT;
			if (copy_from_user(&sigset, sigmask_arg->sigset,
					   sizeof sigset))
				goto out;
			p = &sigset;
		}
		/* pass NULL when no argument was supplied, so the mask is
		 * cleared rather than set from an uninitialized sigset */
		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
		break;
	}
	case KVM_GET_FPU: {
		fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
		r = -ENOMEM;
		if (!fpu)
			goto out;
		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu)))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_FPU: {
		fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL);
		r = -ENOMEM;
		if (!fpu)
			goto out;
		r = -EFAULT;
		if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu)))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
		if (r)
			goto out;
		r = 0;
		break;
	}
	default:
		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
	}
out:
	kfree(fpu);
	kfree(kvm_sregs);
	return r;
}

static long kvm_vm_ioctl(struct file *filp,
			 unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;

	if (kvm->mm != current->mm)
		return -EIO;
	switch (ioctl) {
	case KVM_CREATE_VCPU:
		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
		if (r < 0)
			goto out;
		break;
	case KVM_SET_USER_MEMORY_REGION: {
		struct kvm_userspace_memory_region kvm_userspace_mem;

		r = -EFAULT;
		if (copy_from_user(&kvm_userspace_mem, argp,
						sizeof kvm_userspace_mem))
			goto out;

		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
		if (r)
			goto out;
		break;
	}
	case KVM_GET_DIRTY_LOG: {
		struct kvm_dirty_log log;

		r = -EFAULT;
		if (copy_from_user(&log, argp, sizeof log))
			goto out;
		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
		if (r)
			goto out;
		break;
	}
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	case KVM_REGISTER_COALESCED_MMIO: {
		struct kvm_coalesced_mmio_zone zone;
		r = -EFAULT;
		if (copy_from_user(&zone, argp, sizeof zone))
			goto out;
		r = -ENXIO;
		r =
kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 1836 if (r) 1837 goto out; 1838 r = 0; 1839 break; 1840 } 1841 case KVM_UNREGISTER_COALESCED_MMIO: { 1842 struct kvm_coalesced_mmio_zone zone; 1843 r = -EFAULT; 1844 if (copy_from_user(&zone, argp, sizeof zone)) 1845 goto out; 1846 r = -ENXIO; 1847 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 1848 if (r) 1849 goto out; 1850 r = 0; 1851 break; 1852 } 1853 #endif 1854 #ifdef KVM_CAP_DEVICE_ASSIGNMENT 1855 case KVM_ASSIGN_PCI_DEVICE: { 1856 struct kvm_assigned_pci_dev assigned_dev; 1857 1858 r = -EFAULT; 1859 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) 1860 goto out; 1861 r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); 1862 if (r) 1863 goto out; 1864 break; 1865 } 1866 case KVM_ASSIGN_IRQ: { 1867 struct kvm_assigned_irq assigned_irq; 1868 1869 r = -EFAULT; 1870 if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) 1871 goto out; 1872 r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); 1873 if (r) 1874 goto out; 1875 break; 1876 } 1877 #endif 1878 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT 1879 case KVM_DEASSIGN_PCI_DEVICE: { 1880 struct kvm_assigned_pci_dev assigned_dev; 1881 1882 r = -EFAULT; 1883 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) 1884 goto out; 1885 r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev); 1886 if (r) 1887 goto out; 1888 break; 1889 } 1890 #endif 1891 #ifdef KVM_CAP_IRQ_ROUTING 1892 case KVM_SET_GSI_ROUTING: { 1893 struct kvm_irq_routing routing; 1894 struct kvm_irq_routing __user *urouting; 1895 struct kvm_irq_routing_entry *entries; 1896 1897 r = -EFAULT; 1898 if (copy_from_user(&routing, argp, sizeof(routing))) 1899 goto out; 1900 r = -EINVAL; 1901 if (routing.nr >= KVM_MAX_IRQ_ROUTES) 1902 goto out; 1903 if (routing.flags) 1904 goto out; 1905 r = -ENOMEM; 1906 entries = vmalloc(routing.nr * sizeof(*entries)); 1907 if (!entries) 1908 goto out; 1909 r = -EFAULT; 1910 urouting = argp; 1911 if (copy_from_user(entries, urouting->entries, 1912 routing.nr * sizeof(*entries))) 1913 goto out_free_irq_routing; 1914 r = kvm_set_irq_routing(kvm, entries, routing.nr, 1915 routing.flags); 1916 out_free_irq_routing: 1917 vfree(entries); 1918 break; 1919 } 1920 #endif 1921 default: 1922 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 1923 } 1924 out: 1925 return r; 1926 } 1927 1928 static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1929 { 1930 struct page *page[1]; 1931 unsigned long addr; 1932 int npages; 1933 gfn_t gfn = vmf->pgoff; 1934 struct kvm *kvm = vma->vm_file->private_data; 1935 1936 addr = gfn_to_hva(kvm, gfn); 1937 if (kvm_is_error_hva(addr)) 1938 return VM_FAULT_SIGBUS; 1939 1940 npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page, 1941 NULL); 1942 if (unlikely(npages != 1)) 1943 return VM_FAULT_SIGBUS; 1944 1945 vmf->page = page[0]; 1946 return 0; 1947 } 1948 1949 static struct vm_operations_struct kvm_vm_vm_ops = { 1950 .fault = kvm_vm_fault, 1951 }; 1952 1953 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) 1954 { 1955 vma->vm_ops = &kvm_vm_vm_ops; 1956 return 0; 1957 } 1958 1959 static struct file_operations kvm_vm_fops = { 1960 .release = kvm_vm_release, 1961 .unlocked_ioctl = kvm_vm_ioctl, 1962 .compat_ioctl = kvm_vm_ioctl, 1963 .mmap = kvm_vm_mmap, 1964 }; 1965 1966 static int kvm_dev_ioctl_create_vm(void) 1967 { 1968 int fd; 1969 struct kvm *kvm; 1970 1971 kvm = kvm_create_vm(); 1972 if (IS_ERR(kvm)) 1973 return PTR_ERR(kvm); 1974 fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, 0); 1975 if (fd < 0) 1976 
kvm_put_kvm(kvm); 1977 1978 return fd; 1979 } 1980 1981 static long kvm_dev_ioctl_check_extension_generic(long arg) 1982 { 1983 switch (arg) { 1984 case KVM_CAP_USER_MEMORY: 1985 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 1986 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 1987 return 1; 1988 #ifdef CONFIG_HAVE_KVM_IRQCHIP 1989 case KVM_CAP_IRQ_ROUTING: 1990 return KVM_MAX_IRQ_ROUTES; 1991 #endif 1992 default: 1993 break; 1994 } 1995 return kvm_dev_ioctl_check_extension(arg); 1996 } 1997 1998 static long kvm_dev_ioctl(struct file *filp, 1999 unsigned int ioctl, unsigned long arg) 2000 { 2001 long r = -EINVAL; 2002 2003 switch (ioctl) { 2004 case KVM_GET_API_VERSION: 2005 r = -EINVAL; 2006 if (arg) 2007 goto out; 2008 r = KVM_API_VERSION; 2009 break; 2010 case KVM_CREATE_VM: 2011 r = -EINVAL; 2012 if (arg) 2013 goto out; 2014 r = kvm_dev_ioctl_create_vm(); 2015 break; 2016 case KVM_CHECK_EXTENSION: 2017 r = kvm_dev_ioctl_check_extension_generic(arg); 2018 break; 2019 case KVM_GET_VCPU_MMAP_SIZE: 2020 r = -EINVAL; 2021 if (arg) 2022 goto out; 2023 r = PAGE_SIZE; /* struct kvm_run */ 2024 #ifdef CONFIG_X86 2025 r += PAGE_SIZE; /* pio data page */ 2026 #endif 2027 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2028 r += PAGE_SIZE; /* coalesced mmio ring page */ 2029 #endif 2030 break; 2031 case KVM_TRACE_ENABLE: 2032 case KVM_TRACE_PAUSE: 2033 case KVM_TRACE_DISABLE: 2034 r = kvm_trace_ioctl(ioctl, arg); 2035 break; 2036 default: 2037 return kvm_arch_dev_ioctl(filp, ioctl, arg); 2038 } 2039 out: 2040 return r; 2041 } 2042 2043 static struct file_operations kvm_chardev_ops = { 2044 .unlocked_ioctl = kvm_dev_ioctl, 2045 .compat_ioctl = kvm_dev_ioctl, 2046 }; 2047 2048 static struct miscdevice kvm_dev = { 2049 KVM_MINOR, 2050 "kvm", 2051 &kvm_chardev_ops, 2052 }; 2053 2054 static void hardware_enable(void *junk) 2055 { 2056 int cpu = raw_smp_processor_id(); 2057 2058 if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) 2059 return; 2060 cpumask_set_cpu(cpu, cpus_hardware_enabled); 2061 kvm_arch_hardware_enable(NULL); 2062 } 2063 2064 static void hardware_disable(void *junk) 2065 { 2066 int cpu = raw_smp_processor_id(); 2067 2068 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) 2069 return; 2070 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 2071 kvm_arch_hardware_disable(NULL); 2072 } 2073 2074 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, 2075 void *v) 2076 { 2077 int cpu = (long)v; 2078 2079 val &= ~CPU_TASKS_FROZEN; 2080 switch (val) { 2081 case CPU_DYING: 2082 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", 2083 cpu); 2084 hardware_disable(NULL); 2085 break; 2086 case CPU_UP_CANCELED: 2087 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", 2088 cpu); 2089 smp_call_function_single(cpu, hardware_disable, NULL, 1); 2090 break; 2091 case CPU_ONLINE: 2092 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", 2093 cpu); 2094 smp_call_function_single(cpu, hardware_enable, NULL, 1); 2095 break; 2096 } 2097 return NOTIFY_OK; 2098 } 2099 2100 2101 asmlinkage void kvm_handle_fault_on_reboot(void) 2102 { 2103 if (kvm_rebooting) 2104 /* spin while reset goes on */ 2105 while (true) 2106 ; 2107 /* Fault while not rebooting. We want the trace. */ 2108 BUG(); 2109 } 2110 EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot); 2111 2112 static int kvm_reboot(struct notifier_block *notifier, unsigned long val, 2113 void *v) 2114 { 2115 if (val == SYS_RESTART) { 2116 /* 2117 * Some (well, at least mine) BIOSes hang on reboot if 2118 * in vmx root mode. 
2119 */ 2120 printk(KERN_INFO "kvm: exiting hardware virtualization\n"); 2121 kvm_rebooting = true; 2122 on_each_cpu(hardware_disable, NULL, 1); 2123 } 2124 return NOTIFY_OK; 2125 } 2126 2127 static struct notifier_block kvm_reboot_notifier = { 2128 .notifier_call = kvm_reboot, 2129 .priority = 0, 2130 }; 2131 2132 void kvm_io_bus_init(struct kvm_io_bus *bus) 2133 { 2134 memset(bus, 0, sizeof(*bus)); 2135 } 2136 2137 void kvm_io_bus_destroy(struct kvm_io_bus *bus) 2138 { 2139 int i; 2140 2141 for (i = 0; i < bus->dev_count; i++) { 2142 struct kvm_io_device *pos = bus->devs[i]; 2143 2144 kvm_iodevice_destructor(pos); 2145 } 2146 } 2147 2148 struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, 2149 gpa_t addr, int len, int is_write) 2150 { 2151 int i; 2152 2153 for (i = 0; i < bus->dev_count; i++) { 2154 struct kvm_io_device *pos = bus->devs[i]; 2155 2156 if (pos->in_range(pos, addr, len, is_write)) 2157 return pos; 2158 } 2159 2160 return NULL; 2161 } 2162 2163 void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev) 2164 { 2165 BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1)); 2166 2167 bus->devs[bus->dev_count++] = dev; 2168 } 2169 2170 static struct notifier_block kvm_cpu_notifier = { 2171 .notifier_call = kvm_cpu_hotplug, 2172 .priority = 20, /* must be > scheduler priority */ 2173 }; 2174 2175 static int vm_stat_get(void *_offset, u64 *val) 2176 { 2177 unsigned offset = (long)_offset; 2178 struct kvm *kvm; 2179 2180 *val = 0; 2181 spin_lock(&kvm_lock); 2182 list_for_each_entry(kvm, &vm_list, vm_list) 2183 *val += *(u32 *)((void *)kvm + offset); 2184 spin_unlock(&kvm_lock); 2185 return 0; 2186 } 2187 2188 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n"); 2189 2190 static int vcpu_stat_get(void *_offset, u64 *val) 2191 { 2192 unsigned offset = (long)_offset; 2193 struct kvm *kvm; 2194 struct kvm_vcpu *vcpu; 2195 int i; 2196 2197 *val = 0; 2198 spin_lock(&kvm_lock); 2199 list_for_each_entry(kvm, &vm_list, vm_list) 2200 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 2201 vcpu = kvm->vcpus[i]; 2202 if (vcpu) 2203 *val += *(u32 *)((void *)vcpu + offset); 2204 } 2205 spin_unlock(&kvm_lock); 2206 return 0; 2207 } 2208 2209 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n"); 2210 2211 static struct file_operations *stat_fops[] = { 2212 [KVM_STAT_VCPU] = &vcpu_stat_fops, 2213 [KVM_STAT_VM] = &vm_stat_fops, 2214 }; 2215 2216 static void kvm_init_debug(void) 2217 { 2218 struct kvm_stats_debugfs_item *p; 2219 2220 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); 2221 for (p = debugfs_entries; p->name; ++p) 2222 p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir, 2223 (void *)(long)p->offset, 2224 stat_fops[p->kind]); 2225 } 2226 2227 static void kvm_exit_debug(void) 2228 { 2229 struct kvm_stats_debugfs_item *p; 2230 2231 for (p = debugfs_entries; p->name; ++p) 2232 debugfs_remove(p->dentry); 2233 debugfs_remove(kvm_debugfs_dir); 2234 } 2235 2236 static int kvm_suspend(struct sys_device *dev, pm_message_t state) 2237 { 2238 hardware_disable(NULL); 2239 return 0; 2240 } 2241 2242 static int kvm_resume(struct sys_device *dev) 2243 { 2244 hardware_enable(NULL); 2245 return 0; 2246 } 2247 2248 static struct sysdev_class kvm_sysdev_class = { 2249 .name = "kvm", 2250 .suspend = kvm_suspend, 2251 .resume = kvm_resume, 2252 }; 2253 2254 static struct sys_device kvm_sysdev = { 2255 .id = 0, 2256 .cls = &kvm_sysdev_class, 2257 }; 2258 2259 struct page *bad_page; 2260 pfn_t bad_pfn; 2261 2262 static inline 2263 struct kvm_vcpu 
*preempt_notifier_to_vcpu(struct preempt_notifier *pn) 2264 { 2265 return container_of(pn, struct kvm_vcpu, preempt_notifier); 2266 } 2267 2268 static void kvm_sched_in(struct preempt_notifier *pn, int cpu) 2269 { 2270 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 2271 2272 kvm_arch_vcpu_load(vcpu, cpu); 2273 } 2274 2275 static void kvm_sched_out(struct preempt_notifier *pn, 2276 struct task_struct *next) 2277 { 2278 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 2279 2280 kvm_arch_vcpu_put(vcpu); 2281 } 2282 2283 int kvm_init(void *opaque, unsigned int vcpu_size, 2284 struct module *module) 2285 { 2286 int r; 2287 int cpu; 2288 2289 kvm_init_debug(); 2290 2291 r = kvm_arch_init(opaque); 2292 if (r) 2293 goto out_fail; 2294 2295 bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO); 2296 2297 if (bad_page == NULL) { 2298 r = -ENOMEM; 2299 goto out; 2300 } 2301 2302 bad_pfn = page_to_pfn(bad_page); 2303 2304 if (!alloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { 2305 r = -ENOMEM; 2306 goto out_free_0; 2307 } 2308 2309 r = kvm_arch_hardware_setup(); 2310 if (r < 0) 2311 goto out_free_0a; 2312 2313 for_each_online_cpu(cpu) { 2314 smp_call_function_single(cpu, 2315 kvm_arch_check_processor_compat, 2316 &r, 1); 2317 if (r < 0) 2318 goto out_free_1; 2319 } 2320 2321 on_each_cpu(hardware_enable, NULL, 1); 2322 r = register_cpu_notifier(&kvm_cpu_notifier); 2323 if (r) 2324 goto out_free_2; 2325 register_reboot_notifier(&kvm_reboot_notifier); 2326 2327 r = sysdev_class_register(&kvm_sysdev_class); 2328 if (r) 2329 goto out_free_3; 2330 2331 r = sysdev_register(&kvm_sysdev); 2332 if (r) 2333 goto out_free_4; 2334 2335 /* A kmem cache lets us meet the alignment requirements of fx_save. */ 2336 kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, 2337 __alignof__(struct kvm_vcpu), 2338 0, NULL); 2339 if (!kvm_vcpu_cache) { 2340 r = -ENOMEM; 2341 goto out_free_5; 2342 } 2343 2344 kvm_chardev_ops.owner = module; 2345 kvm_vm_fops.owner = module; 2346 kvm_vcpu_fops.owner = module; 2347 2348 r = misc_register(&kvm_dev); 2349 if (r) { 2350 printk(KERN_ERR "kvm: misc device register failed\n"); 2351 goto out_free; 2352 } 2353 2354 kvm_preempt_ops.sched_in = kvm_sched_in; 2355 kvm_preempt_ops.sched_out = kvm_sched_out; 2356 #ifndef CONFIG_X86 2357 msi2intx = 0; 2358 #endif 2359 2360 return 0; 2361 2362 out_free: 2363 kmem_cache_destroy(kvm_vcpu_cache); 2364 out_free_5: 2365 sysdev_unregister(&kvm_sysdev); 2366 out_free_4: 2367 sysdev_class_unregister(&kvm_sysdev_class); 2368 out_free_3: 2369 unregister_reboot_notifier(&kvm_reboot_notifier); 2370 unregister_cpu_notifier(&kvm_cpu_notifier); 2371 out_free_2: 2372 on_each_cpu(hardware_disable, NULL, 1); 2373 out_free_1: 2374 kvm_arch_hardware_unsetup(); 2375 out_free_0a: 2376 free_cpumask_var(cpus_hardware_enabled); 2377 out_free_0: 2378 __free_page(bad_page); 2379 out: 2380 kvm_arch_exit(); 2381 kvm_exit_debug(); 2382 out_fail: 2383 return r; 2384 } 2385 EXPORT_SYMBOL_GPL(kvm_init); 2386 2387 void kvm_exit(void) 2388 { 2389 kvm_trace_cleanup(); 2390 misc_deregister(&kvm_dev); 2391 kmem_cache_destroy(kvm_vcpu_cache); 2392 sysdev_unregister(&kvm_sysdev); 2393 sysdev_class_unregister(&kvm_sysdev_class); 2394 unregister_reboot_notifier(&kvm_reboot_notifier); 2395 unregister_cpu_notifier(&kvm_cpu_notifier); 2396 on_each_cpu(hardware_disable, NULL, 1); 2397 kvm_arch_hardware_unsetup(); 2398 kvm_arch_exit(); 2399 kvm_exit_debug(); 2400 free_cpumask_var(cpus_hardware_enabled); 2401 __free_page(bad_page); 2402 } 2403 EXPORT_SYMBOL_GPL(kvm_exit); 2404
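/*
 * Illustrative sketch, not part of this file: kvm_init() and kvm_exit()
 * above are meant to be called from an architecture module's init/exit
 * hooks, passing its ops table as @opaque and the size of its vcpu
 * container so kvm_vcpu_cache is sized correctly.  The structure and
 * function names below are hypothetical placeholders.
 */
#if 0
static int __init example_arch_init(void)
{
	/* struct example_vcpu would embed struct kvm_vcpu as its first member */
	return kvm_init(&example_arch_ops, sizeof(struct example_vcpu),
			THIS_MODULE);
}

static void __exit example_arch_exit(void)
{
	kvm_exit();
}

module_init(example_arch_init);
module_exit(example_arch_exit);
#endif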