1 /* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * This module enables machines with Intel VT-x extensions to run virtual 5 * machines without emulation or binary translation. 6 * 7 * Copyright (C) 2006 Qumranet, Inc. 8 * 9 * Authors: 10 * Avi Kivity <avi@qumranet.com> 11 * Yaniv Kamay <yaniv@qumranet.com> 12 * 13 * This work is licensed under the terms of the GNU GPL, version 2. See 14 * the COPYING file in the top-level directory. 15 * 16 */ 17 18 #include "iodev.h" 19 20 #include <linux/kvm_host.h> 21 #include <linux/kvm.h> 22 #include <linux/module.h> 23 #include <linux/errno.h> 24 #include <linux/percpu.h> 25 #include <linux/gfp.h> 26 #include <linux/mm.h> 27 #include <linux/miscdevice.h> 28 #include <linux/vmalloc.h> 29 #include <linux/reboot.h> 30 #include <linux/debugfs.h> 31 #include <linux/highmem.h> 32 #include <linux/file.h> 33 #include <linux/sysdev.h> 34 #include <linux/cpu.h> 35 #include <linux/sched.h> 36 #include <linux/cpumask.h> 37 #include <linux/smp.h> 38 #include <linux/anon_inodes.h> 39 #include <linux/profile.h> 40 #include <linux/kvm_para.h> 41 #include <linux/pagemap.h> 42 #include <linux/mman.h> 43 #include <linux/swap.h> 44 45 #include <asm/processor.h> 46 #include <asm/io.h> 47 #include <asm/uaccess.h> 48 #include <asm/pgtable.h> 49 50 #ifdef CONFIG_X86 51 #include <asm/msidef.h> 52 #endif 53 54 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 55 #include "coalesced_mmio.h" 56 #endif 57 58 #ifdef KVM_CAP_DEVICE_ASSIGNMENT 59 #include <linux/pci.h> 60 #include <linux/interrupt.h> 61 #include "irq.h" 62 #endif 63 64 MODULE_AUTHOR("Qumranet"); 65 MODULE_LICENSE("GPL"); 66 67 static int msi2intx = 1; 68 module_param(msi2intx, bool, 0); 69 70 DEFINE_SPINLOCK(kvm_lock); 71 LIST_HEAD(vm_list); 72 73 static cpumask_var_t cpus_hardware_enabled; 74 75 struct kmem_cache *kvm_vcpu_cache; 76 EXPORT_SYMBOL_GPL(kvm_vcpu_cache); 77 78 static __read_mostly struct preempt_ops kvm_preempt_ops; 79 80 struct dentry *kvm_debugfs_dir; 81 82 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 83 unsigned long arg); 84 85 static bool kvm_rebooting; 86 87 #ifdef KVM_CAP_DEVICE_ASSIGNMENT 88 89 #ifdef CONFIG_X86 90 static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev) 91 { 92 int vcpu_id; 93 struct kvm_vcpu *vcpu; 94 struct kvm_ioapic *ioapic = ioapic_irqchip(dev->kvm); 95 int dest_id = (dev->guest_msi.address_lo & MSI_ADDR_DEST_ID_MASK) 96 >> MSI_ADDR_DEST_ID_SHIFT; 97 int vector = (dev->guest_msi.data & MSI_DATA_VECTOR_MASK) 98 >> MSI_DATA_VECTOR_SHIFT; 99 int dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT, 100 (unsigned long *)&dev->guest_msi.address_lo); 101 int trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT, 102 (unsigned long *)&dev->guest_msi.data); 103 int delivery_mode = test_bit(MSI_DATA_DELIVERY_MODE_SHIFT, 104 (unsigned long *)&dev->guest_msi.data); 105 u32 deliver_bitmask; 106 107 BUG_ON(!ioapic); 108 109 deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic, 110 dest_id, dest_mode); 111 /* IOAPIC delivery mode value is the same as MSI here */ 112 switch (delivery_mode) { 113 case IOAPIC_LOWEST_PRIORITY: 114 vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector, 115 deliver_bitmask); 116 if (vcpu != NULL) 117 kvm_apic_set_irq(vcpu, vector, trig_mode); 118 else 119 printk(KERN_INFO "kvm: null lowest priority vcpu!\n"); 120 break; 121 case IOAPIC_FIXED: 122 for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) { 123 if (!(deliver_bitmask & (1 << vcpu_id))) 124 continue; 125 deliver_bitmask &= ~(1 << vcpu_id); 126 vcpu = 
ioapic->kvm->vcpus[vcpu_id]; 127 if (vcpu) 128 kvm_apic_set_irq(vcpu, vector, trig_mode); 129 } 130 break; 131 default: 132 printk(KERN_INFO "kvm: unsupported MSI delivery mode\n"); 133 } 134 } 135 #else 136 static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev) {} 137 #endif 138 139 static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, 140 int assigned_dev_id) 141 { 142 struct list_head *ptr; 143 struct kvm_assigned_dev_kernel *match; 144 145 list_for_each(ptr, head) { 146 match = list_entry(ptr, struct kvm_assigned_dev_kernel, list); 147 if (match->assigned_dev_id == assigned_dev_id) 148 return match; 149 } 150 return NULL; 151 } 152 153 static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) 154 { 155 struct kvm_assigned_dev_kernel *assigned_dev; 156 157 assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, 158 interrupt_work); 159 160 /* This lock is taken to safely inject an irq into the guest. When 161 * the interrupt injection (or the ioapic code) uses a 162 * finer-grained lock, update this. 163 */ 164 mutex_lock(&assigned_dev->kvm->lock); 165 if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_INTX) 166 kvm_set_irq(assigned_dev->kvm, 167 assigned_dev->irq_source_id, 168 assigned_dev->guest_irq, 1); 169 else if (assigned_dev->irq_requested_type & 170 KVM_ASSIGNED_DEV_GUEST_MSI) { 171 assigned_device_msi_dispatch(assigned_dev); 172 enable_irq(assigned_dev->host_irq); 173 assigned_dev->host_irq_disabled = false; 174 } 175 mutex_unlock(&assigned_dev->kvm->lock); 176 } 177 178 static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) 179 { 180 struct kvm_assigned_dev_kernel *assigned_dev = 181 (struct kvm_assigned_dev_kernel *) dev_id; 182 183 schedule_work(&assigned_dev->interrupt_work); 184 185 disable_irq_nosync(irq); 186 assigned_dev->host_irq_disabled = true; 187 188 return IRQ_HANDLED; 189 } 190 191 /* Ack the irq line for an assigned device */ 192 static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) 193 { 194 struct kvm_assigned_dev_kernel *dev; 195 196 if (kian->gsi == -1) 197 return; 198 199 dev = container_of(kian, struct kvm_assigned_dev_kernel, 200 ack_notifier); 201 202 kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0); 203 204 /* The guest irq may be shared so this ack may be 205 * from another device. 206 */ 207 if (dev->host_irq_disabled) { 208 enable_irq(dev->host_irq); 209 dev->host_irq_disabled = false; 210 } 211 } 212 213 /* This function implicitly holds the kvm->lock mutex due to cancel_work_sync() */ 214 static void kvm_free_assigned_irq(struct kvm *kvm, 215 struct kvm_assigned_dev_kernel *assigned_dev) 216 { 217 if (!irqchip_in_kernel(kvm)) 218 return; 219 220 kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier); 221 222 if (assigned_dev->irq_source_id != -1) 223 kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); 224 assigned_dev->irq_source_id = -1; 225 226 if (!assigned_dev->irq_requested_type) 227 return; 228 229 /* 230 * In this function, cancel_work_sync() returns true if: 231 * 1. the work was scheduled and then cancelled, or 232 * 2. the work callback was executed. 233 * 234 * The first case ensures that the irq is disabled and no more events 235 * can happen. In the second case, however, the irq may still be enabled 236 * (e.g. for MSI), so we disable the irq here to prevent further events. 237 * 238 * Note that this may result in a nested disable if the interrupt type is 239 * INTx, but that is OK since we are going to free it. 240 * 241 * If this function is called as part of VM destruction, make sure the 242 * kvm state is still valid at this point, since we may also have to 243 * wait for interrupt_work to complete. 244 */ 245 disable_irq_nosync(assigned_dev->host_irq); 246 cancel_work_sync(&assigned_dev->interrupt_work); 247 248 free_irq(assigned_dev->host_irq, (void *)assigned_dev); 249 250 if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) 251 pci_disable_msi(assigned_dev->dev); 252 253 assigned_dev->irq_requested_type = 0; 254 } 255 256 257 static void kvm_free_assigned_device(struct kvm *kvm, 258 struct kvm_assigned_dev_kernel 259 *assigned_dev) 260 { 261 kvm_free_assigned_irq(kvm, assigned_dev); 262 263 pci_reset_function(assigned_dev->dev); 264 265 pci_release_regions(assigned_dev->dev); 266 pci_disable_device(assigned_dev->dev); 267 pci_dev_put(assigned_dev->dev); 268 269 list_del(&assigned_dev->list); 270 kfree(assigned_dev); 271 } 272 273 void kvm_free_all_assigned_devices(struct kvm *kvm) 274 { 275 struct list_head *ptr, *ptr2; 276 struct kvm_assigned_dev_kernel *assigned_dev; 277 278 list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { 279 assigned_dev = list_entry(ptr, 280 struct kvm_assigned_dev_kernel, 281 list); 282 283 kvm_free_assigned_device(kvm, assigned_dev); 284 } 285 } 286 287 static int assigned_device_update_intx(struct kvm *kvm, 288 struct kvm_assigned_dev_kernel *adev, 289 struct kvm_assigned_irq *airq) 290 { 291 adev->guest_irq = airq->guest_irq; 292 adev->ack_notifier.gsi = airq->guest_irq; 293 294 if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_INTX) 295 return 0; 296 297 if (irqchip_in_kernel(kvm)) { 298 if (!msi2intx && 299 (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)) { 300 free_irq(adev->host_irq, (void *)adev); 301 pci_disable_msi(adev->dev); 302 } 303 304 if (!capable(CAP_SYS_RAWIO)) 305 return -EPERM; 306 307 if (airq->host_irq) 308 adev->host_irq = airq->host_irq; 309 else 310 adev->host_irq = adev->dev->irq; 311 312 /* Even though this is PCI, we don't want to use shared 313 * interrupts. Sharing host devices with guest-assigned devices 314 * on the same interrupt line is not a happy situation: there 315 * are going to be long delays in accepting, acking, etc.
316 */ 317 if (request_irq(adev->host_irq, kvm_assigned_dev_intr, 318 0, "kvm_assigned_intx_device", (void *)adev)) 319 return -EIO; 320 } 321 322 adev->irq_requested_type = KVM_ASSIGNED_DEV_GUEST_INTX | 323 KVM_ASSIGNED_DEV_HOST_INTX; 324 return 0; 325 } 326 327 #ifdef CONFIG_X86 328 static int assigned_device_update_msi(struct kvm *kvm, 329 struct kvm_assigned_dev_kernel *adev, 330 struct kvm_assigned_irq *airq) 331 { 332 int r; 333 334 if (airq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSI) { 335 /* x86 doesn't care about the upper address of the guest msi message addr */ 336 adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_MSI; 337 adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_INTX; 338 adev->guest_msi.address_lo = airq->guest_msi.addr_lo; 339 adev->guest_msi.data = airq->guest_msi.data; 340 adev->ack_notifier.gsi = -1; 341 } else if (msi2intx) { 342 adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_INTX; 343 adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_MSI; 344 adev->guest_irq = airq->guest_irq; 345 adev->ack_notifier.gsi = airq->guest_irq; 346 } 347 348 if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) 349 return 0; 350 351 if (irqchip_in_kernel(kvm)) { 352 if (!msi2intx) { 353 if (adev->irq_requested_type & 354 KVM_ASSIGNED_DEV_HOST_INTX) 355 free_irq(adev->host_irq, (void *)adev); 356 357 r = pci_enable_msi(adev->dev); 358 if (r) 359 return r; 360 } 361 362 adev->host_irq = adev->dev->irq; 363 if (request_irq(adev->host_irq, kvm_assigned_dev_intr, 0, 364 "kvm_assigned_msi_device", (void *)adev)) 365 return -EIO; 366 } 367 368 if (!msi2intx) 369 adev->irq_requested_type = KVM_ASSIGNED_DEV_GUEST_MSI; 370 371 adev->irq_requested_type |= KVM_ASSIGNED_DEV_HOST_MSI; 372 return 0; 373 } 374 #endif 375 376 static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, 377 struct kvm_assigned_irq 378 *assigned_irq) 379 { 380 int r = 0; 381 struct kvm_assigned_dev_kernel *match; 382 383 mutex_lock(&kvm->lock); 384 385 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 386 assigned_irq->assigned_dev_id); 387 if (!match) { 388 mutex_unlock(&kvm->lock); 389 return -EINVAL; 390 } 391 392 if (!match->irq_requested_type) { 393 INIT_WORK(&match->interrupt_work, 394 kvm_assigned_dev_interrupt_work_handler); 395 if (irqchip_in_kernel(kvm)) { 396 /* Register ack notifier */ 397 match->ack_notifier.gsi = -1; 398 match->ack_notifier.irq_acked = 399 kvm_assigned_dev_ack_irq; 400 kvm_register_irq_ack_notifier(kvm, 401 &match->ack_notifier); 402 403 /* Request IRQ source ID */ 404 r = kvm_request_irq_source_id(kvm); 405 if (r < 0) 406 goto out_release; 407 else 408 match->irq_source_id = r; 409 410 #ifdef CONFIG_X86 411 /* Determine the host device irq type; we can tell the 412 * result from dev->msi_enabled */ 413 if (msi2intx) 414 pci_enable_msi(match->dev); 415 #endif 416 } 417 } 418 419 if ((!msi2intx && 420 (assigned_irq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSI)) || 421 (msi2intx && match->dev->msi_enabled)) { 422 #ifdef CONFIG_X86 423 r = assigned_device_update_msi(kvm, match, assigned_irq); 424 if (r) { 425 printk(KERN_WARNING "kvm: failed to enable " 426 "MSI device!\n"); 427 goto out_release; 428 } 429 #else 430 r = -ENOTTY; 431 #endif 432 } else if (assigned_irq->host_irq == 0 && match->dev->irq == 0) { 433 /* Host device IRQ 0 means INTx is not supported */ 434 if (!msi2intx) { 435 printk(KERN_WARNING 436 "kvm: wait device to enable MSI!\n"); 437 r = 0; 438 } else { 439 printk(KERN_WARNING 440 "kvm: failed to enable MSI device!\n"); 441 r = -ENOTTY; 442 goto out_release; 443 } 444 } else { 445 /*
Non-sharing INTx mode */ 446 r = assigned_device_update_intx(kvm, match, assigned_irq); 447 if (r) { 448 printk(KERN_WARNING "kvm: failed to enable " 449 "INTx device!\n"); 450 goto out_release; 451 } 452 } 453 454 mutex_unlock(&kvm->lock); 455 return r; 456 out_release: 457 mutex_unlock(&kvm->lock); 458 kvm_free_assigned_device(kvm, match); 459 return r; 460 } 461 462 static int kvm_vm_ioctl_assign_device(struct kvm *kvm, 463 struct kvm_assigned_pci_dev *assigned_dev) 464 { 465 int r = 0; 466 struct kvm_assigned_dev_kernel *match; 467 struct pci_dev *dev; 468 469 down_read(&kvm->slots_lock); 470 mutex_lock(&kvm->lock); 471 472 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 473 assigned_dev->assigned_dev_id); 474 if (match) { 475 /* device already assigned */ 476 r = -EINVAL; 477 goto out; 478 } 479 480 match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); 481 if (match == NULL) { 482 printk(KERN_INFO "%s: Couldn't allocate memory\n", 483 __func__); 484 r = -ENOMEM; 485 goto out; 486 } 487 dev = pci_get_bus_and_slot(assigned_dev->busnr, 488 assigned_dev->devfn); 489 if (!dev) { 490 printk(KERN_INFO "%s: host device not found\n", __func__); 491 r = -EINVAL; 492 goto out_free; 493 } 494 if (pci_enable_device(dev)) { 495 printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); 496 r = -EBUSY; 497 goto out_put; 498 } 499 r = pci_request_regions(dev, "kvm_assigned_device"); 500 if (r) { 501 printk(KERN_INFO "%s: Could not get access to device regions\n", 502 __func__); 503 goto out_disable; 504 } 505 506 pci_reset_function(dev); 507 508 match->assigned_dev_id = assigned_dev->assigned_dev_id; 509 match->host_busnr = assigned_dev->busnr; 510 match->host_devfn = assigned_dev->devfn; 511 match->flags = assigned_dev->flags; 512 match->dev = dev; 513 match->irq_source_id = -1; 514 match->kvm = kvm; 515 516 list_add(&match->list, &kvm->arch.assigned_dev_head); 517 518 if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { 519 if (!kvm->arch.iommu_domain) { 520 r = kvm_iommu_map_guest(kvm); 521 if (r) 522 goto out_list_del; 523 } 524 r = kvm_assign_device(kvm, match); 525 if (r) 526 goto out_list_del; 527 } 528 529 out: 530 mutex_unlock(&kvm->lock); 531 up_read(&kvm->slots_lock); 532 return r; 533 out_list_del: 534 list_del(&match->list); 535 pci_release_regions(dev); 536 out_disable: 537 pci_disable_device(dev); 538 out_put: 539 pci_dev_put(dev); 540 out_free: 541 kfree(match); 542 mutex_unlock(&kvm->lock); 543 up_read(&kvm->slots_lock); 544 return r; 545 } 546 #endif 547 548 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT 549 static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, 550 struct kvm_assigned_pci_dev *assigned_dev) 551 { 552 int r = 0; 553 struct kvm_assigned_dev_kernel *match; 554 555 mutex_lock(&kvm->lock); 556 557 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 558 assigned_dev->assigned_dev_id); 559 if (!match) { 560 printk(KERN_INFO "%s: device hasn't been assigned before, " 561 "so cannot be deassigned\n", __func__); 562 r = -EINVAL; 563 goto out; 564 } 565 566 if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) 567 kvm_deassign_device(kvm, match); 568 569 kvm_free_assigned_device(kvm, match); 570 571 out: 572 mutex_unlock(&kvm->lock); 573 return r; 574 } 575 #endif 576 577 static inline int valid_vcpu(int n) 578 { 579 return likely(n >= 0 && n < KVM_MAX_VCPUS); 580 } 581 582 inline int kvm_is_mmio_pfn(pfn_t pfn) 583 { 584 if (pfn_valid(pfn)) 585 return PageReserved(pfn_to_page(pfn)); 586 587 return true; 588 } 589 590 /* 591 * Switches to 
specified vcpu, until a matching vcpu_put() 592 */ 593 void vcpu_load(struct kvm_vcpu *vcpu) 594 { 595 int cpu; 596 597 mutex_lock(&vcpu->mutex); 598 cpu = get_cpu(); 599 preempt_notifier_register(&vcpu->preempt_notifier); 600 kvm_arch_vcpu_load(vcpu, cpu); 601 put_cpu(); 602 } 603 604 void vcpu_put(struct kvm_vcpu *vcpu) 605 { 606 preempt_disable(); 607 kvm_arch_vcpu_put(vcpu); 608 preempt_notifier_unregister(&vcpu->preempt_notifier); 609 preempt_enable(); 610 mutex_unlock(&vcpu->mutex); 611 } 612 613 static void ack_flush(void *_completed) 614 { 615 } 616 617 static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) 618 { 619 int i, cpu, me; 620 cpumask_var_t cpus; 621 bool called = true; 622 struct kvm_vcpu *vcpu; 623 624 if (alloc_cpumask_var(&cpus, GFP_ATOMIC)) 625 cpumask_clear(cpus); 626 627 me = get_cpu(); 628 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 629 vcpu = kvm->vcpus[i]; 630 if (!vcpu) 631 continue; 632 if (test_and_set_bit(req, &vcpu->requests)) 633 continue; 634 cpu = vcpu->cpu; 635 if (cpus != NULL && cpu != -1 && cpu != me) 636 cpumask_set_cpu(cpu, cpus); 637 } 638 if (unlikely(cpus == NULL)) 639 smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1); 640 else if (!cpumask_empty(cpus)) 641 smp_call_function_many(cpus, ack_flush, NULL, 1); 642 else 643 called = false; 644 put_cpu(); 645 free_cpumask_var(cpus); 646 return called; 647 } 648 649 void kvm_flush_remote_tlbs(struct kvm *kvm) 650 { 651 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 652 ++kvm->stat.remote_tlb_flush; 653 } 654 655 void kvm_reload_remote_mmus(struct kvm *kvm) 656 { 657 make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 658 } 659 660 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 661 { 662 struct page *page; 663 int r; 664 665 mutex_init(&vcpu->mutex); 666 vcpu->cpu = -1; 667 vcpu->kvm = kvm; 668 vcpu->vcpu_id = id; 669 init_waitqueue_head(&vcpu->wq); 670 671 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 672 if (!page) { 673 r = -ENOMEM; 674 goto fail; 675 } 676 vcpu->run = page_address(page); 677 678 r = kvm_arch_vcpu_init(vcpu); 679 if (r < 0) 680 goto fail_free_run; 681 return 0; 682 683 fail_free_run: 684 free_page((unsigned long)vcpu->run); 685 fail: 686 return r; 687 } 688 EXPORT_SYMBOL_GPL(kvm_vcpu_init); 689 690 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) 691 { 692 kvm_arch_vcpu_uninit(vcpu); 693 free_page((unsigned long)vcpu->run); 694 } 695 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); 696 697 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 698 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) 699 { 700 return container_of(mn, struct kvm, mmu_notifier); 701 } 702 703 static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, 704 struct mm_struct *mm, 705 unsigned long address) 706 { 707 struct kvm *kvm = mmu_notifier_to_kvm(mn); 708 int need_tlb_flush; 709 710 /* 711 * When ->invalidate_page runs, the linux pte has been zapped 712 * already but the page is still allocated until 713 * ->invalidate_page returns. So if we increase the sequence 714 * here the kvm page fault will notice if the spte can't be 715 * established because the page is going to be freed. If 716 * instead the kvm page fault establishes the spte before 717 * ->invalidate_page runs, kvm_unmap_hva will release it 718 * before returning. 719 * 720 * The sequence increase only need to be seen at spin_unlock 721 * time, and not at spin_lock time. 
722 * 723 * Increasing the sequence after the spin_unlock would be 724 * unsafe because the kvm page fault could then establish the 725 * pte after kvm_unmap_hva returned, without noticing the page 726 * is going to be freed. 727 */ 728 spin_lock(&kvm->mmu_lock); 729 kvm->mmu_notifier_seq++; 730 need_tlb_flush = kvm_unmap_hva(kvm, address); 731 spin_unlock(&kvm->mmu_lock); 732 733 /* we have to flush the tlb before the pages can be freed */ 734 if (need_tlb_flush) 735 kvm_flush_remote_tlbs(kvm); 736 737 } 738 739 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, 740 struct mm_struct *mm, 741 unsigned long start, 742 unsigned long end) 743 { 744 struct kvm *kvm = mmu_notifier_to_kvm(mn); 745 int need_tlb_flush = 0; 746 747 spin_lock(&kvm->mmu_lock); 748 /* 749 * The count increase must become visible at unlock time as no 750 * spte can be established without taking the mmu_lock and 751 * count is also read inside the mmu_lock critical section. 752 */ 753 kvm->mmu_notifier_count++; 754 for (; start < end; start += PAGE_SIZE) 755 need_tlb_flush |= kvm_unmap_hva(kvm, start); 756 spin_unlock(&kvm->mmu_lock); 757 758 /* we have to flush the tlb before the pages can be freed */ 759 if (need_tlb_flush) 760 kvm_flush_remote_tlbs(kvm); 761 } 762 763 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 764 struct mm_struct *mm, 765 unsigned long start, 766 unsigned long end) 767 { 768 struct kvm *kvm = mmu_notifier_to_kvm(mn); 769 770 spin_lock(&kvm->mmu_lock); 771 /* 772 * This sequence increase will notify the kvm page fault that 773 * the page that is going to be mapped in the spte could have 774 * been freed. 775 */ 776 kvm->mmu_notifier_seq++; 777 /* 778 * The above sequence increase must be visible before the 779 * below count decrease but both values are read by the kvm 780 * page fault under mmu_lock spinlock so we don't need to add 781 * a smp_wmb() here in between the two.
782 */ 783 kvm->mmu_notifier_count--; 784 spin_unlock(&kvm->mmu_lock); 785 786 BUG_ON(kvm->mmu_notifier_count < 0); 787 } 788 789 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, 790 struct mm_struct *mm, 791 unsigned long address) 792 { 793 struct kvm *kvm = mmu_notifier_to_kvm(mn); 794 int young; 795 796 spin_lock(&kvm->mmu_lock); 797 young = kvm_age_hva(kvm, address); 798 spin_unlock(&kvm->mmu_lock); 799 800 if (young) 801 kvm_flush_remote_tlbs(kvm); 802 803 return young; 804 } 805 806 static void kvm_mmu_notifier_release(struct mmu_notifier *mn, 807 struct mm_struct *mm) 808 { 809 struct kvm *kvm = mmu_notifier_to_kvm(mn); 810 kvm_arch_flush_shadow(kvm); 811 } 812 813 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { 814 .invalidate_page = kvm_mmu_notifier_invalidate_page, 815 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 816 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 817 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 818 .release = kvm_mmu_notifier_release, 819 }; 820 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ 821 822 static struct kvm *kvm_create_vm(void) 823 { 824 struct kvm *kvm = kvm_arch_create_vm(); 825 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 826 struct page *page; 827 #endif 828 829 if (IS_ERR(kvm)) 830 goto out; 831 832 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 833 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 834 if (!page) { 835 kfree(kvm); 836 return ERR_PTR(-ENOMEM); 837 } 838 kvm->coalesced_mmio_ring = 839 (struct kvm_coalesced_mmio_ring *)page_address(page); 840 #endif 841 842 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 843 { 844 int err; 845 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; 846 err = mmu_notifier_register(&kvm->mmu_notifier, current->mm); 847 if (err) { 848 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 849 put_page(page); 850 #endif 851 kfree(kvm); 852 return ERR_PTR(err); 853 } 854 } 855 #endif 856 857 kvm->mm = current->mm; 858 atomic_inc(&kvm->mm->mm_count); 859 spin_lock_init(&kvm->mmu_lock); 860 kvm_io_bus_init(&kvm->pio_bus); 861 mutex_init(&kvm->lock); 862 kvm_io_bus_init(&kvm->mmio_bus); 863 init_rwsem(&kvm->slots_lock); 864 atomic_set(&kvm->users_count, 1); 865 spin_lock(&kvm_lock); 866 list_add(&kvm->vm_list, &vm_list); 867 spin_unlock(&kvm_lock); 868 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 869 kvm_coalesced_mmio_init(kvm); 870 #endif 871 out: 872 return kvm; 873 } 874 875 /* 876 * Free any memory in @free but not in @dont. 
877 */ 878 static void kvm_free_physmem_slot(struct kvm_memory_slot *free, 879 struct kvm_memory_slot *dont) 880 { 881 if (!dont || free->rmap != dont->rmap) 882 vfree(free->rmap); 883 884 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 885 vfree(free->dirty_bitmap); 886 887 if (!dont || free->lpage_info != dont->lpage_info) 888 vfree(free->lpage_info); 889 890 free->npages = 0; 891 free->dirty_bitmap = NULL; 892 free->rmap = NULL; 893 free->lpage_info = NULL; 894 } 895 896 void kvm_free_physmem(struct kvm *kvm) 897 { 898 int i; 899 900 for (i = 0; i < kvm->nmemslots; ++i) 901 kvm_free_physmem_slot(&kvm->memslots[i], NULL); 902 } 903 904 static void kvm_destroy_vm(struct kvm *kvm) 905 { 906 struct mm_struct *mm = kvm->mm; 907 908 kvm_arch_sync_events(kvm); 909 spin_lock(&kvm_lock); 910 list_del(&kvm->vm_list); 911 spin_unlock(&kvm_lock); 912 kvm_io_bus_destroy(&kvm->pio_bus); 913 kvm_io_bus_destroy(&kvm->mmio_bus); 914 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 915 if (kvm->coalesced_mmio_ring != NULL) 916 free_page((unsigned long)kvm->coalesced_mmio_ring); 917 #endif 918 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 919 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 920 #endif 921 kvm_arch_destroy_vm(kvm); 922 mmdrop(mm); 923 } 924 925 void kvm_get_kvm(struct kvm *kvm) 926 { 927 atomic_inc(&kvm->users_count); 928 } 929 EXPORT_SYMBOL_GPL(kvm_get_kvm); 930 931 void kvm_put_kvm(struct kvm *kvm) 932 { 933 if (atomic_dec_and_test(&kvm->users_count)) 934 kvm_destroy_vm(kvm); 935 } 936 EXPORT_SYMBOL_GPL(kvm_put_kvm); 937 938 939 static int kvm_vm_release(struct inode *inode, struct file *filp) 940 { 941 struct kvm *kvm = filp->private_data; 942 943 kvm_put_kvm(kvm); 944 return 0; 945 } 946 947 /* 948 * Allocate some memory and give it an address in the guest physical address 949 * space. 950 * 951 * Discontiguous memory is allowed, mostly for framebuffers. 952 * 953 * Must be called holding mmap_sem for write. 954 */ 955 int __kvm_set_memory_region(struct kvm *kvm, 956 struct kvm_userspace_memory_region *mem, 957 int user_alloc) 958 { 959 int r; 960 gfn_t base_gfn; 961 unsigned long npages; 962 unsigned long i; 963 struct kvm_memory_slot *memslot; 964 struct kvm_memory_slot old, new; 965 966 r = -EINVAL; 967 /* General sanity checks */ 968 if (mem->memory_size & (PAGE_SIZE - 1)) 969 goto out; 970 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 971 goto out; 972 if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1))) 973 goto out; 974 if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) 975 goto out; 976 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 977 goto out; 978 979 memslot = &kvm->memslots[mem->slot]; 980 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 981 npages = mem->memory_size >> PAGE_SHIFT; 982 983 if (!npages) 984 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; 985 986 new = old = *memslot; 987 988 new.base_gfn = base_gfn; 989 new.npages = npages; 990 new.flags = mem->flags; 991 992 /* Disallow changing a memory slot's size. 
*/ 993 r = -EINVAL; 994 if (npages && old.npages && npages != old.npages) 995 goto out_free; 996 997 /* Check for overlaps */ 998 r = -EEXIST; 999 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 1000 struct kvm_memory_slot *s = &kvm->memslots[i]; 1001 1002 if (s == memslot) 1003 continue; 1004 if (!((base_gfn + npages <= s->base_gfn) || 1005 (base_gfn >= s->base_gfn + s->npages))) 1006 goto out_free; 1007 } 1008 1009 /* Free page dirty bitmap if unneeded */ 1010 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 1011 new.dirty_bitmap = NULL; 1012 1013 r = -ENOMEM; 1014 1015 /* Allocate if a slot is being created */ 1016 #ifndef CONFIG_S390 1017 if (npages && !new.rmap) { 1018 new.rmap = vmalloc(npages * sizeof(struct page *)); 1019 1020 if (!new.rmap) 1021 goto out_free; 1022 1023 memset(new.rmap, 0, npages * sizeof(*new.rmap)); 1024 1025 new.user_alloc = user_alloc; 1026 /* 1027 * hva_to_rmmap() serializes with the mmu_lock and to be 1028 * safe it has to ignore memslots with !user_alloc && 1029 * !userspace_addr. 1030 */ 1031 if (user_alloc) 1032 new.userspace_addr = mem->userspace_addr; 1033 else 1034 new.userspace_addr = 0; 1035 } 1036 if (npages && !new.lpage_info) { 1037 int largepages = npages / KVM_PAGES_PER_HPAGE; 1038 if (npages % KVM_PAGES_PER_HPAGE) 1039 largepages++; 1040 if (base_gfn % KVM_PAGES_PER_HPAGE) 1041 largepages++; 1042 1043 new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info)); 1044 1045 if (!new.lpage_info) 1046 goto out_free; 1047 1048 memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info)); 1049 1050 if (base_gfn % KVM_PAGES_PER_HPAGE) 1051 new.lpage_info[0].write_count = 1; 1052 if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE) 1053 new.lpage_info[largepages-1].write_count = 1; 1054 } 1055 1056 /* Allocate page dirty bitmap if needed */ 1057 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 1058 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; 1059 1060 new.dirty_bitmap = vmalloc(dirty_bytes); 1061 if (!new.dirty_bitmap) 1062 goto out_free; 1063 memset(new.dirty_bitmap, 0, dirty_bytes); 1064 } 1065 #endif /* not defined CONFIG_S390 */ 1066 1067 if (!npages) 1068 kvm_arch_flush_shadow(kvm); 1069 1070 spin_lock(&kvm->mmu_lock); 1071 if (mem->slot >= kvm->nmemslots) 1072 kvm->nmemslots = mem->slot + 1; 1073 1074 *memslot = new; 1075 spin_unlock(&kvm->mmu_lock); 1076 1077 r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc); 1078 if (r) { 1079 spin_lock(&kvm->mmu_lock); 1080 *memslot = old; 1081 spin_unlock(&kvm->mmu_lock); 1082 goto out_free; 1083 } 1084 1085 kvm_free_physmem_slot(&old, npages ?
&new : NULL); 1086 /* Slot deletion case: we have to update the current slot */ 1087 if (!npages) 1088 *memslot = old; 1089 #ifdef CONFIG_DMAR 1090 /* map the pages in iommu page table */ 1091 r = kvm_iommu_map_pages(kvm, base_gfn, npages); 1092 if (r) 1093 goto out; 1094 #endif 1095 return 0; 1096 1097 out_free: 1098 kvm_free_physmem_slot(&new, &old); 1099 out: 1100 return r; 1101 1102 } 1103 EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 1104 1105 int kvm_set_memory_region(struct kvm *kvm, 1106 struct kvm_userspace_memory_region *mem, 1107 int user_alloc) 1108 { 1109 int r; 1110 1111 down_write(&kvm->slots_lock); 1112 r = __kvm_set_memory_region(kvm, mem, user_alloc); 1113 up_write(&kvm->slots_lock); 1114 return r; 1115 } 1116 EXPORT_SYMBOL_GPL(kvm_set_memory_region); 1117 1118 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 1119 struct 1120 kvm_userspace_memory_region *mem, 1121 int user_alloc) 1122 { 1123 if (mem->slot >= KVM_MEMORY_SLOTS) 1124 return -EINVAL; 1125 return kvm_set_memory_region(kvm, mem, user_alloc); 1126 } 1127 1128 int kvm_get_dirty_log(struct kvm *kvm, 1129 struct kvm_dirty_log *log, int *is_dirty) 1130 { 1131 struct kvm_memory_slot *memslot; 1132 int r, i; 1133 int n; 1134 unsigned long any = 0; 1135 1136 r = -EINVAL; 1137 if (log->slot >= KVM_MEMORY_SLOTS) 1138 goto out; 1139 1140 memslot = &kvm->memslots[log->slot]; 1141 r = -ENOENT; 1142 if (!memslot->dirty_bitmap) 1143 goto out; 1144 1145 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 1146 1147 for (i = 0; !any && i < n/sizeof(long); ++i) 1148 any = memslot->dirty_bitmap[i]; 1149 1150 r = -EFAULT; 1151 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 1152 goto out; 1153 1154 if (any) 1155 *is_dirty = 1; 1156 1157 r = 0; 1158 out: 1159 return r; 1160 } 1161 1162 int is_error_page(struct page *page) 1163 { 1164 return page == bad_page; 1165 } 1166 EXPORT_SYMBOL_GPL(is_error_page); 1167 1168 int is_error_pfn(pfn_t pfn) 1169 { 1170 return pfn == bad_pfn; 1171 } 1172 EXPORT_SYMBOL_GPL(is_error_pfn); 1173 1174 static inline unsigned long bad_hva(void) 1175 { 1176 return PAGE_OFFSET; 1177 } 1178 1179 int kvm_is_error_hva(unsigned long addr) 1180 { 1181 return addr == bad_hva(); 1182 } 1183 EXPORT_SYMBOL_GPL(kvm_is_error_hva); 1184 1185 struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn) 1186 { 1187 int i; 1188 1189 for (i = 0; i < kvm->nmemslots; ++i) { 1190 struct kvm_memory_slot *memslot = &kvm->memslots[i]; 1191 1192 if (gfn >= memslot->base_gfn 1193 && gfn < memslot->base_gfn + memslot->npages) 1194 return memslot; 1195 } 1196 return NULL; 1197 } 1198 EXPORT_SYMBOL_GPL(gfn_to_memslot_unaliased); 1199 1200 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 1201 { 1202 gfn = unalias_gfn(kvm, gfn); 1203 return gfn_to_memslot_unaliased(kvm, gfn); 1204 } 1205 1206 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) 1207 { 1208 int i; 1209 1210 gfn = unalias_gfn(kvm, gfn); 1211 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 1212 struct kvm_memory_slot *memslot = &kvm->memslots[i]; 1213 1214 if (gfn >= memslot->base_gfn 1215 && gfn < memslot->base_gfn + memslot->npages) 1216 return 1; 1217 } 1218 return 0; 1219 } 1220 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); 1221 1222 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 1223 { 1224 struct kvm_memory_slot *slot; 1225 1226 gfn = unalias_gfn(kvm, gfn); 1227 slot = gfn_to_memslot_unaliased(kvm, gfn); 1228 if (!slot) 1229 return bad_hva(); 1230 return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); 1231 } 1232 
EXPORT_SYMBOL_GPL(gfn_to_hva); 1233 1234 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 1235 { 1236 struct page *page[1]; 1237 unsigned long addr; 1238 int npages; 1239 pfn_t pfn; 1240 1241 might_sleep(); 1242 1243 addr = gfn_to_hva(kvm, gfn); 1244 if (kvm_is_error_hva(addr)) { 1245 get_page(bad_page); 1246 return page_to_pfn(bad_page); 1247 } 1248 1249 npages = get_user_pages_fast(addr, 1, 1, page); 1250 1251 if (unlikely(npages != 1)) { 1252 struct vm_area_struct *vma; 1253 1254 down_read(&current->mm->mmap_sem); 1255 vma = find_vma(current->mm, addr); 1256 1257 if (vma == NULL || addr < vma->vm_start || 1258 !(vma->vm_flags & VM_PFNMAP)) { 1259 up_read(&current->mm->mmap_sem); 1260 get_page(bad_page); 1261 return page_to_pfn(bad_page); 1262 } 1263 1264 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 1265 up_read(&current->mm->mmap_sem); 1266 BUG_ON(!kvm_is_mmio_pfn(pfn)); 1267 } else 1268 pfn = page_to_pfn(page[0]); 1269 1270 return pfn; 1271 } 1272 1273 EXPORT_SYMBOL_GPL(gfn_to_pfn); 1274 1275 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1276 { 1277 pfn_t pfn; 1278 1279 pfn = gfn_to_pfn(kvm, gfn); 1280 if (!kvm_is_mmio_pfn(pfn)) 1281 return pfn_to_page(pfn); 1282 1283 WARN_ON(kvm_is_mmio_pfn(pfn)); 1284 1285 get_page(bad_page); 1286 return bad_page; 1287 } 1288 1289 EXPORT_SYMBOL_GPL(gfn_to_page); 1290 1291 void kvm_release_page_clean(struct page *page) 1292 { 1293 kvm_release_pfn_clean(page_to_pfn(page)); 1294 } 1295 EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1296 1297 void kvm_release_pfn_clean(pfn_t pfn) 1298 { 1299 if (!kvm_is_mmio_pfn(pfn)) 1300 put_page(pfn_to_page(pfn)); 1301 } 1302 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 1303 1304 void kvm_release_page_dirty(struct page *page) 1305 { 1306 kvm_release_pfn_dirty(page_to_pfn(page)); 1307 } 1308 EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1309 1310 void kvm_release_pfn_dirty(pfn_t pfn) 1311 { 1312 kvm_set_pfn_dirty(pfn); 1313 kvm_release_pfn_clean(pfn); 1314 } 1315 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); 1316 1317 void kvm_set_page_dirty(struct page *page) 1318 { 1319 kvm_set_pfn_dirty(page_to_pfn(page)); 1320 } 1321 EXPORT_SYMBOL_GPL(kvm_set_page_dirty); 1322 1323 void kvm_set_pfn_dirty(pfn_t pfn) 1324 { 1325 if (!kvm_is_mmio_pfn(pfn)) { 1326 struct page *page = pfn_to_page(pfn); 1327 if (!PageReserved(page)) 1328 SetPageDirty(page); 1329 } 1330 } 1331 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 1332 1333 void kvm_set_pfn_accessed(pfn_t pfn) 1334 { 1335 if (!kvm_is_mmio_pfn(pfn)) 1336 mark_page_accessed(pfn_to_page(pfn)); 1337 } 1338 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 1339 1340 void kvm_get_pfn(pfn_t pfn) 1341 { 1342 if (!kvm_is_mmio_pfn(pfn)) 1343 get_page(pfn_to_page(pfn)); 1344 } 1345 EXPORT_SYMBOL_GPL(kvm_get_pfn); 1346 1347 static int next_segment(unsigned long len, int offset) 1348 { 1349 if (len > PAGE_SIZE - offset) 1350 return PAGE_SIZE - offset; 1351 else 1352 return len; 1353 } 1354 1355 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 1356 int len) 1357 { 1358 int r; 1359 unsigned long addr; 1360 1361 addr = gfn_to_hva(kvm, gfn); 1362 if (kvm_is_error_hva(addr)) 1363 return -EFAULT; 1364 r = copy_from_user(data, (void __user *)addr + offset, len); 1365 if (r) 1366 return -EFAULT; 1367 return 0; 1368 } 1369 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 1370 1371 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) 1372 { 1373 gfn_t gfn = gpa >> PAGE_SHIFT; 1374 int seg; 1375 int offset = offset_in_page(gpa); 1376 int ret; 1377 1378 while ((seg = next_segment(len,
offset)) != 0) { 1379 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 1380 if (ret < 0) 1381 return ret; 1382 offset = 0; 1383 len -= seg; 1384 data += seg; 1385 ++gfn; 1386 } 1387 return 0; 1388 } 1389 EXPORT_SYMBOL_GPL(kvm_read_guest); 1390 1391 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 1392 unsigned long len) 1393 { 1394 int r; 1395 unsigned long addr; 1396 gfn_t gfn = gpa >> PAGE_SHIFT; 1397 int offset = offset_in_page(gpa); 1398 1399 addr = gfn_to_hva(kvm, gfn); 1400 if (kvm_is_error_hva(addr)) 1401 return -EFAULT; 1402 pagefault_disable(); 1403 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 1404 pagefault_enable(); 1405 if (r) 1406 return -EFAULT; 1407 return 0; 1408 } 1409 EXPORT_SYMBOL(kvm_read_guest_atomic); 1410 1411 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, 1412 int offset, int len) 1413 { 1414 int r; 1415 unsigned long addr; 1416 1417 addr = gfn_to_hva(kvm, gfn); 1418 if (kvm_is_error_hva(addr)) 1419 return -EFAULT; 1420 r = copy_to_user((void __user *)addr + offset, data, len); 1421 if (r) 1422 return -EFAULT; 1423 mark_page_dirty(kvm, gfn); 1424 return 0; 1425 } 1426 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 1427 1428 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 1429 unsigned long len) 1430 { 1431 gfn_t gfn = gpa >> PAGE_SHIFT; 1432 int seg; 1433 int offset = offset_in_page(gpa); 1434 int ret; 1435 1436 while ((seg = next_segment(len, offset)) != 0) { 1437 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 1438 if (ret < 0) 1439 return ret; 1440 offset = 0; 1441 len -= seg; 1442 data += seg; 1443 ++gfn; 1444 } 1445 return 0; 1446 } 1447 1448 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 1449 { 1450 return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); 1451 } 1452 EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 1453 1454 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 1455 { 1456 gfn_t gfn = gpa >> PAGE_SHIFT; 1457 int seg; 1458 int offset = offset_in_page(gpa); 1459 int ret; 1460 1461 while ((seg = next_segment(len, offset)) != 0) { 1462 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 1463 if (ret < 0) 1464 return ret; 1465 offset = 0; 1466 len -= seg; 1467 ++gfn; 1468 } 1469 return 0; 1470 } 1471 EXPORT_SYMBOL_GPL(kvm_clear_guest); 1472 1473 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 1474 { 1475 struct kvm_memory_slot *memslot; 1476 1477 gfn = unalias_gfn(kvm, gfn); 1478 memslot = gfn_to_memslot_unaliased(kvm, gfn); 1479 if (memslot && memslot->dirty_bitmap) { 1480 unsigned long rel_gfn = gfn - memslot->base_gfn; 1481 1482 /* avoid RMW */ 1483 if (!test_bit(rel_gfn, memslot->dirty_bitmap)) 1484 set_bit(rel_gfn, memslot->dirty_bitmap); 1485 } 1486 } 1487 1488 /* 1489 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 
1490 */ 1491 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 1492 { 1493 DEFINE_WAIT(wait); 1494 1495 for (;;) { 1496 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 1497 1498 if (kvm_cpu_has_interrupt(vcpu) || 1499 kvm_cpu_has_pending_timer(vcpu) || 1500 kvm_arch_vcpu_runnable(vcpu)) { 1501 set_bit(KVM_REQ_UNHALT, &vcpu->requests); 1502 break; 1503 } 1504 if (signal_pending(current)) 1505 break; 1506 1507 vcpu_put(vcpu); 1508 schedule(); 1509 vcpu_load(vcpu); 1510 } 1511 1512 finish_wait(&vcpu->wq, &wait); 1513 } 1514 1515 void kvm_resched(struct kvm_vcpu *vcpu) 1516 { 1517 if (!need_resched()) 1518 return; 1519 cond_resched(); 1520 } 1521 EXPORT_SYMBOL_GPL(kvm_resched); 1522 1523 static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1524 { 1525 struct kvm_vcpu *vcpu = vma->vm_file->private_data; 1526 struct page *page; 1527 1528 if (vmf->pgoff == 0) 1529 page = virt_to_page(vcpu->run); 1530 #ifdef CONFIG_X86 1531 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 1532 page = virt_to_page(vcpu->arch.pio_data); 1533 #endif 1534 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1535 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 1536 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 1537 #endif 1538 else 1539 return VM_FAULT_SIGBUS; 1540 get_page(page); 1541 vmf->page = page; 1542 return 0; 1543 } 1544 1545 static struct vm_operations_struct kvm_vcpu_vm_ops = { 1546 .fault = kvm_vcpu_fault, 1547 }; 1548 1549 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 1550 { 1551 vma->vm_ops = &kvm_vcpu_vm_ops; 1552 return 0; 1553 } 1554 1555 static int kvm_vcpu_release(struct inode *inode, struct file *filp) 1556 { 1557 struct kvm_vcpu *vcpu = filp->private_data; 1558 1559 kvm_put_kvm(vcpu->kvm); 1560 return 0; 1561 } 1562 1563 static struct file_operations kvm_vcpu_fops = { 1564 .release = kvm_vcpu_release, 1565 .unlocked_ioctl = kvm_vcpu_ioctl, 1566 .compat_ioctl = kvm_vcpu_ioctl, 1567 .mmap = kvm_vcpu_mmap, 1568 }; 1569 1570 /* 1571 * Allocates an inode for the vcpu. 1572 */ 1573 static int create_vcpu_fd(struct kvm_vcpu *vcpu) 1574 { 1575 int fd = anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0); 1576 if (fd < 0) 1577 kvm_put_kvm(vcpu->kvm); 1578 return fd; 1579 } 1580 1581 /* 1582 * Creates some virtual cpus. Good luck creating more than one. 
1583 */ 1584 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) 1585 { 1586 int r; 1587 struct kvm_vcpu *vcpu; 1588 1589 if (!valid_vcpu(n)) 1590 return -EINVAL; 1591 1592 vcpu = kvm_arch_vcpu_create(kvm, n); 1593 if (IS_ERR(vcpu)) 1594 return PTR_ERR(vcpu); 1595 1596 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 1597 1598 r = kvm_arch_vcpu_setup(vcpu); 1599 if (r) 1600 return r; 1601 1602 mutex_lock(&kvm->lock); 1603 if (kvm->vcpus[n]) { 1604 r = -EEXIST; 1605 goto vcpu_destroy; 1606 } 1607 kvm->vcpus[n] = vcpu; 1608 mutex_unlock(&kvm->lock); 1609 1610 /* Now it's all set up, let userspace reach it */ 1611 kvm_get_kvm(kvm); 1612 r = create_vcpu_fd(vcpu); 1613 if (r < 0) 1614 goto unlink; 1615 return r; 1616 1617 unlink: 1618 mutex_lock(&kvm->lock); 1619 kvm->vcpus[n] = NULL; 1620 vcpu_destroy: 1621 mutex_unlock(&kvm->lock); 1622 kvm_arch_vcpu_destroy(vcpu); 1623 return r; 1624 } 1625 1626 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 1627 { 1628 if (sigset) { 1629 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 1630 vcpu->sigset_active = 1; 1631 vcpu->sigset = *sigset; 1632 } else 1633 vcpu->sigset_active = 0; 1634 return 0; 1635 } 1636 1637 static long kvm_vcpu_ioctl(struct file *filp, 1638 unsigned int ioctl, unsigned long arg) 1639 { 1640 struct kvm_vcpu *vcpu = filp->private_data; 1641 void __user *argp = (void __user *)arg; 1642 int r; 1643 struct kvm_fpu *fpu = NULL; 1644 struct kvm_sregs *kvm_sregs = NULL; 1645 1646 if (vcpu->kvm->mm != current->mm) 1647 return -EIO; 1648 switch (ioctl) { 1649 case KVM_RUN: 1650 r = -EINVAL; 1651 if (arg) 1652 goto out; 1653 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 1654 break; 1655 case KVM_GET_REGS: { 1656 struct kvm_regs *kvm_regs; 1657 1658 r = -ENOMEM; 1659 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 1660 if (!kvm_regs) 1661 goto out; 1662 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 1663 if (r) 1664 goto out_free1; 1665 r = -EFAULT; 1666 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 1667 goto out_free1; 1668 r = 0; 1669 out_free1: 1670 kfree(kvm_regs); 1671 break; 1672 } 1673 case KVM_SET_REGS: { 1674 struct kvm_regs *kvm_regs; 1675 1676 r = -ENOMEM; 1677 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 1678 if (!kvm_regs) 1679 goto out; 1680 r = -EFAULT; 1681 if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs))) 1682 goto out_free2; 1683 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 1684 if (r) 1685 goto out_free2; 1686 r = 0; 1687 out_free2: 1688 kfree(kvm_regs); 1689 break; 1690 } 1691 case KVM_GET_SREGS: { 1692 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 1693 r = -ENOMEM; 1694 if (!kvm_sregs) 1695 goto out; 1696 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 1697 if (r) 1698 goto out; 1699 r = -EFAULT; 1700 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 1701 goto out; 1702 r = 0; 1703 break; 1704 } 1705 case KVM_SET_SREGS: { 1706 kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 1707 r = -ENOMEM; 1708 if (!kvm_sregs) 1709 goto out; 1710 r = -EFAULT; 1711 if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs))) 1712 goto out; 1713 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 1714 if (r) 1715 goto out; 1716 r = 0; 1717 break; 1718 } 1719 case KVM_GET_MP_STATE: { 1720 struct kvm_mp_state mp_state; 1721 1722 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 1723 if (r) 1724 goto out; 1725 r = -EFAULT; 1726 if (copy_to_user(argp, &mp_state, sizeof mp_state)) 
1727 goto out; 1728 r = 0; 1729 break; 1730 } 1731 case KVM_SET_MP_STATE: { 1732 struct kvm_mp_state mp_state; 1733 1734 r = -EFAULT; 1735 if (copy_from_user(&mp_state, argp, sizeof mp_state)) 1736 goto out; 1737 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 1738 if (r) 1739 goto out; 1740 r = 0; 1741 break; 1742 } 1743 case KVM_TRANSLATE: { 1744 struct kvm_translation tr; 1745 1746 r = -EFAULT; 1747 if (copy_from_user(&tr, argp, sizeof tr)) 1748 goto out; 1749 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 1750 if (r) 1751 goto out; 1752 r = -EFAULT; 1753 if (copy_to_user(argp, &tr, sizeof tr)) 1754 goto out; 1755 r = 0; 1756 break; 1757 } 1758 case KVM_DEBUG_GUEST: { 1759 struct kvm_debug_guest dbg; 1760 1761 r = -EFAULT; 1762 if (copy_from_user(&dbg, argp, sizeof dbg)) 1763 goto out; 1764 r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg); 1765 if (r) 1766 goto out; 1767 r = 0; 1768 break; 1769 } 1770 case KVM_SET_SIGNAL_MASK: { 1771 struct kvm_signal_mask __user *sigmask_arg = argp; 1772 struct kvm_signal_mask kvm_sigmask; 1773 sigset_t sigset, *p; 1774 1775 p = NULL; 1776 if (argp) { 1777 r = -EFAULT; 1778 if (copy_from_user(&kvm_sigmask, argp, 1779 sizeof kvm_sigmask)) 1780 goto out; 1781 r = -EINVAL; 1782 if (kvm_sigmask.len != sizeof sigset) 1783 goto out; 1784 r = -EFAULT; 1785 if (copy_from_user(&sigset, sigmask_arg->sigset, 1786 sizeof sigset)) 1787 goto out; 1788 p = &sigset; 1789 } 1790 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 1791 break; 1792 } 1793 case KVM_GET_FPU: { 1794 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 1795 r = -ENOMEM; 1796 if (!fpu) 1797 goto out; 1798 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); 1799 if (r) 1800 goto out; 1801 r = -EFAULT; 1802 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) 1803 goto out; 1804 r = 0; 1805 break; 1806 } 1807 case KVM_SET_FPU: { 1808 fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 1809 r = -ENOMEM; 1810 if (!fpu) 1811 goto out; 1812 r = -EFAULT; 1813 if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu))) 1814 goto out; 1815 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 1816 if (r) 1817 goto out; 1818 r = 0; 1819 break; 1820 } 1821 default: 1822 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 1823 } 1824 out: 1825 kfree(fpu); 1826 kfree(kvm_sregs); 1827 return r; 1828 } 1829 1830 static long kvm_vm_ioctl(struct file *filp, 1831 unsigned int ioctl, unsigned long arg) 1832 { 1833 struct kvm *kvm = filp->private_data; 1834 void __user *argp = (void __user *)arg; 1835 int r; 1836 1837 if (kvm->mm != current->mm) 1838 return -EIO; 1839 switch (ioctl) { 1840 case KVM_CREATE_VCPU: 1841 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 1842 if (r < 0) 1843 goto out; 1844 break; 1845 case KVM_SET_USER_MEMORY_REGION: { 1846 struct kvm_userspace_memory_region kvm_userspace_mem; 1847 1848 r = -EFAULT; 1849 if (copy_from_user(&kvm_userspace_mem, argp, 1850 sizeof kvm_userspace_mem)) 1851 goto out; 1852 1853 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1); 1854 if (r) 1855 goto out; 1856 break; 1857 } 1858 case KVM_GET_DIRTY_LOG: { 1859 struct kvm_dirty_log log; 1860 1861 r = -EFAULT; 1862 if (copy_from_user(&log, argp, sizeof log)) 1863 goto out; 1864 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 1865 if (r) 1866 goto out; 1867 break; 1868 } 1869 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1870 case KVM_REGISTER_COALESCED_MMIO: { 1871 struct kvm_coalesced_mmio_zone zone; 1872 r = -EFAULT; 1873 if (copy_from_user(&zone, argp, sizeof zone)) 1874 goto out; 1875 r = -ENXIO; 1876 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, 
&zone); 1877 if (r) 1878 goto out; 1879 r = 0; 1880 break; 1881 } 1882 case KVM_UNREGISTER_COALESCED_MMIO: { 1883 struct kvm_coalesced_mmio_zone zone; 1884 r = -EFAULT; 1885 if (copy_from_user(&zone, argp, sizeof zone)) 1886 goto out; 1887 r = -ENXIO; 1888 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 1889 if (r) 1890 goto out; 1891 r = 0; 1892 break; 1893 } 1894 #endif 1895 #ifdef KVM_CAP_DEVICE_ASSIGNMENT 1896 case KVM_ASSIGN_PCI_DEVICE: { 1897 struct kvm_assigned_pci_dev assigned_dev; 1898 1899 r = -EFAULT; 1900 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) 1901 goto out; 1902 r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); 1903 if (r) 1904 goto out; 1905 break; 1906 } 1907 case KVM_ASSIGN_IRQ: { 1908 struct kvm_assigned_irq assigned_irq; 1909 1910 r = -EFAULT; 1911 if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) 1912 goto out; 1913 r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); 1914 if (r) 1915 goto out; 1916 break; 1917 } 1918 #endif 1919 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT 1920 case KVM_DEASSIGN_PCI_DEVICE: { 1921 struct kvm_assigned_pci_dev assigned_dev; 1922 1923 r = -EFAULT; 1924 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) 1925 goto out; 1926 r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev); 1927 if (r) 1928 goto out; 1929 break; 1930 } 1931 #endif 1932 default: 1933 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 1934 } 1935 out: 1936 return r; 1937 } 1938 1939 static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1940 { 1941 struct page *page[1]; 1942 unsigned long addr; 1943 int npages; 1944 gfn_t gfn = vmf->pgoff; 1945 struct kvm *kvm = vma->vm_file->private_data; 1946 1947 addr = gfn_to_hva(kvm, gfn); 1948 if (kvm_is_error_hva(addr)) 1949 return VM_FAULT_SIGBUS; 1950 1951 npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page, 1952 NULL); 1953 if (unlikely(npages != 1)) 1954 return VM_FAULT_SIGBUS; 1955 1956 vmf->page = page[0]; 1957 return 0; 1958 } 1959 1960 static struct vm_operations_struct kvm_vm_vm_ops = { 1961 .fault = kvm_vm_fault, 1962 }; 1963 1964 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) 1965 { 1966 vma->vm_ops = &kvm_vm_vm_ops; 1967 return 0; 1968 } 1969 1970 static struct file_operations kvm_vm_fops = { 1971 .release = kvm_vm_release, 1972 .unlocked_ioctl = kvm_vm_ioctl, 1973 .compat_ioctl = kvm_vm_ioctl, 1974 .mmap = kvm_vm_mmap, 1975 }; 1976 1977 static int kvm_dev_ioctl_create_vm(void) 1978 { 1979 int fd; 1980 struct kvm *kvm; 1981 1982 kvm = kvm_create_vm(); 1983 if (IS_ERR(kvm)) 1984 return PTR_ERR(kvm); 1985 fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, 0); 1986 if (fd < 0) 1987 kvm_put_kvm(kvm); 1988 1989 return fd; 1990 } 1991 1992 static long kvm_dev_ioctl_check_extension_generic(long arg) 1993 { 1994 switch (arg) { 1995 case KVM_CAP_USER_MEMORY: 1996 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 1997 return 1; 1998 default: 1999 break; 2000 } 2001 return kvm_dev_ioctl_check_extension(arg); 2002 } 2003 2004 static long kvm_dev_ioctl(struct file *filp, 2005 unsigned int ioctl, unsigned long arg) 2006 { 2007 long r = -EINVAL; 2008 2009 switch (ioctl) { 2010 case KVM_GET_API_VERSION: 2011 r = -EINVAL; 2012 if (arg) 2013 goto out; 2014 r = KVM_API_VERSION; 2015 break; 2016 case KVM_CREATE_VM: 2017 r = -EINVAL; 2018 if (arg) 2019 goto out; 2020 r = kvm_dev_ioctl_create_vm(); 2021 break; 2022 case KVM_CHECK_EXTENSION: 2023 r = kvm_dev_ioctl_check_extension_generic(arg); 2024 break; 2025 case KVM_GET_VCPU_MMAP_SIZE: 2026 r = -EINVAL; 2027 if 
(arg) 2028 goto out; 2029 r = PAGE_SIZE; /* struct kvm_run */ 2030 #ifdef CONFIG_X86 2031 r += PAGE_SIZE; /* pio data page */ 2032 #endif 2033 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2034 r += PAGE_SIZE; /* coalesced mmio ring page */ 2035 #endif 2036 break; 2037 case KVM_TRACE_ENABLE: 2038 case KVM_TRACE_PAUSE: 2039 case KVM_TRACE_DISABLE: 2040 r = kvm_trace_ioctl(ioctl, arg); 2041 break; 2042 default: 2043 return kvm_arch_dev_ioctl(filp, ioctl, arg); 2044 } 2045 out: 2046 return r; 2047 } 2048 2049 static struct file_operations kvm_chardev_ops = { 2050 .unlocked_ioctl = kvm_dev_ioctl, 2051 .compat_ioctl = kvm_dev_ioctl, 2052 }; 2053 2054 static struct miscdevice kvm_dev = { 2055 KVM_MINOR, 2056 "kvm", 2057 &kvm_chardev_ops, 2058 }; 2059 2060 static void hardware_enable(void *junk) 2061 { 2062 int cpu = raw_smp_processor_id(); 2063 2064 if (cpumask_test_cpu(cpu, cpus_hardware_enabled)) 2065 return; 2066 cpumask_set_cpu(cpu, cpus_hardware_enabled); 2067 kvm_arch_hardware_enable(NULL); 2068 } 2069 2070 static void hardware_disable(void *junk) 2071 { 2072 int cpu = raw_smp_processor_id(); 2073 2074 if (!cpumask_test_cpu(cpu, cpus_hardware_enabled)) 2075 return; 2076 cpumask_clear_cpu(cpu, cpus_hardware_enabled); 2077 kvm_arch_hardware_disable(NULL); 2078 } 2079 2080 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, 2081 void *v) 2082 { 2083 int cpu = (long)v; 2084 2085 val &= ~CPU_TASKS_FROZEN; 2086 switch (val) { 2087 case CPU_DYING: 2088 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", 2089 cpu); 2090 hardware_disable(NULL); 2091 break; 2092 case CPU_UP_CANCELED: 2093 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", 2094 cpu); 2095 smp_call_function_single(cpu, hardware_disable, NULL, 1); 2096 break; 2097 case CPU_ONLINE: 2098 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", 2099 cpu); 2100 smp_call_function_single(cpu, hardware_enable, NULL, 1); 2101 break; 2102 } 2103 return NOTIFY_OK; 2104 } 2105 2106 2107 asmlinkage void kvm_handle_fault_on_reboot(void) 2108 { 2109 if (kvm_rebooting) 2110 /* spin while reset goes on */ 2111 while (true) 2112 ; 2113 /* Fault while not rebooting. We want the trace. */ 2114 BUG(); 2115 } 2116 EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot); 2117 2118 static int kvm_reboot(struct notifier_block *notifier, unsigned long val, 2119 void *v) 2120 { 2121 if (val == SYS_RESTART) { 2122 /* 2123 * Some (well, at least mine) BIOSes hang on reboot if 2124 * in vmx root mode. 
2125 */ 2126 printk(KERN_INFO "kvm: exiting hardware virtualization\n"); 2127 kvm_rebooting = true; 2128 on_each_cpu(hardware_disable, NULL, 1); 2129 } 2130 return NOTIFY_OK; 2131 } 2132 2133 static struct notifier_block kvm_reboot_notifier = { 2134 .notifier_call = kvm_reboot, 2135 .priority = 0, 2136 }; 2137 2138 void kvm_io_bus_init(struct kvm_io_bus *bus) 2139 { 2140 memset(bus, 0, sizeof(*bus)); 2141 } 2142 2143 void kvm_io_bus_destroy(struct kvm_io_bus *bus) 2144 { 2145 int i; 2146 2147 for (i = 0; i < bus->dev_count; i++) { 2148 struct kvm_io_device *pos = bus->devs[i]; 2149 2150 kvm_iodevice_destructor(pos); 2151 } 2152 } 2153 2154 struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, 2155 gpa_t addr, int len, int is_write) 2156 { 2157 int i; 2158 2159 for (i = 0; i < bus->dev_count; i++) { 2160 struct kvm_io_device *pos = bus->devs[i]; 2161 2162 if (pos->in_range(pos, addr, len, is_write)) 2163 return pos; 2164 } 2165 2166 return NULL; 2167 } 2168 2169 void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev) 2170 { 2171 BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1)); 2172 2173 bus->devs[bus->dev_count++] = dev; 2174 } 2175 2176 static struct notifier_block kvm_cpu_notifier = { 2177 .notifier_call = kvm_cpu_hotplug, 2178 .priority = 20, /* must be > scheduler priority */ 2179 }; 2180 2181 static int vm_stat_get(void *_offset, u64 *val) 2182 { 2183 unsigned offset = (long)_offset; 2184 struct kvm *kvm; 2185 2186 *val = 0; 2187 spin_lock(&kvm_lock); 2188 list_for_each_entry(kvm, &vm_list, vm_list) 2189 *val += *(u32 *)((void *)kvm + offset); 2190 spin_unlock(&kvm_lock); 2191 return 0; 2192 } 2193 2194 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n"); 2195 2196 static int vcpu_stat_get(void *_offset, u64 *val) 2197 { 2198 unsigned offset = (long)_offset; 2199 struct kvm *kvm; 2200 struct kvm_vcpu *vcpu; 2201 int i; 2202 2203 *val = 0; 2204 spin_lock(&kvm_lock); 2205 list_for_each_entry(kvm, &vm_list, vm_list) 2206 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 2207 vcpu = kvm->vcpus[i]; 2208 if (vcpu) 2209 *val += *(u32 *)((void *)vcpu + offset); 2210 } 2211 spin_unlock(&kvm_lock); 2212 return 0; 2213 } 2214 2215 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n"); 2216 2217 static struct file_operations *stat_fops[] = { 2218 [KVM_STAT_VCPU] = &vcpu_stat_fops, 2219 [KVM_STAT_VM] = &vm_stat_fops, 2220 }; 2221 2222 static void kvm_init_debug(void) 2223 { 2224 struct kvm_stats_debugfs_item *p; 2225 2226 kvm_debugfs_dir = debugfs_create_dir("kvm", NULL); 2227 for (p = debugfs_entries; p->name; ++p) 2228 p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir, 2229 (void *)(long)p->offset, 2230 stat_fops[p->kind]); 2231 } 2232 2233 static void kvm_exit_debug(void) 2234 { 2235 struct kvm_stats_debugfs_item *p; 2236 2237 for (p = debugfs_entries; p->name; ++p) 2238 debugfs_remove(p->dentry); 2239 debugfs_remove(kvm_debugfs_dir); 2240 } 2241 2242 static int kvm_suspend(struct sys_device *dev, pm_message_t state) 2243 { 2244 hardware_disable(NULL); 2245 return 0; 2246 } 2247 2248 static int kvm_resume(struct sys_device *dev) 2249 { 2250 hardware_enable(NULL); 2251 return 0; 2252 } 2253 2254 static struct sysdev_class kvm_sysdev_class = { 2255 .name = "kvm", 2256 .suspend = kvm_suspend, 2257 .resume = kvm_resume, 2258 }; 2259 2260 static struct sys_device kvm_sysdev = { 2261 .id = 0, 2262 .cls = &kvm_sysdev_class, 2263 }; 2264 2265 struct page *bad_page; 2266 pfn_t bad_pfn; 2267 2268 static inline 2269 struct kvm_vcpu 
*preempt_notifier_to_vcpu(struct preempt_notifier *pn) 2270 { 2271 return container_of(pn, struct kvm_vcpu, preempt_notifier); 2272 } 2273 2274 static void kvm_sched_in(struct preempt_notifier *pn, int cpu) 2275 { 2276 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 2277 2278 kvm_arch_vcpu_load(vcpu, cpu); 2279 } 2280 2281 static void kvm_sched_out(struct preempt_notifier *pn, 2282 struct task_struct *next) 2283 { 2284 struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); 2285 2286 kvm_arch_vcpu_put(vcpu); 2287 } 2288 2289 int kvm_init(void *opaque, unsigned int vcpu_size, 2290 struct module *module) 2291 { 2292 int r; 2293 int cpu; 2294 2295 kvm_init_debug(); 2296 2297 r = kvm_arch_init(opaque); 2298 if (r) 2299 goto out_fail; 2300 2301 bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO); 2302 2303 if (bad_page == NULL) { 2304 r = -ENOMEM; 2305 goto out; 2306 } 2307 2308 bad_pfn = page_to_pfn(bad_page); 2309 2310 if (!alloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { 2311 r = -ENOMEM; 2312 goto out_free_0; 2313 } 2314 2315 r = kvm_arch_hardware_setup(); 2316 if (r < 0) 2317 goto out_free_0a; 2318 2319 for_each_online_cpu(cpu) { 2320 smp_call_function_single(cpu, 2321 kvm_arch_check_processor_compat, 2322 &r, 1); 2323 if (r < 0) 2324 goto out_free_1; 2325 } 2326 2327 on_each_cpu(hardware_enable, NULL, 1); 2328 r = register_cpu_notifier(&kvm_cpu_notifier); 2329 if (r) 2330 goto out_free_2; 2331 register_reboot_notifier(&kvm_reboot_notifier); 2332 2333 r = sysdev_class_register(&kvm_sysdev_class); 2334 if (r) 2335 goto out_free_3; 2336 2337 r = sysdev_register(&kvm_sysdev); 2338 if (r) 2339 goto out_free_4; 2340 2341 /* A kmem cache lets us meet the alignment requirements of fx_save. */ 2342 kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size, 2343 __alignof__(struct kvm_vcpu), 2344 0, NULL); 2345 if (!kvm_vcpu_cache) { 2346 r = -ENOMEM; 2347 goto out_free_5; 2348 } 2349 2350 kvm_chardev_ops.owner = module; 2351 kvm_vm_fops.owner = module; 2352 kvm_vcpu_fops.owner = module; 2353 2354 r = misc_register(&kvm_dev); 2355 if (r) { 2356 printk(KERN_ERR "kvm: misc device register failed\n"); 2357 goto out_free; 2358 } 2359 2360 kvm_preempt_ops.sched_in = kvm_sched_in; 2361 kvm_preempt_ops.sched_out = kvm_sched_out; 2362 #ifndef CONFIG_X86 2363 msi2intx = 0; 2364 #endif 2365 2366 return 0; 2367 2368 out_free: 2369 kmem_cache_destroy(kvm_vcpu_cache); 2370 out_free_5: 2371 sysdev_unregister(&kvm_sysdev); 2372 out_free_4: 2373 sysdev_class_unregister(&kvm_sysdev_class); 2374 out_free_3: 2375 unregister_reboot_notifier(&kvm_reboot_notifier); 2376 unregister_cpu_notifier(&kvm_cpu_notifier); 2377 out_free_2: 2378 on_each_cpu(hardware_disable, NULL, 1); 2379 out_free_1: 2380 kvm_arch_hardware_unsetup(); 2381 out_free_0a: 2382 free_cpumask_var(cpus_hardware_enabled); 2383 out_free_0: 2384 __free_page(bad_page); 2385 out: 2386 kvm_arch_exit(); 2387 kvm_exit_debug(); 2388 out_fail: 2389 return r; 2390 } 2391 EXPORT_SYMBOL_GPL(kvm_init); 2392 2393 void kvm_exit(void) 2394 { 2395 kvm_trace_cleanup(); 2396 misc_deregister(&kvm_dev); 2397 kmem_cache_destroy(kvm_vcpu_cache); 2398 sysdev_unregister(&kvm_sysdev); 2399 sysdev_class_unregister(&kvm_sysdev_class); 2400 unregister_reboot_notifier(&kvm_reboot_notifier); 2401 unregister_cpu_notifier(&kvm_cpu_notifier); 2402 on_each_cpu(hardware_disable, NULL, 1); 2403 kvm_arch_hardware_unsetup(); 2404 kvm_arch_exit(); 2405 kvm_exit_debug(); 2406 free_cpumask_var(cpus_hardware_enabled); 2407 __free_page(bad_page); 2408 } 2409 EXPORT_SYMBOL_GPL(kvm_exit); 2410
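
/*
 * Editor's note: an illustrative sketch, not part of this driver. It shows a
 * minimal userspace sequence exercising the ioctl paths implemented above:
 * kvm_dev_ioctl() handles KVM_CREATE_VM and KVM_GET_VCPU_MMAP_SIZE,
 * kvm_vm_ioctl() handles KVM_SET_USER_MEMORY_REGION and KVM_CREATE_VCPU,
 * and kvm_vcpu_ioctl() handles KVM_RUN. All file descriptors, sizes and the
 * error-free control flow below are assumptions for illustration only; no
 * guest code or registers are set up, so this is not a runnable guest -- the
 * point is only the ordering of the calls. A real program would include
 * <fcntl.h>, <sys/ioctl.h>, <sys/mman.h> and <linux/kvm.h> and check errors.
 *
 *	int kvm = open("/dev/kvm", O_RDWR);
 *	int vm  = ioctl(kvm, KVM_CREATE_VM, 0);           // kvm_dev_ioctl_create_vm()
 *
 *	struct kvm_userspace_memory_region mem = {
 *		.slot            = 0,
 *		.guest_phys_addr = 0,
 *		.memory_size     = 0x100000,
 *		.userspace_addr  = (__u64)mmap(NULL, 0x100000,
 *					       PROT_READ | PROT_WRITE,
 *					       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0),
 *	};
 *	ioctl(vm, KVM_SET_USER_MEMORY_REGION, &mem);      // __kvm_set_memory_region()
 *
 *	int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);         // kvm_vm_ioctl_create_vcpu()
 *	long sz  = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, 0); // kvm_run + pio/coalesced pages
 *	struct kvm_run *run = mmap(NULL, sz, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu, 0);  // kvm_vcpu_mmap()/kvm_vcpu_fault()
 *	ioctl(vcpu, KVM_RUN, 0);                          // kvm_arch_vcpu_ioctl_run()
 */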