1 /* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * This module enables machines with Intel VT-x extensions to run virtual 5 * machines without emulation or binary translation. 6 * 7 * Copyright (C) 2006 Qumranet, Inc. 8 * 9 * Authors: 10 * Avi Kivity <avi@qumranet.com> 11 * Yaniv Kamay <yaniv@qumranet.com> 12 * 13 * This work is licensed under the terms of the GNU GPL, version 2. See 14 * the COPYING file in the top-level directory. 15 * 16 */ 17 18 #include "iodev.h" 19 20 #include <linux/kvm_host.h> 21 #include <linux/kvm.h> 22 #include <linux/module.h> 23 #include <linux/errno.h> 24 #include <linux/percpu.h> 25 #include <linux/gfp.h> 26 #include <linux/mm.h> 27 #include <linux/miscdevice.h> 28 #include <linux/vmalloc.h> 29 #include <linux/reboot.h> 30 #include <linux/debugfs.h> 31 #include <linux/highmem.h> 32 #include <linux/file.h> 33 #include <linux/sysdev.h> 34 #include <linux/cpu.h> 35 #include <linux/sched.h> 36 #include <linux/cpumask.h> 37 #include <linux/smp.h> 38 #include <linux/anon_inodes.h> 39 #include <linux/profile.h> 40 #include <linux/kvm_para.h> 41 #include <linux/pagemap.h> 42 #include <linux/mman.h> 43 #include <linux/swap.h> 44 #include <linux/bitops.h> 45 #include <linux/spinlock.h> 46 47 #include <asm/processor.h> 48 #include <asm/io.h> 49 #include <asm/uaccess.h> 50 #include <asm/pgtable.h> 51 52 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 53 #include "coalesced_mmio.h" 54 #endif 55 56 #ifdef KVM_CAP_DEVICE_ASSIGNMENT 57 #include <linux/pci.h> 58 #include <linux/interrupt.h> 59 #include "irq.h" 60 #endif 61 62 #define CREATE_TRACE_POINTS 63 #include <trace/events/kvm.h> 64 65 MODULE_AUTHOR("Qumranet"); 66 MODULE_LICENSE("GPL"); 67 68 /* 69 * Ordering of locks: 70 * 71 * kvm->slots_lock --> kvm->lock --> kvm->irq_lock 72 */ 73 74 DEFINE_SPINLOCK(kvm_lock); 75 LIST_HEAD(vm_list); 76 77 static cpumask_var_t cpus_hardware_enabled; 78 79 struct kmem_cache *kvm_vcpu_cache; 80 EXPORT_SYMBOL_GPL(kvm_vcpu_cache); 81 82 static __read_mostly struct preempt_ops kvm_preempt_ops; 83 84 struct dentry *kvm_debugfs_dir; 85 86 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 87 unsigned long arg); 88 89 static bool kvm_rebooting; 90 91 static bool largepages_enabled = true; 92 93 #ifdef KVM_CAP_DEVICE_ASSIGNMENT 94 static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, 95 int assigned_dev_id) 96 { 97 struct list_head *ptr; 98 struct kvm_assigned_dev_kernel *match; 99 100 list_for_each(ptr, head) { 101 match = list_entry(ptr, struct kvm_assigned_dev_kernel, list); 102 if (match->assigned_dev_id == assigned_dev_id) 103 return match; 104 } 105 return NULL; 106 } 107 108 static int find_index_from_host_irq(struct kvm_assigned_dev_kernel 109 *assigned_dev, int irq) 110 { 111 int i, index; 112 struct msix_entry *host_msix_entries; 113 114 host_msix_entries = assigned_dev->host_msix_entries; 115 116 index = -1; 117 for (i = 0; i < assigned_dev->entries_nr; i++) 118 if (irq == host_msix_entries[i].vector) { 119 index = i; 120 break; 121 } 122 if (index < 0) { 123 printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); 124 return 0; 125 } 126 127 return index; 128 } 129 130 static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) 131 { 132 struct kvm_assigned_dev_kernel *assigned_dev; 133 struct kvm *kvm; 134 int i; 135 136 assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, 137 interrupt_work); 138 kvm = assigned_dev->kvm; 139 140 mutex_lock(&kvm->irq_lock); 141 
spin_lock_irq(&assigned_dev->assigned_dev_lock); 142 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { 143 struct kvm_guest_msix_entry *guest_entries = 144 assigned_dev->guest_msix_entries; 145 for (i = 0; i < assigned_dev->entries_nr; i++) { 146 if (!(guest_entries[i].flags & 147 KVM_ASSIGNED_MSIX_PENDING)) 148 continue; 149 guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING; 150 kvm_set_irq(assigned_dev->kvm, 151 assigned_dev->irq_source_id, 152 guest_entries[i].vector, 1); 153 } 154 } else 155 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, 156 assigned_dev->guest_irq, 1); 157 158 spin_unlock_irq(&assigned_dev->assigned_dev_lock); 159 mutex_unlock(&assigned_dev->kvm->irq_lock); 160 } 161 162 static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) 163 { 164 unsigned long flags; 165 struct kvm_assigned_dev_kernel *assigned_dev = 166 (struct kvm_assigned_dev_kernel *) dev_id; 167 168 spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags); 169 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { 170 int index = find_index_from_host_irq(assigned_dev, irq); 171 if (index < 0) 172 goto out; 173 assigned_dev->guest_msix_entries[index].flags |= 174 KVM_ASSIGNED_MSIX_PENDING; 175 } 176 177 schedule_work(&assigned_dev->interrupt_work); 178 179 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { 180 disable_irq_nosync(irq); 181 assigned_dev->host_irq_disabled = true; 182 } 183 184 out: 185 spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags); 186 return IRQ_HANDLED; 187 } 188 189 /* Ack the irq line for an assigned device */ 190 static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) 191 { 192 struct kvm_assigned_dev_kernel *dev; 193 unsigned long flags; 194 195 if (kian->gsi == -1) 196 return; 197 198 dev = container_of(kian, struct kvm_assigned_dev_kernel, 199 ack_notifier); 200 201 kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0); 202 203 /* The guest irq may be shared so this ack may be 204 * from another device. 205 */ 206 spin_lock_irqsave(&dev->assigned_dev_lock, flags); 207 if (dev->host_irq_disabled) { 208 enable_irq(dev->host_irq); 209 dev->host_irq_disabled = false; 210 } 211 spin_unlock_irqrestore(&dev->assigned_dev_lock, flags); 212 } 213 214 static void deassign_guest_irq(struct kvm *kvm, 215 struct kvm_assigned_dev_kernel *assigned_dev) 216 { 217 kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); 218 assigned_dev->ack_notifier.gsi = -1; 219 220 if (assigned_dev->irq_source_id != -1) 221 kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); 222 assigned_dev->irq_source_id = -1; 223 assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK); 224 } 225 226 /* The function implicit hold kvm->lock mutex due to cancel_work_sync() */ 227 static void deassign_host_irq(struct kvm *kvm, 228 struct kvm_assigned_dev_kernel *assigned_dev) 229 { 230 /* 231 * In kvm_free_device_irq, cancel_work_sync return true if: 232 * 1. work is scheduled, and then cancelled. 233 * 2. work callback is executed. 234 * 235 * The first one ensured that the irq is disabled and no more events 236 * would happen. But for the second one, the irq may be enabled (e.g. 237 * for MSI). So we disable irq here to prevent further events. 238 * 239 * Notice this maybe result in nested disable if the interrupt type is 240 * INTx, but it's OK for we are going to free it. 
241 * 242 * If this function is a part of VM destroy, please ensure that till 243 * now, the kvm state is still legal for probably we also have to wait 244 * interrupt_work done. 245 */ 246 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { 247 int i; 248 for (i = 0; i < assigned_dev->entries_nr; i++) 249 disable_irq_nosync(assigned_dev-> 250 host_msix_entries[i].vector); 251 252 cancel_work_sync(&assigned_dev->interrupt_work); 253 254 for (i = 0; i < assigned_dev->entries_nr; i++) 255 free_irq(assigned_dev->host_msix_entries[i].vector, 256 (void *)assigned_dev); 257 258 assigned_dev->entries_nr = 0; 259 kfree(assigned_dev->host_msix_entries); 260 kfree(assigned_dev->guest_msix_entries); 261 pci_disable_msix(assigned_dev->dev); 262 } else { 263 /* Deal with MSI and INTx */ 264 disable_irq_nosync(assigned_dev->host_irq); 265 cancel_work_sync(&assigned_dev->interrupt_work); 266 267 free_irq(assigned_dev->host_irq, (void *)assigned_dev); 268 269 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) 270 pci_disable_msi(assigned_dev->dev); 271 } 272 273 assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK); 274 } 275 276 static int kvm_deassign_irq(struct kvm *kvm, 277 struct kvm_assigned_dev_kernel *assigned_dev, 278 unsigned long irq_requested_type) 279 { 280 unsigned long guest_irq_type, host_irq_type; 281 282 if (!irqchip_in_kernel(kvm)) 283 return -EINVAL; 284 /* no irq assignment to deassign */ 285 if (!assigned_dev->irq_requested_type) 286 return -ENXIO; 287 288 host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK; 289 guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK; 290 291 if (host_irq_type) 292 deassign_host_irq(kvm, assigned_dev); 293 if (guest_irq_type) 294 deassign_guest_irq(kvm, assigned_dev); 295 296 return 0; 297 } 298 299 static void kvm_free_assigned_irq(struct kvm *kvm, 300 struct kvm_assigned_dev_kernel *assigned_dev) 301 { 302 kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); 303 } 304 305 static void kvm_free_assigned_device(struct kvm *kvm, 306 struct kvm_assigned_dev_kernel 307 *assigned_dev) 308 { 309 kvm_free_assigned_irq(kvm, assigned_dev); 310 311 pci_reset_function(assigned_dev->dev); 312 313 pci_release_regions(assigned_dev->dev); 314 pci_disable_device(assigned_dev->dev); 315 pci_dev_put(assigned_dev->dev); 316 317 list_del(&assigned_dev->list); 318 kfree(assigned_dev); 319 } 320 321 void kvm_free_all_assigned_devices(struct kvm *kvm) 322 { 323 struct list_head *ptr, *ptr2; 324 struct kvm_assigned_dev_kernel *assigned_dev; 325 326 list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { 327 assigned_dev = list_entry(ptr, 328 struct kvm_assigned_dev_kernel, 329 list); 330 331 kvm_free_assigned_device(kvm, assigned_dev); 332 } 333 } 334 335 static int assigned_device_enable_host_intx(struct kvm *kvm, 336 struct kvm_assigned_dev_kernel *dev) 337 { 338 dev->host_irq = dev->dev->irq; 339 /* Even though this is PCI, we don't want to use shared 340 * interrupts. Sharing host devices with guest-assigned devices 341 * on the same interrupt line is not a happy situation: there 342 * are going to be long delays in accepting, acking, etc. 
343 */ 344 if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 345 0, "kvm_assigned_intx_device", (void *)dev)) 346 return -EIO; 347 return 0; 348 } 349 350 #ifdef __KVM_HAVE_MSI 351 static int assigned_device_enable_host_msi(struct kvm *kvm, 352 struct kvm_assigned_dev_kernel *dev) 353 { 354 int r; 355 356 if (!dev->dev->msi_enabled) { 357 r = pci_enable_msi(dev->dev); 358 if (r) 359 return r; 360 } 361 362 dev->host_irq = dev->dev->irq; 363 if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0, 364 "kvm_assigned_msi_device", (void *)dev)) { 365 pci_disable_msi(dev->dev); 366 return -EIO; 367 } 368 369 return 0; 370 } 371 #endif 372 373 #ifdef __KVM_HAVE_MSIX 374 static int assigned_device_enable_host_msix(struct kvm *kvm, 375 struct kvm_assigned_dev_kernel *dev) 376 { 377 int i, r = -EINVAL; 378 379 /* host_msix_entries and guest_msix_entries should have been 380 * initialized */ 381 if (dev->entries_nr == 0) 382 return r; 383 384 r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr); 385 if (r) 386 return r; 387 388 for (i = 0; i < dev->entries_nr; i++) { 389 r = request_irq(dev->host_msix_entries[i].vector, 390 kvm_assigned_dev_intr, 0, 391 "kvm_assigned_msix_device", 392 (void *)dev); 393 /* FIXME: free requested_irq's on failure */ 394 if (r) 395 return r; 396 } 397 398 return 0; 399 } 400 401 #endif 402 403 static int assigned_device_enable_guest_intx(struct kvm *kvm, 404 struct kvm_assigned_dev_kernel *dev, 405 struct kvm_assigned_irq *irq) 406 { 407 dev->guest_irq = irq->guest_irq; 408 dev->ack_notifier.gsi = irq->guest_irq; 409 return 0; 410 } 411 412 #ifdef __KVM_HAVE_MSI 413 static int assigned_device_enable_guest_msi(struct kvm *kvm, 414 struct kvm_assigned_dev_kernel *dev, 415 struct kvm_assigned_irq *irq) 416 { 417 dev->guest_irq = irq->guest_irq; 418 dev->ack_notifier.gsi = -1; 419 dev->host_irq_disabled = false; 420 return 0; 421 } 422 #endif 423 #ifdef __KVM_HAVE_MSIX 424 static int assigned_device_enable_guest_msix(struct kvm *kvm, 425 struct kvm_assigned_dev_kernel *dev, 426 struct kvm_assigned_irq *irq) 427 { 428 dev->guest_irq = irq->guest_irq; 429 dev->ack_notifier.gsi = -1; 430 dev->host_irq_disabled = false; 431 return 0; 432 } 433 #endif 434 435 static int assign_host_irq(struct kvm *kvm, 436 struct kvm_assigned_dev_kernel *dev, 437 __u32 host_irq_type) 438 { 439 int r = -EEXIST; 440 441 if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) 442 return r; 443 444 switch (host_irq_type) { 445 case KVM_DEV_IRQ_HOST_INTX: 446 r = assigned_device_enable_host_intx(kvm, dev); 447 break; 448 #ifdef __KVM_HAVE_MSI 449 case KVM_DEV_IRQ_HOST_MSI: 450 r = assigned_device_enable_host_msi(kvm, dev); 451 break; 452 #endif 453 #ifdef __KVM_HAVE_MSIX 454 case KVM_DEV_IRQ_HOST_MSIX: 455 r = assigned_device_enable_host_msix(kvm, dev); 456 break; 457 #endif 458 default: 459 r = -EINVAL; 460 } 461 462 if (!r) 463 dev->irq_requested_type |= host_irq_type; 464 465 return r; 466 } 467 468 static int assign_guest_irq(struct kvm *kvm, 469 struct kvm_assigned_dev_kernel *dev, 470 struct kvm_assigned_irq *irq, 471 unsigned long guest_irq_type) 472 { 473 int id; 474 int r = -EEXIST; 475 476 if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK) 477 return r; 478 479 id = kvm_request_irq_source_id(kvm); 480 if (id < 0) 481 return id; 482 483 dev->irq_source_id = id; 484 485 switch (guest_irq_type) { 486 case KVM_DEV_IRQ_GUEST_INTX: 487 r = assigned_device_enable_guest_intx(kvm, dev, irq); 488 break; 489 #ifdef __KVM_HAVE_MSI 490 case KVM_DEV_IRQ_GUEST_MSI: 491 r = 
assigned_device_enable_guest_msi(kvm, dev, irq); 492 break; 493 #endif 494 #ifdef __KVM_HAVE_MSIX 495 case KVM_DEV_IRQ_GUEST_MSIX: 496 r = assigned_device_enable_guest_msix(kvm, dev, irq); 497 break; 498 #endif 499 default: 500 r = -EINVAL; 501 } 502 503 if (!r) { 504 dev->irq_requested_type |= guest_irq_type; 505 kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); 506 } else 507 kvm_free_irq_source_id(kvm, dev->irq_source_id); 508 509 return r; 510 } 511 512 /* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ 513 static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, 514 struct kvm_assigned_irq *assigned_irq) 515 { 516 int r = -EINVAL; 517 struct kvm_assigned_dev_kernel *match; 518 unsigned long host_irq_type, guest_irq_type; 519 520 if (!capable(CAP_SYS_RAWIO)) 521 return -EPERM; 522 523 if (!irqchip_in_kernel(kvm)) 524 return r; 525 526 mutex_lock(&kvm->lock); 527 r = -ENODEV; 528 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 529 assigned_irq->assigned_dev_id); 530 if (!match) 531 goto out; 532 533 host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK); 534 guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK); 535 536 r = -EINVAL; 537 /* can only assign one type at a time */ 538 if (hweight_long(host_irq_type) > 1) 539 goto out; 540 if (hweight_long(guest_irq_type) > 1) 541 goto out; 542 if (host_irq_type == 0 && guest_irq_type == 0) 543 goto out; 544 545 r = 0; 546 if (host_irq_type) 547 r = assign_host_irq(kvm, match, host_irq_type); 548 if (r) 549 goto out; 550 551 if (guest_irq_type) 552 r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type); 553 out: 554 mutex_unlock(&kvm->lock); 555 return r; 556 } 557 558 static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, 559 struct kvm_assigned_irq 560 *assigned_irq) 561 { 562 int r = -ENODEV; 563 struct kvm_assigned_dev_kernel *match; 564 565 mutex_lock(&kvm->lock); 566 567 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 568 assigned_irq->assigned_dev_id); 569 if (!match) 570 goto out; 571 572 r = kvm_deassign_irq(kvm, match, assigned_irq->flags); 573 out: 574 mutex_unlock(&kvm->lock); 575 return r; 576 } 577 578 static int kvm_vm_ioctl_assign_device(struct kvm *kvm, 579 struct kvm_assigned_pci_dev *assigned_dev) 580 { 581 int r = 0; 582 struct kvm_assigned_dev_kernel *match; 583 struct pci_dev *dev; 584 585 down_read(&kvm->slots_lock); 586 mutex_lock(&kvm->lock); 587 588 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 589 assigned_dev->assigned_dev_id); 590 if (match) { 591 /* device already assigned */ 592 r = -EEXIST; 593 goto out; 594 } 595 596 match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); 597 if (match == NULL) { 598 printk(KERN_INFO "%s: Couldn't allocate memory\n", 599 __func__); 600 r = -ENOMEM; 601 goto out; 602 } 603 dev = pci_get_bus_and_slot(assigned_dev->busnr, 604 assigned_dev->devfn); 605 if (!dev) { 606 printk(KERN_INFO "%s: host device not found\n", __func__); 607 r = -EINVAL; 608 goto out_free; 609 } 610 if (pci_enable_device(dev)) { 611 printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); 612 r = -EBUSY; 613 goto out_put; 614 } 615 r = pci_request_regions(dev, "kvm_assigned_device"); 616 if (r) { 617 printk(KERN_INFO "%s: Could not get access to device regions\n", 618 __func__); 619 goto out_disable; 620 } 621 622 pci_reset_function(dev); 623 624 match->assigned_dev_id = assigned_dev->assigned_dev_id; 625 match->host_busnr = assigned_dev->busnr; 626 match->host_devfn = assigned_dev->devfn; 627 match->flags = 
assigned_dev->flags; 628 match->dev = dev; 629 spin_lock_init(&match->assigned_dev_lock); 630 match->irq_source_id = -1; 631 match->kvm = kvm; 632 match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; 633 INIT_WORK(&match->interrupt_work, 634 kvm_assigned_dev_interrupt_work_handler); 635 636 list_add(&match->list, &kvm->arch.assigned_dev_head); 637 638 if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { 639 if (!kvm->arch.iommu_domain) { 640 r = kvm_iommu_map_guest(kvm); 641 if (r) 642 goto out_list_del; 643 } 644 r = kvm_assign_device(kvm, match); 645 if (r) 646 goto out_list_del; 647 } 648 649 out: 650 mutex_unlock(&kvm->lock); 651 up_read(&kvm->slots_lock); 652 return r; 653 out_list_del: 654 list_del(&match->list); 655 pci_release_regions(dev); 656 out_disable: 657 pci_disable_device(dev); 658 out_put: 659 pci_dev_put(dev); 660 out_free: 661 kfree(match); 662 mutex_unlock(&kvm->lock); 663 up_read(&kvm->slots_lock); 664 return r; 665 } 666 #endif 667 668 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT 669 static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, 670 struct kvm_assigned_pci_dev *assigned_dev) 671 { 672 int r = 0; 673 struct kvm_assigned_dev_kernel *match; 674 675 mutex_lock(&kvm->lock); 676 677 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 678 assigned_dev->assigned_dev_id); 679 if (!match) { 680 printk(KERN_INFO "%s: device hasn't been assigned before, " 681 "so cannot be deassigned\n", __func__); 682 r = -EINVAL; 683 goto out; 684 } 685 686 if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) 687 kvm_deassign_device(kvm, match); 688 689 kvm_free_assigned_device(kvm, match); 690 691 out: 692 mutex_unlock(&kvm->lock); 693 return r; 694 } 695 #endif 696 697 inline int kvm_is_mmio_pfn(pfn_t pfn) 698 { 699 if (pfn_valid(pfn)) { 700 struct page *page = compound_head(pfn_to_page(pfn)); 701 return PageReserved(page); 702 } 703 704 return true; 705 } 706 707 /* 708 * Switches to specified vcpu, until a matching vcpu_put() 709 */ 710 void vcpu_load(struct kvm_vcpu *vcpu) 711 { 712 int cpu; 713 714 mutex_lock(&vcpu->mutex); 715 cpu = get_cpu(); 716 preempt_notifier_register(&vcpu->preempt_notifier); 717 kvm_arch_vcpu_load(vcpu, cpu); 718 put_cpu(); 719 } 720 721 void vcpu_put(struct kvm_vcpu *vcpu) 722 { 723 preempt_disable(); 724 kvm_arch_vcpu_put(vcpu); 725 preempt_notifier_unregister(&vcpu->preempt_notifier); 726 preempt_enable(); 727 mutex_unlock(&vcpu->mutex); 728 } 729 730 static void ack_flush(void *_completed) 731 { 732 } 733 734 static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) 735 { 736 int i, cpu, me; 737 cpumask_var_t cpus; 738 bool called = true; 739 struct kvm_vcpu *vcpu; 740 741 zalloc_cpumask_var(&cpus, GFP_ATOMIC); 742 743 spin_lock(&kvm->requests_lock); 744 me = smp_processor_id(); 745 kvm_for_each_vcpu(i, vcpu, kvm) { 746 if (test_and_set_bit(req, &vcpu->requests)) 747 continue; 748 cpu = vcpu->cpu; 749 if (cpus != NULL && cpu != -1 && cpu != me) 750 cpumask_set_cpu(cpu, cpus); 751 } 752 if (unlikely(cpus == NULL)) 753 smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1); 754 else if (!cpumask_empty(cpus)) 755 smp_call_function_many(cpus, ack_flush, NULL, 1); 756 else 757 called = false; 758 spin_unlock(&kvm->requests_lock); 759 free_cpumask_var(cpus); 760 return called; 761 } 762 763 void kvm_flush_remote_tlbs(struct kvm *kvm) 764 { 765 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 766 ++kvm->stat.remote_tlb_flush; 767 } 768 769 void kvm_reload_remote_mmus(struct kvm *kvm) 770 { 771 make_all_cpus_request(kvm, 
KVM_REQ_MMU_RELOAD); 772 } 773 774 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 775 { 776 struct page *page; 777 int r; 778 779 mutex_init(&vcpu->mutex); 780 vcpu->cpu = -1; 781 vcpu->kvm = kvm; 782 vcpu->vcpu_id = id; 783 init_waitqueue_head(&vcpu->wq); 784 785 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 786 if (!page) { 787 r = -ENOMEM; 788 goto fail; 789 } 790 vcpu->run = page_address(page); 791 792 r = kvm_arch_vcpu_init(vcpu); 793 if (r < 0) 794 goto fail_free_run; 795 return 0; 796 797 fail_free_run: 798 free_page((unsigned long)vcpu->run); 799 fail: 800 return r; 801 } 802 EXPORT_SYMBOL_GPL(kvm_vcpu_init); 803 804 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) 805 { 806 kvm_arch_vcpu_uninit(vcpu); 807 free_page((unsigned long)vcpu->run); 808 } 809 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); 810 811 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 812 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) 813 { 814 return container_of(mn, struct kvm, mmu_notifier); 815 } 816 817 static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, 818 struct mm_struct *mm, 819 unsigned long address) 820 { 821 struct kvm *kvm = mmu_notifier_to_kvm(mn); 822 int need_tlb_flush; 823 824 /* 825 * When ->invalidate_page runs, the linux pte has been zapped 826 * already but the page is still allocated until 827 * ->invalidate_page returns. So if we increase the sequence 828 * here the kvm page fault will notice if the spte can't be 829 * established because the page is going to be freed. If 830 * instead the kvm page fault establishes the spte before 831 * ->invalidate_page runs, kvm_unmap_hva will release it 832 * before returning. 833 * 834 * The sequence increase only need to be seen at spin_unlock 835 * time, and not at spin_lock time. 836 * 837 * Increasing the sequence after the spin_unlock would be 838 * unsafe because the kvm page fault could then establish the 839 * pte after kvm_unmap_hva returned, without noticing the page 840 * is going to be freed. 841 */ 842 spin_lock(&kvm->mmu_lock); 843 kvm->mmu_notifier_seq++; 844 need_tlb_flush = kvm_unmap_hva(kvm, address); 845 spin_unlock(&kvm->mmu_lock); 846 847 /* we've to flush the tlb before the pages can be freed */ 848 if (need_tlb_flush) 849 kvm_flush_remote_tlbs(kvm); 850 851 } 852 853 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, 854 struct mm_struct *mm, 855 unsigned long start, 856 unsigned long end) 857 { 858 struct kvm *kvm = mmu_notifier_to_kvm(mn); 859 int need_tlb_flush = 0; 860 861 spin_lock(&kvm->mmu_lock); 862 /* 863 * The count increase must become visible at unlock time as no 864 * spte can be established without taking the mmu_lock and 865 * count is also read inside the mmu_lock critical section. 866 */ 867 kvm->mmu_notifier_count++; 868 for (; start < end; start += PAGE_SIZE) 869 need_tlb_flush |= kvm_unmap_hva(kvm, start); 870 spin_unlock(&kvm->mmu_lock); 871 872 /* we've to flush the tlb before the pages can be freed */ 873 if (need_tlb_flush) 874 kvm_flush_remote_tlbs(kvm); 875 } 876 877 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 878 struct mm_struct *mm, 879 unsigned long start, 880 unsigned long end) 881 { 882 struct kvm *kvm = mmu_notifier_to_kvm(mn); 883 884 spin_lock(&kvm->mmu_lock); 885 /* 886 * This sequence increase will notify the kvm page fault that 887 * the page that is going to be mapped in the spte could have 888 * been freed. 
889 */ 890 kvm->mmu_notifier_seq++; 891 /* 892 * The above sequence increase must be visible before the 893 * below count decrease but both values are read by the kvm 894 * page fault under mmu_lock spinlock so we don't need to add 895 * a smb_wmb() here in between the two. 896 */ 897 kvm->mmu_notifier_count--; 898 spin_unlock(&kvm->mmu_lock); 899 900 BUG_ON(kvm->mmu_notifier_count < 0); 901 } 902 903 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, 904 struct mm_struct *mm, 905 unsigned long address) 906 { 907 struct kvm *kvm = mmu_notifier_to_kvm(mn); 908 int young; 909 910 spin_lock(&kvm->mmu_lock); 911 young = kvm_age_hva(kvm, address); 912 spin_unlock(&kvm->mmu_lock); 913 914 if (young) 915 kvm_flush_remote_tlbs(kvm); 916 917 return young; 918 } 919 920 static void kvm_mmu_notifier_release(struct mmu_notifier *mn, 921 struct mm_struct *mm) 922 { 923 struct kvm *kvm = mmu_notifier_to_kvm(mn); 924 kvm_arch_flush_shadow(kvm); 925 } 926 927 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { 928 .invalidate_page = kvm_mmu_notifier_invalidate_page, 929 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 930 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 931 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 932 .release = kvm_mmu_notifier_release, 933 }; 934 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ 935 936 static struct kvm *kvm_create_vm(void) 937 { 938 struct kvm *kvm = kvm_arch_create_vm(); 939 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 940 struct page *page; 941 #endif 942 943 if (IS_ERR(kvm)) 944 goto out; 945 #ifdef CONFIG_HAVE_KVM_IRQCHIP 946 INIT_LIST_HEAD(&kvm->irq_routing); 947 INIT_HLIST_HEAD(&kvm->mask_notifier_list); 948 #endif 949 950 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 951 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 952 if (!page) { 953 kfree(kvm); 954 return ERR_PTR(-ENOMEM); 955 } 956 kvm->coalesced_mmio_ring = 957 (struct kvm_coalesced_mmio_ring *)page_address(page); 958 #endif 959 960 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 961 { 962 int err; 963 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; 964 err = mmu_notifier_register(&kvm->mmu_notifier, current->mm); 965 if (err) { 966 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 967 put_page(page); 968 #endif 969 kfree(kvm); 970 return ERR_PTR(err); 971 } 972 } 973 #endif 974 975 kvm->mm = current->mm; 976 atomic_inc(&kvm->mm->mm_count); 977 spin_lock_init(&kvm->mmu_lock); 978 spin_lock_init(&kvm->requests_lock); 979 kvm_io_bus_init(&kvm->pio_bus); 980 kvm_eventfd_init(kvm); 981 mutex_init(&kvm->lock); 982 mutex_init(&kvm->irq_lock); 983 kvm_io_bus_init(&kvm->mmio_bus); 984 init_rwsem(&kvm->slots_lock); 985 atomic_set(&kvm->users_count, 1); 986 spin_lock(&kvm_lock); 987 list_add(&kvm->vm_list, &vm_list); 988 spin_unlock(&kvm_lock); 989 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 990 kvm_coalesced_mmio_init(kvm); 991 #endif 992 out: 993 return kvm; 994 } 995 996 /* 997 * Free any memory in @free but not in @dont. 
998 */ 999 static void kvm_free_physmem_slot(struct kvm_memory_slot *free, 1000 struct kvm_memory_slot *dont) 1001 { 1002 int i; 1003 1004 if (!dont || free->rmap != dont->rmap) 1005 vfree(free->rmap); 1006 1007 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 1008 vfree(free->dirty_bitmap); 1009 1010 1011 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 1012 if (!dont || free->lpage_info[i] != dont->lpage_info[i]) { 1013 vfree(free->lpage_info[i]); 1014 free->lpage_info[i] = NULL; 1015 } 1016 } 1017 1018 free->npages = 0; 1019 free->dirty_bitmap = NULL; 1020 free->rmap = NULL; 1021 } 1022 1023 void kvm_free_physmem(struct kvm *kvm) 1024 { 1025 int i; 1026 1027 for (i = 0; i < kvm->nmemslots; ++i) 1028 kvm_free_physmem_slot(&kvm->memslots[i], NULL); 1029 } 1030 1031 static void kvm_destroy_vm(struct kvm *kvm) 1032 { 1033 struct mm_struct *mm = kvm->mm; 1034 1035 kvm_arch_sync_events(kvm); 1036 spin_lock(&kvm_lock); 1037 list_del(&kvm->vm_list); 1038 spin_unlock(&kvm_lock); 1039 kvm_free_irq_routing(kvm); 1040 kvm_io_bus_destroy(&kvm->pio_bus); 1041 kvm_io_bus_destroy(&kvm->mmio_bus); 1042 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1043 if (kvm->coalesced_mmio_ring != NULL) 1044 free_page((unsigned long)kvm->coalesced_mmio_ring); 1045 #endif 1046 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 1047 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 1048 #else 1049 kvm_arch_flush_shadow(kvm); 1050 #endif 1051 kvm_arch_destroy_vm(kvm); 1052 mmdrop(mm); 1053 } 1054 1055 void kvm_get_kvm(struct kvm *kvm) 1056 { 1057 atomic_inc(&kvm->users_count); 1058 } 1059 EXPORT_SYMBOL_GPL(kvm_get_kvm); 1060 1061 void kvm_put_kvm(struct kvm *kvm) 1062 { 1063 if (atomic_dec_and_test(&kvm->users_count)) 1064 kvm_destroy_vm(kvm); 1065 } 1066 EXPORT_SYMBOL_GPL(kvm_put_kvm); 1067 1068 1069 static int kvm_vm_release(struct inode *inode, struct file *filp) 1070 { 1071 struct kvm *kvm = filp->private_data; 1072 1073 kvm_irqfd_release(kvm); 1074 1075 kvm_put_kvm(kvm); 1076 return 0; 1077 } 1078 1079 /* 1080 * Allocate some memory and give it an address in the guest physical address 1081 * space. 1082 * 1083 * Discontiguous memory is allowed, mostly for framebuffers. 1084 * 1085 * Must be called holding mmap_sem for write. 1086 */ 1087 int __kvm_set_memory_region(struct kvm *kvm, 1088 struct kvm_userspace_memory_region *mem, 1089 int user_alloc) 1090 { 1091 int r; 1092 gfn_t base_gfn; 1093 unsigned long npages; 1094 unsigned long i; 1095 struct kvm_memory_slot *memslot; 1096 struct kvm_memory_slot old, new; 1097 1098 r = -EINVAL; 1099 /* General sanity checks */ 1100 if (mem->memory_size & (PAGE_SIZE - 1)) 1101 goto out; 1102 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 1103 goto out; 1104 if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1))) 1105 goto out; 1106 if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) 1107 goto out; 1108 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 1109 goto out; 1110 1111 memslot = &kvm->memslots[mem->slot]; 1112 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 1113 npages = mem->memory_size >> PAGE_SHIFT; 1114 1115 if (!npages) 1116 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; 1117 1118 new = old = *memslot; 1119 1120 new.base_gfn = base_gfn; 1121 new.npages = npages; 1122 new.flags = mem->flags; 1123 1124 /* Disallow changing a memory slot's size. 
*/ 1125 r = -EINVAL; 1126 if (npages && old.npages && npages != old.npages) 1127 goto out_free; 1128 1129 /* Check for overlaps */ 1130 r = -EEXIST; 1131 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 1132 struct kvm_memory_slot *s = &kvm->memslots[i]; 1133 1134 if (s == memslot || !s->npages) 1135 continue; 1136 if (!((base_gfn + npages <= s->base_gfn) || 1137 (base_gfn >= s->base_gfn + s->npages))) 1138 goto out_free; 1139 } 1140 1141 /* Free page dirty bitmap if unneeded */ 1142 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 1143 new.dirty_bitmap = NULL; 1144 1145 r = -ENOMEM; 1146 1147 /* Allocate if a slot is being created */ 1148 #ifndef CONFIG_S390 1149 if (npages && !new.rmap) { 1150 new.rmap = vmalloc(npages * sizeof(struct page *)); 1151 1152 if (!new.rmap) 1153 goto out_free; 1154 1155 memset(new.rmap, 0, npages * sizeof(*new.rmap)); 1156 1157 new.user_alloc = user_alloc; 1158 /* 1159 * hva_to_rmmap() serialzies with the mmu_lock and to be 1160 * safe it has to ignore memslots with !user_alloc && 1161 * !userspace_addr. 1162 */ 1163 if (user_alloc) 1164 new.userspace_addr = mem->userspace_addr; 1165 else 1166 new.userspace_addr = 0; 1167 } 1168 if (!npages) 1169 goto skip_lpage; 1170 1171 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 1172 unsigned long ugfn; 1173 unsigned long j; 1174 int lpages; 1175 int level = i + 2; 1176 1177 /* Avoid unused variable warning if no large pages */ 1178 (void)level; 1179 1180 if (new.lpage_info[i]) 1181 continue; 1182 1183 lpages = 1 + (base_gfn + npages - 1) / 1184 KVM_PAGES_PER_HPAGE(level); 1185 lpages -= base_gfn / KVM_PAGES_PER_HPAGE(level); 1186 1187 new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i])); 1188 1189 if (!new.lpage_info[i]) 1190 goto out_free; 1191 1192 memset(new.lpage_info[i], 0, 1193 lpages * sizeof(*new.lpage_info[i])); 1194 1195 if (base_gfn % KVM_PAGES_PER_HPAGE(level)) 1196 new.lpage_info[i][0].write_count = 1; 1197 if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE(level)) 1198 new.lpage_info[i][lpages - 1].write_count = 1; 1199 ugfn = new.userspace_addr >> PAGE_SHIFT; 1200 /* 1201 * If the gfn and userspace address are not aligned wrt each 1202 * other, or if explicitly asked to, disable large page 1203 * support for this slot 1204 */ 1205 if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || 1206 !largepages_enabled) 1207 for (j = 0; j < lpages; ++j) 1208 new.lpage_info[i][j].write_count = 1; 1209 } 1210 1211 skip_lpage: 1212 1213 /* Allocate page dirty bitmap if needed */ 1214 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 1215 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; 1216 1217 new.dirty_bitmap = vmalloc(dirty_bytes); 1218 if (!new.dirty_bitmap) 1219 goto out_free; 1220 memset(new.dirty_bitmap, 0, dirty_bytes); 1221 if (old.npages) 1222 kvm_arch_flush_shadow(kvm); 1223 } 1224 #else /* not defined CONFIG_S390 */ 1225 new.user_alloc = user_alloc; 1226 if (user_alloc) 1227 new.userspace_addr = mem->userspace_addr; 1228 #endif /* not defined CONFIG_S390 */ 1229 1230 if (!npages) 1231 kvm_arch_flush_shadow(kvm); 1232 1233 spin_lock(&kvm->mmu_lock); 1234 if (mem->slot >= kvm->nmemslots) 1235 kvm->nmemslots = mem->slot + 1; 1236 1237 *memslot = new; 1238 spin_unlock(&kvm->mmu_lock); 1239 1240 r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc); 1241 if (r) { 1242 spin_lock(&kvm->mmu_lock); 1243 *memslot = old; 1244 spin_unlock(&kvm->mmu_lock); 1245 goto out_free; 1246 } 1247 1248 kvm_free_physmem_slot(&old, npages ? 
&new : NULL); 1249 /* Slot deletion case: we have to update the current slot */ 1250 spin_lock(&kvm->mmu_lock); 1251 if (!npages) 1252 *memslot = old; 1253 spin_unlock(&kvm->mmu_lock); 1254 #ifdef CONFIG_DMAR 1255 /* map the pages in iommu page table */ 1256 r = kvm_iommu_map_pages(kvm, base_gfn, npages); 1257 if (r) 1258 goto out; 1259 #endif 1260 return 0; 1261 1262 out_free: 1263 kvm_free_physmem_slot(&new, &old); 1264 out: 1265 return r; 1266 1267 } 1268 EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 1269 1270 int kvm_set_memory_region(struct kvm *kvm, 1271 struct kvm_userspace_memory_region *mem, 1272 int user_alloc) 1273 { 1274 int r; 1275 1276 down_write(&kvm->slots_lock); 1277 r = __kvm_set_memory_region(kvm, mem, user_alloc); 1278 up_write(&kvm->slots_lock); 1279 return r; 1280 } 1281 EXPORT_SYMBOL_GPL(kvm_set_memory_region); 1282 1283 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 1284 struct 1285 kvm_userspace_memory_region *mem, 1286 int user_alloc) 1287 { 1288 if (mem->slot >= KVM_MEMORY_SLOTS) 1289 return -EINVAL; 1290 return kvm_set_memory_region(kvm, mem, user_alloc); 1291 } 1292 1293 int kvm_get_dirty_log(struct kvm *kvm, 1294 struct kvm_dirty_log *log, int *is_dirty) 1295 { 1296 struct kvm_memory_slot *memslot; 1297 int r, i; 1298 int n; 1299 unsigned long any = 0; 1300 1301 r = -EINVAL; 1302 if (log->slot >= KVM_MEMORY_SLOTS) 1303 goto out; 1304 1305 memslot = &kvm->memslots[log->slot]; 1306 r = -ENOENT; 1307 if (!memslot->dirty_bitmap) 1308 goto out; 1309 1310 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 1311 1312 for (i = 0; !any && i < n/sizeof(long); ++i) 1313 any = memslot->dirty_bitmap[i]; 1314 1315 r = -EFAULT; 1316 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 1317 goto out; 1318 1319 if (any) 1320 *is_dirty = 1; 1321 1322 r = 0; 1323 out: 1324 return r; 1325 } 1326 1327 void kvm_disable_largepages(void) 1328 { 1329 largepages_enabled = false; 1330 } 1331 EXPORT_SYMBOL_GPL(kvm_disable_largepages); 1332 1333 int is_error_page(struct page *page) 1334 { 1335 return page == bad_page; 1336 } 1337 EXPORT_SYMBOL_GPL(is_error_page); 1338 1339 int is_error_pfn(pfn_t pfn) 1340 { 1341 return pfn == bad_pfn; 1342 } 1343 EXPORT_SYMBOL_GPL(is_error_pfn); 1344 1345 static inline unsigned long bad_hva(void) 1346 { 1347 return PAGE_OFFSET; 1348 } 1349 1350 int kvm_is_error_hva(unsigned long addr) 1351 { 1352 return addr == bad_hva(); 1353 } 1354 EXPORT_SYMBOL_GPL(kvm_is_error_hva); 1355 1356 struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn) 1357 { 1358 int i; 1359 1360 for (i = 0; i < kvm->nmemslots; ++i) { 1361 struct kvm_memory_slot *memslot = &kvm->memslots[i]; 1362 1363 if (gfn >= memslot->base_gfn 1364 && gfn < memslot->base_gfn + memslot->npages) 1365 return memslot; 1366 } 1367 return NULL; 1368 } 1369 EXPORT_SYMBOL_GPL(gfn_to_memslot_unaliased); 1370 1371 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 1372 { 1373 gfn = unalias_gfn(kvm, gfn); 1374 return gfn_to_memslot_unaliased(kvm, gfn); 1375 } 1376 1377 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) 1378 { 1379 int i; 1380 1381 gfn = unalias_gfn(kvm, gfn); 1382 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 1383 struct kvm_memory_slot *memslot = &kvm->memslots[i]; 1384 1385 if (gfn >= memslot->base_gfn 1386 && gfn < memslot->base_gfn + memslot->npages) 1387 return 1; 1388 } 1389 return 0; 1390 } 1391 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); 1392 1393 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 1394 { 1395 struct kvm_memory_slot *slot; 1396 1397 gfn 
= unalias_gfn(kvm, gfn); 1398 slot = gfn_to_memslot_unaliased(kvm, gfn); 1399 if (!slot) 1400 return bad_hva(); 1401 return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); 1402 } 1403 EXPORT_SYMBOL_GPL(gfn_to_hva); 1404 1405 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 1406 { 1407 struct page *page[1]; 1408 unsigned long addr; 1409 int npages; 1410 pfn_t pfn; 1411 1412 might_sleep(); 1413 1414 addr = gfn_to_hva(kvm, gfn); 1415 if (kvm_is_error_hva(addr)) { 1416 get_page(bad_page); 1417 return page_to_pfn(bad_page); 1418 } 1419 1420 npages = get_user_pages_fast(addr, 1, 1, page); 1421 1422 if (unlikely(npages != 1)) { 1423 struct vm_area_struct *vma; 1424 1425 down_read(¤t->mm->mmap_sem); 1426 vma = find_vma(current->mm, addr); 1427 1428 if (vma == NULL || addr < vma->vm_start || 1429 !(vma->vm_flags & VM_PFNMAP)) { 1430 up_read(¤t->mm->mmap_sem); 1431 get_page(bad_page); 1432 return page_to_pfn(bad_page); 1433 } 1434 1435 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 1436 up_read(¤t->mm->mmap_sem); 1437 BUG_ON(!kvm_is_mmio_pfn(pfn)); 1438 } else 1439 pfn = page_to_pfn(page[0]); 1440 1441 return pfn; 1442 } 1443 1444 EXPORT_SYMBOL_GPL(gfn_to_pfn); 1445 1446 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1447 { 1448 pfn_t pfn; 1449 1450 pfn = gfn_to_pfn(kvm, gfn); 1451 if (!kvm_is_mmio_pfn(pfn)) 1452 return pfn_to_page(pfn); 1453 1454 WARN_ON(kvm_is_mmio_pfn(pfn)); 1455 1456 get_page(bad_page); 1457 return bad_page; 1458 } 1459 1460 EXPORT_SYMBOL_GPL(gfn_to_page); 1461 1462 void kvm_release_page_clean(struct page *page) 1463 { 1464 kvm_release_pfn_clean(page_to_pfn(page)); 1465 } 1466 EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1467 1468 void kvm_release_pfn_clean(pfn_t pfn) 1469 { 1470 if (!kvm_is_mmio_pfn(pfn)) 1471 put_page(pfn_to_page(pfn)); 1472 } 1473 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 1474 1475 void kvm_release_page_dirty(struct page *page) 1476 { 1477 kvm_release_pfn_dirty(page_to_pfn(page)); 1478 } 1479 EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1480 1481 void kvm_release_pfn_dirty(pfn_t pfn) 1482 { 1483 kvm_set_pfn_dirty(pfn); 1484 kvm_release_pfn_clean(pfn); 1485 } 1486 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); 1487 1488 void kvm_set_page_dirty(struct page *page) 1489 { 1490 kvm_set_pfn_dirty(page_to_pfn(page)); 1491 } 1492 EXPORT_SYMBOL_GPL(kvm_set_page_dirty); 1493 1494 void kvm_set_pfn_dirty(pfn_t pfn) 1495 { 1496 if (!kvm_is_mmio_pfn(pfn)) { 1497 struct page *page = pfn_to_page(pfn); 1498 if (!PageReserved(page)) 1499 SetPageDirty(page); 1500 } 1501 } 1502 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 1503 1504 void kvm_set_pfn_accessed(pfn_t pfn) 1505 { 1506 if (!kvm_is_mmio_pfn(pfn)) 1507 mark_page_accessed(pfn_to_page(pfn)); 1508 } 1509 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 1510 1511 void kvm_get_pfn(pfn_t pfn) 1512 { 1513 if (!kvm_is_mmio_pfn(pfn)) 1514 get_page(pfn_to_page(pfn)); 1515 } 1516 EXPORT_SYMBOL_GPL(kvm_get_pfn); 1517 1518 static int next_segment(unsigned long len, int offset) 1519 { 1520 if (len > PAGE_SIZE - offset) 1521 return PAGE_SIZE - offset; 1522 else 1523 return len; 1524 } 1525 1526 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 1527 int len) 1528 { 1529 int r; 1530 unsigned long addr; 1531 1532 addr = gfn_to_hva(kvm, gfn); 1533 if (kvm_is_error_hva(addr)) 1534 return -EFAULT; 1535 r = copy_from_user(data, (void __user *)addr + offset, len); 1536 if (r) 1537 return -EFAULT; 1538 return 0; 1539 } 1540 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 1541 1542 int kvm_read_guest(struct kvm *kvm, 
gpa_t gpa, void *data, unsigned long len) 1543 { 1544 gfn_t gfn = gpa >> PAGE_SHIFT; 1545 int seg; 1546 int offset = offset_in_page(gpa); 1547 int ret; 1548 1549 while ((seg = next_segment(len, offset)) != 0) { 1550 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 1551 if (ret < 0) 1552 return ret; 1553 offset = 0; 1554 len -= seg; 1555 data += seg; 1556 ++gfn; 1557 } 1558 return 0; 1559 } 1560 EXPORT_SYMBOL_GPL(kvm_read_guest); 1561 1562 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 1563 unsigned long len) 1564 { 1565 int r; 1566 unsigned long addr; 1567 gfn_t gfn = gpa >> PAGE_SHIFT; 1568 int offset = offset_in_page(gpa); 1569 1570 addr = gfn_to_hva(kvm, gfn); 1571 if (kvm_is_error_hva(addr)) 1572 return -EFAULT; 1573 pagefault_disable(); 1574 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 1575 pagefault_enable(); 1576 if (r) 1577 return -EFAULT; 1578 return 0; 1579 } 1580 EXPORT_SYMBOL(kvm_read_guest_atomic); 1581 1582 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, 1583 int offset, int len) 1584 { 1585 int r; 1586 unsigned long addr; 1587 1588 addr = gfn_to_hva(kvm, gfn); 1589 if (kvm_is_error_hva(addr)) 1590 return -EFAULT; 1591 r = copy_to_user((void __user *)addr + offset, data, len); 1592 if (r) 1593 return -EFAULT; 1594 mark_page_dirty(kvm, gfn); 1595 return 0; 1596 } 1597 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 1598 1599 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 1600 unsigned long len) 1601 { 1602 gfn_t gfn = gpa >> PAGE_SHIFT; 1603 int seg; 1604 int offset = offset_in_page(gpa); 1605 int ret; 1606 1607 while ((seg = next_segment(len, offset)) != 0) { 1608 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 1609 if (ret < 0) 1610 return ret; 1611 offset = 0; 1612 len -= seg; 1613 data += seg; 1614 ++gfn; 1615 } 1616 return 0; 1617 } 1618 1619 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 1620 { 1621 return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); 1622 } 1623 EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 1624 1625 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 1626 { 1627 gfn_t gfn = gpa >> PAGE_SHIFT; 1628 int seg; 1629 int offset = offset_in_page(gpa); 1630 int ret; 1631 1632 while ((seg = next_segment(len, offset)) != 0) { 1633 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 1634 if (ret < 0) 1635 return ret; 1636 offset = 0; 1637 len -= seg; 1638 ++gfn; 1639 } 1640 return 0; 1641 } 1642 EXPORT_SYMBOL_GPL(kvm_clear_guest); 1643 1644 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 1645 { 1646 struct kvm_memory_slot *memslot; 1647 1648 gfn = unalias_gfn(kvm, gfn); 1649 memslot = gfn_to_memslot_unaliased(kvm, gfn); 1650 if (memslot && memslot->dirty_bitmap) { 1651 unsigned long rel_gfn = gfn - memslot->base_gfn; 1652 1653 /* avoid RMW */ 1654 if (!test_bit(rel_gfn, memslot->dirty_bitmap)) 1655 set_bit(rel_gfn, memslot->dirty_bitmap); 1656 } 1657 } 1658 1659 /* 1660 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 
1661 */ 1662 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 1663 { 1664 DEFINE_WAIT(wait); 1665 1666 for (;;) { 1667 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 1668 1669 if (kvm_arch_vcpu_runnable(vcpu)) { 1670 set_bit(KVM_REQ_UNHALT, &vcpu->requests); 1671 break; 1672 } 1673 if (kvm_cpu_has_pending_timer(vcpu)) 1674 break; 1675 if (signal_pending(current)) 1676 break; 1677 1678 vcpu_put(vcpu); 1679 schedule(); 1680 vcpu_load(vcpu); 1681 } 1682 1683 finish_wait(&vcpu->wq, &wait); 1684 } 1685 1686 void kvm_resched(struct kvm_vcpu *vcpu) 1687 { 1688 if (!need_resched()) 1689 return; 1690 cond_resched(); 1691 } 1692 EXPORT_SYMBOL_GPL(kvm_resched); 1693 1694 static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1695 { 1696 struct kvm_vcpu *vcpu = vma->vm_file->private_data; 1697 struct page *page; 1698 1699 if (vmf->pgoff == 0) 1700 page = virt_to_page(vcpu->run); 1701 #ifdef CONFIG_X86 1702 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 1703 page = virt_to_page(vcpu->arch.pio_data); 1704 #endif 1705 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1706 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 1707 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 1708 #endif 1709 else 1710 return VM_FAULT_SIGBUS; 1711 get_page(page); 1712 vmf->page = page; 1713 return 0; 1714 } 1715 1716 static const struct vm_operations_struct kvm_vcpu_vm_ops = { 1717 .fault = kvm_vcpu_fault, 1718 }; 1719 1720 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 1721 { 1722 vma->vm_ops = &kvm_vcpu_vm_ops; 1723 return 0; 1724 } 1725 1726 static int kvm_vcpu_release(struct inode *inode, struct file *filp) 1727 { 1728 struct kvm_vcpu *vcpu = filp->private_data; 1729 1730 kvm_put_kvm(vcpu->kvm); 1731 return 0; 1732 } 1733 1734 static struct file_operations kvm_vcpu_fops = { 1735 .release = kvm_vcpu_release, 1736 .unlocked_ioctl = kvm_vcpu_ioctl, 1737 .compat_ioctl = kvm_vcpu_ioctl, 1738 .mmap = kvm_vcpu_mmap, 1739 }; 1740 1741 /* 1742 * Allocates an inode for the vcpu. 1743 */ 1744 static int create_vcpu_fd(struct kvm_vcpu *vcpu) 1745 { 1746 return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0); 1747 } 1748 1749 /* 1750 * Creates some virtual cpus. Good luck creating more than one. 
1751 */ 1752 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) 1753 { 1754 int r; 1755 struct kvm_vcpu *vcpu, *v; 1756 1757 vcpu = kvm_arch_vcpu_create(kvm, id); 1758 if (IS_ERR(vcpu)) 1759 return PTR_ERR(vcpu); 1760 1761 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 1762 1763 r = kvm_arch_vcpu_setup(vcpu); 1764 if (r) 1765 return r; 1766 1767 mutex_lock(&kvm->lock); 1768 if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { 1769 r = -EINVAL; 1770 goto vcpu_destroy; 1771 } 1772 1773 kvm_for_each_vcpu(r, v, kvm) 1774 if (v->vcpu_id == id) { 1775 r = -EEXIST; 1776 goto vcpu_destroy; 1777 } 1778 1779 BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); 1780 1781 /* Now it's all set up, let userspace reach it */ 1782 kvm_get_kvm(kvm); 1783 r = create_vcpu_fd(vcpu); 1784 if (r < 0) { 1785 kvm_put_kvm(kvm); 1786 goto vcpu_destroy; 1787 } 1788 1789 kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; 1790 smp_wmb(); 1791 atomic_inc(&kvm->online_vcpus); 1792 1793 #ifdef CONFIG_KVM_APIC_ARCHITECTURE 1794 if (kvm->bsp_vcpu_id == id) 1795 kvm->bsp_vcpu = vcpu; 1796 #endif 1797 mutex_unlock(&kvm->lock); 1798 return r; 1799 1800 vcpu_destroy: 1801 mutex_unlock(&kvm->lock); 1802 kvm_arch_vcpu_destroy(vcpu); 1803 return r; 1804 } 1805 1806 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 1807 { 1808 if (sigset) { 1809 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 1810 vcpu->sigset_active = 1; 1811 vcpu->sigset = *sigset; 1812 } else 1813 vcpu->sigset_active = 0; 1814 return 0; 1815 } 1816 1817 #ifdef __KVM_HAVE_MSIX 1818 static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, 1819 struct kvm_assigned_msix_nr *entry_nr) 1820 { 1821 int r = 0; 1822 struct kvm_assigned_dev_kernel *adev; 1823 1824 mutex_lock(&kvm->lock); 1825 1826 adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 1827 entry_nr->assigned_dev_id); 1828 if (!adev) { 1829 r = -EINVAL; 1830 goto msix_nr_out; 1831 } 1832 1833 if (adev->entries_nr == 0) { 1834 adev->entries_nr = entry_nr->entry_nr; 1835 if (adev->entries_nr == 0 || 1836 adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) { 1837 r = -EINVAL; 1838 goto msix_nr_out; 1839 } 1840 1841 adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) * 1842 entry_nr->entry_nr, 1843 GFP_KERNEL); 1844 if (!adev->host_msix_entries) { 1845 r = -ENOMEM; 1846 goto msix_nr_out; 1847 } 1848 adev->guest_msix_entries = kzalloc( 1849 sizeof(struct kvm_guest_msix_entry) * 1850 entry_nr->entry_nr, GFP_KERNEL); 1851 if (!adev->guest_msix_entries) { 1852 kfree(adev->host_msix_entries); 1853 r = -ENOMEM; 1854 goto msix_nr_out; 1855 } 1856 } else /* Not allowed set MSI-X number twice */ 1857 r = -EINVAL; 1858 msix_nr_out: 1859 mutex_unlock(&kvm->lock); 1860 return r; 1861 } 1862 1863 static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, 1864 struct kvm_assigned_msix_entry *entry) 1865 { 1866 int r = 0, i; 1867 struct kvm_assigned_dev_kernel *adev; 1868 1869 mutex_lock(&kvm->lock); 1870 1871 adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 1872 entry->assigned_dev_id); 1873 1874 if (!adev) { 1875 r = -EINVAL; 1876 goto msix_entry_out; 1877 } 1878 1879 for (i = 0; i < adev->entries_nr; i++) 1880 if (adev->guest_msix_entries[i].vector == 0 || 1881 adev->guest_msix_entries[i].entry == entry->entry) { 1882 adev->guest_msix_entries[i].entry = entry->entry; 1883 adev->guest_msix_entries[i].vector = entry->gsi; 1884 adev->host_msix_entries[i].entry = entry->entry; 1885 break; 1886 } 1887 if (i == adev->entries_nr) { 1888 r = -ENOSPC; 
1889 goto msix_entry_out; 1890 } 1891 1892 msix_entry_out: 1893 mutex_unlock(&kvm->lock); 1894 1895 return r; 1896 } 1897 #endif 1898 1899 static long kvm_vcpu_ioctl(struct file *filp, 1900 unsigned int ioctl, unsigned long arg) 1901 { 1902 struct kvm_vcpu *vcpu = filp->private_data; 1903 void __user *argp = (void __user *)arg; 1904 int r; 1905 struct kvm_fpu *fpu = NULL; 1906 struct kvm_sregs *kvm_sregs = NULL; 1907 1908 if (vcpu->kvm->mm != current->mm) 1909 return -EIO; 1910 switch (ioctl) { 1911 case KVM_RUN: 1912 r = -EINVAL; 1913 if (arg) 1914 goto out; 1915 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 1916 break; 1917 case KVM_GET_REGS: { 1918 struct kvm_regs *kvm_regs; 1919 1920 r = -ENOMEM; 1921 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 1922 if (!kvm_regs) 1923 goto out; 1924 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 1925 if (r) 1926 goto out_free1; 1927 r = -EFAULT; 1928 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 1929 goto out_free1; 1930 r = 0; 1931 out_free1: 1932 kfree(kvm_regs); 1933 break; 1934 } 1935 case KVM_SET_REGS: { 1936 struct kvm_regs *kvm_regs; 1937 1938 r = -ENOMEM; 1939 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 1940 if (!kvm_regs) 1941 goto out; 1942 r = -EFAULT; 1943 if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs))) 1944 goto out_free2; 1945 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 1946 if (r) 1947 goto out_free2; 1948 r = 0; 1949 out_free2: 1950 kfree(kvm_regs); 1951 break; 1952 } 1953 case KVM_GET_SREGS: { 1954 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 1955 r = -ENOMEM; 1956 if (!kvm_sregs) 1957 goto out; 1958 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 1959 if (r) 1960 goto out; 1961 r = -EFAULT; 1962 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 1963 goto out; 1964 r = 0; 1965 break; 1966 } 1967 case KVM_SET_SREGS: { 1968 kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 1969 r = -ENOMEM; 1970 if (!kvm_sregs) 1971 goto out; 1972 r = -EFAULT; 1973 if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs))) 1974 goto out; 1975 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 1976 if (r) 1977 goto out; 1978 r = 0; 1979 break; 1980 } 1981 case KVM_GET_MP_STATE: { 1982 struct kvm_mp_state mp_state; 1983 1984 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 1985 if (r) 1986 goto out; 1987 r = -EFAULT; 1988 if (copy_to_user(argp, &mp_state, sizeof mp_state)) 1989 goto out; 1990 r = 0; 1991 break; 1992 } 1993 case KVM_SET_MP_STATE: { 1994 struct kvm_mp_state mp_state; 1995 1996 r = -EFAULT; 1997 if (copy_from_user(&mp_state, argp, sizeof mp_state)) 1998 goto out; 1999 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 2000 if (r) 2001 goto out; 2002 r = 0; 2003 break; 2004 } 2005 case KVM_TRANSLATE: { 2006 struct kvm_translation tr; 2007 2008 r = -EFAULT; 2009 if (copy_from_user(&tr, argp, sizeof tr)) 2010 goto out; 2011 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 2012 if (r) 2013 goto out; 2014 r = -EFAULT; 2015 if (copy_to_user(argp, &tr, sizeof tr)) 2016 goto out; 2017 r = 0; 2018 break; 2019 } 2020 case KVM_SET_GUEST_DEBUG: { 2021 struct kvm_guest_debug dbg; 2022 2023 r = -EFAULT; 2024 if (copy_from_user(&dbg, argp, sizeof dbg)) 2025 goto out; 2026 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 2027 if (r) 2028 goto out; 2029 r = 0; 2030 break; 2031 } 2032 case KVM_SET_SIGNAL_MASK: { 2033 struct kvm_signal_mask __user *sigmask_arg = argp; 2034 struct kvm_signal_mask kvm_sigmask; 2035 sigset_t sigset, *p; 2036 2037 p = 
NULL; 2038 if (argp) { 2039 r = -EFAULT; 2040 if (copy_from_user(&kvm_sigmask, argp, 2041 sizeof kvm_sigmask)) 2042 goto out; 2043 r = -EINVAL; 2044 if (kvm_sigmask.len != sizeof sigset) 2045 goto out; 2046 r = -EFAULT; 2047 if (copy_from_user(&sigset, sigmask_arg->sigset, 2048 sizeof sigset)) 2049 goto out; 2050 p = &sigset; 2051 } 2052 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 2053 break; 2054 } 2055 case KVM_GET_FPU: { 2056 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 2057 r = -ENOMEM; 2058 if (!fpu) 2059 goto out; 2060 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); 2061 if (r) 2062 goto out; 2063 r = -EFAULT; 2064 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) 2065 goto out; 2066 r = 0; 2067 break; 2068 } 2069 case KVM_SET_FPU: { 2070 fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 2071 r = -ENOMEM; 2072 if (!fpu) 2073 goto out; 2074 r = -EFAULT; 2075 if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu))) 2076 goto out; 2077 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 2078 if (r) 2079 goto out; 2080 r = 0; 2081 break; 2082 } 2083 default: 2084 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 2085 } 2086 out: 2087 kfree(fpu); 2088 kfree(kvm_sregs); 2089 return r; 2090 } 2091 2092 static long kvm_vm_ioctl(struct file *filp, 2093 unsigned int ioctl, unsigned long arg) 2094 { 2095 struct kvm *kvm = filp->private_data; 2096 void __user *argp = (void __user *)arg; 2097 int r; 2098 2099 if (kvm->mm != current->mm) 2100 return -EIO; 2101 switch (ioctl) { 2102 case KVM_CREATE_VCPU: 2103 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 2104 if (r < 0) 2105 goto out; 2106 break; 2107 case KVM_SET_USER_MEMORY_REGION: { 2108 struct kvm_userspace_memory_region kvm_userspace_mem; 2109 2110 r = -EFAULT; 2111 if (copy_from_user(&kvm_userspace_mem, argp, 2112 sizeof kvm_userspace_mem)) 2113 goto out; 2114 2115 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1); 2116 if (r) 2117 goto out; 2118 break; 2119 } 2120 case KVM_GET_DIRTY_LOG: { 2121 struct kvm_dirty_log log; 2122 2123 r = -EFAULT; 2124 if (copy_from_user(&log, argp, sizeof log)) 2125 goto out; 2126 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2127 if (r) 2128 goto out; 2129 break; 2130 } 2131 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2132 case KVM_REGISTER_COALESCED_MMIO: { 2133 struct kvm_coalesced_mmio_zone zone; 2134 r = -EFAULT; 2135 if (copy_from_user(&zone, argp, sizeof zone)) 2136 goto out; 2137 r = -ENXIO; 2138 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 2139 if (r) 2140 goto out; 2141 r = 0; 2142 break; 2143 } 2144 case KVM_UNREGISTER_COALESCED_MMIO: { 2145 struct kvm_coalesced_mmio_zone zone; 2146 r = -EFAULT; 2147 if (copy_from_user(&zone, argp, sizeof zone)) 2148 goto out; 2149 r = -ENXIO; 2150 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 2151 if (r) 2152 goto out; 2153 r = 0; 2154 break; 2155 } 2156 #endif 2157 #ifdef KVM_CAP_DEVICE_ASSIGNMENT 2158 case KVM_ASSIGN_PCI_DEVICE: { 2159 struct kvm_assigned_pci_dev assigned_dev; 2160 2161 r = -EFAULT; 2162 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) 2163 goto out; 2164 r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); 2165 if (r) 2166 goto out; 2167 break; 2168 } 2169 case KVM_ASSIGN_IRQ: { 2170 r = -EOPNOTSUPP; 2171 break; 2172 } 2173 #ifdef KVM_CAP_ASSIGN_DEV_IRQ 2174 case KVM_ASSIGN_DEV_IRQ: { 2175 struct kvm_assigned_irq assigned_irq; 2176 2177 r = -EFAULT; 2178 if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) 2179 goto out; 2180 r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); 2181 if (r) 2182 goto out; 2183 break; 2184 
static long kvm_vm_ioctl(struct file *filp,
			 unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;

	if (kvm->mm != current->mm)
		return -EIO;
	switch (ioctl) {
	case KVM_CREATE_VCPU:
		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
		if (r < 0)
			goto out;
		break;
	case KVM_SET_USER_MEMORY_REGION: {
		struct kvm_userspace_memory_region kvm_userspace_mem;

		r = -EFAULT;
		if (copy_from_user(&kvm_userspace_mem, argp,
				   sizeof kvm_userspace_mem))
			goto out;

		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
		if (r)
			goto out;
		break;
	}
	case KVM_GET_DIRTY_LOG: {
		struct kvm_dirty_log log;

		r = -EFAULT;
		if (copy_from_user(&log, argp, sizeof log))
			goto out;
		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
		if (r)
			goto out;
		break;
	}
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	case KVM_REGISTER_COALESCED_MMIO: {
		struct kvm_coalesced_mmio_zone zone;

		r = -EFAULT;
		if (copy_from_user(&zone, argp, sizeof zone))
			goto out;
		r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_UNREGISTER_COALESCED_MMIO: {
		struct kvm_coalesced_mmio_zone zone;

		r = -EFAULT;
		if (copy_from_user(&zone, argp, sizeof zone))
			goto out;
		r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
		if (r)
			goto out;
		r = 0;
		break;
	}
#endif
#ifdef KVM_CAP_DEVICE_ASSIGNMENT
	case KVM_ASSIGN_PCI_DEVICE: {
		struct kvm_assigned_pci_dev assigned_dev;

		r = -EFAULT;
		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
			goto out;
		r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
		if (r)
			goto out;
		break;
	}
	case KVM_ASSIGN_IRQ: {
		r = -EOPNOTSUPP;
		break;
	}
#ifdef KVM_CAP_ASSIGN_DEV_IRQ
	case KVM_ASSIGN_DEV_IRQ: {
		struct kvm_assigned_irq assigned_irq;

		r = -EFAULT;
		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
			goto out;
		r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
		if (r)
			goto out;
		break;
	}
	case KVM_DEASSIGN_DEV_IRQ: {
		struct kvm_assigned_irq assigned_irq;

		r = -EFAULT;
		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
			goto out;
		r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
		if (r)
			goto out;
		break;
	}
#endif
#endif
#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
	case KVM_DEASSIGN_PCI_DEVICE: {
		struct kvm_assigned_pci_dev assigned_dev;

		r = -EFAULT;
		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
			goto out;
		r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
		if (r)
			goto out;
		break;
	}
#endif
#ifdef KVM_CAP_IRQ_ROUTING
	case KVM_SET_GSI_ROUTING: {
		struct kvm_irq_routing routing;
		struct kvm_irq_routing __user *urouting;
		struct kvm_irq_routing_entry *entries;

		r = -EFAULT;
		if (copy_from_user(&routing, argp, sizeof(routing)))
			goto out;
		r = -EINVAL;
		if (routing.nr >= KVM_MAX_IRQ_ROUTES)
			goto out;
		if (routing.flags)
			goto out;
		r = -ENOMEM;
		entries = vmalloc(routing.nr * sizeof(*entries));
		if (!entries)
			goto out;
		r = -EFAULT;
		urouting = argp;
		if (copy_from_user(entries, urouting->entries,
				   routing.nr * sizeof(*entries)))
			goto out_free_irq_routing;
		r = kvm_set_irq_routing(kvm, entries, routing.nr,
					routing.flags);
out_free_irq_routing:
		vfree(entries);
		break;
	}
#endif /* KVM_CAP_IRQ_ROUTING */
#ifdef __KVM_HAVE_MSIX
	case KVM_ASSIGN_SET_MSIX_NR: {
		struct kvm_assigned_msix_nr entry_nr;

		r = -EFAULT;
		if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
			goto out;
		r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
		if (r)
			goto out;
		break;
	}
	case KVM_ASSIGN_SET_MSIX_ENTRY: {
		struct kvm_assigned_msix_entry entry;

		r = -EFAULT;
		if (copy_from_user(&entry, argp, sizeof entry))
			goto out;
		r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
		if (r)
			goto out;
		break;
	}
#endif
	case KVM_IRQFD: {
		struct kvm_irqfd data;

		r = -EFAULT;
		if (copy_from_user(&data, argp, sizeof data))
			goto out;
		r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
		break;
	}
	case KVM_IOEVENTFD: {
		struct kvm_ioeventfd data;

		r = -EFAULT;
		if (copy_from_user(&data, argp, sizeof data))
			goto out;
		r = kvm_ioeventfd(kvm, &data);
		break;
	}
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
	case KVM_SET_BOOT_CPU_ID:
		r = 0;
		mutex_lock(&kvm->lock);
		if (atomic_read(&kvm->online_vcpus) != 0)
			r = -EBUSY;
		else
			kvm->bsp_vcpu_id = arg;
		mutex_unlock(&kvm->lock);
		break;
#endif
	default:
		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
	}
out:
	return r;
}
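/*
 * Example (illustrative sketch, not kernel code): how userspace typically
 * installs a memory slot through the KVM_SET_USER_MEMORY_REGION ioctl handled
 * above.  vm_fd is assumed to come from KVM_CREATE_VM; the slot number and
 * 1 MiB size are arbitrary.
 *
 *	void *ram = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	struct kvm_userspace_memory_region region = {
 *		.slot            = 0,
 *		.flags           = 0,		// or KVM_MEM_LOG_DIRTY_PAGES
 *		.guest_phys_addr = 0,
 *		.memory_size     = 1 << 20,
 *		.userspace_addr  = (__u64)(unsigned long)ram,
 *	};
 *	if (ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region) < 0)
 *		perror("KVM_SET_USER_MEMORY_REGION");
 */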
static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page[1];
	unsigned long addr;
	int npages;
	gfn_t gfn = vmf->pgoff;
	struct kvm *kvm = vma->vm_file->private_data;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return VM_FAULT_SIGBUS;

	npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
				NULL);
	if (unlikely(npages != 1))
		return VM_FAULT_SIGBUS;

	vmf->page = page[0];
	return 0;
}

static const struct vm_operations_struct kvm_vm_vm_ops = {
	.fault = kvm_vm_fault,
};

static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vm_vm_ops;
	return 0;
}

static struct file_operations kvm_vm_fops = {
	.release        = kvm_vm_release,
	.unlocked_ioctl = kvm_vm_ioctl,
	.compat_ioctl   = kvm_vm_ioctl,
	.mmap           = kvm_vm_mmap,
};

static int kvm_dev_ioctl_create_vm(void)
{
	int fd;
	struct kvm *kvm;

	kvm = kvm_create_vm();
	if (IS_ERR(kvm))
		return PTR_ERR(kvm);
	fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, 0);
	if (fd < 0)
		kvm_put_kvm(kvm);

	return fd;
}

static long kvm_dev_ioctl_check_extension_generic(long arg)
{
	switch (arg) {
	case KVM_CAP_USER_MEMORY:
	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
	case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
	case KVM_CAP_SET_BOOT_CPU_ID:
#endif
		return 1;
#ifdef CONFIG_HAVE_KVM_IRQCHIP
	case KVM_CAP_IRQ_ROUTING:
		return KVM_MAX_IRQ_ROUTES;
#endif
	default:
		break;
	}
	return kvm_dev_ioctl_check_extension(arg);
}

static long kvm_dev_ioctl(struct file *filp,
			  unsigned int ioctl, unsigned long arg)
{
	long r = -EINVAL;

	switch (ioctl) {
	case KVM_GET_API_VERSION:
		r = -EINVAL;
		if (arg)
			goto out;
		r = KVM_API_VERSION;
		break;
	case KVM_CREATE_VM:
		r = -EINVAL;
		if (arg)
			goto out;
		r = kvm_dev_ioctl_create_vm();
		break;
	case KVM_CHECK_EXTENSION:
		r = kvm_dev_ioctl_check_extension_generic(arg);
		break;
	case KVM_GET_VCPU_MMAP_SIZE:
		r = -EINVAL;
		if (arg)
			goto out;
		r = PAGE_SIZE;     /* struct kvm_run */
#ifdef CONFIG_X86
		r += PAGE_SIZE;    /* pio data page */
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
		r += PAGE_SIZE;    /* coalesced mmio ring page */
#endif
		break;
	case KVM_TRACE_ENABLE:
	case KVM_TRACE_PAUSE:
	case KVM_TRACE_DISABLE:
		r = -EOPNOTSUPP;
		break;
	default:
		return kvm_arch_dev_ioctl(filp, ioctl, arg);
	}
out:
	return r;
}

static struct file_operations kvm_chardev_ops = {
	.unlocked_ioctl = kvm_dev_ioctl,
	.compat_ioctl   = kvm_dev_ioctl,
};

static struct miscdevice kvm_dev = {
	KVM_MINOR,
	"kvm",
	&kvm_chardev_ops,
};
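/*
 * Example (illustrative sketch, not kernel code): the usual userspace
 * bring-up sequence against the /dev/kvm character device implemented above.
 * Error handling is abbreviated.
 *
 *	int kvm_fd = open("/dev/kvm", O_RDWR | O_CLOEXEC);
 *	if (ioctl(kvm_fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
 *		errx(1, "kernel/header API version mismatch");
 *
 *	int vm_fd   = ioctl(kvm_fd, KVM_CREATE_VM, 0);
 *	int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);
 *
 *	long mmap_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
 *	struct kvm_run *run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
 *				   MAP_SHARED, vcpu_fd, 0);
 */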
static void hardware_enable(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
		return;
	cpumask_set_cpu(cpu, cpus_hardware_enabled);
	kvm_arch_hardware_enable(NULL);
}

static void hardware_disable(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
		return;
	cpumask_clear_cpu(cpu, cpus_hardware_enabled);
	kvm_arch_hardware_disable(NULL);
}

static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
			   void *v)
{
	int cpu = (long)v;

	val &= ~CPU_TASKS_FROZEN;
	switch (val) {
	case CPU_DYING:
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
		hardware_disable(NULL);
		break;
	case CPU_UP_CANCELED:
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
		smp_call_function_single(cpu, hardware_disable, NULL, 1);
		break;
	case CPU_ONLINE:
		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
		       cpu);
		smp_call_function_single(cpu, hardware_enable, NULL, 1);
		break;
	}
	return NOTIFY_OK;
}


asmlinkage void kvm_handle_fault_on_reboot(void)
{
	if (kvm_rebooting)
		/* spin while reset goes on */
		while (true)
			;
	/* Fault while not rebooting.  We want the trace. */
	BUG();
}
EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot);

static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
		      void *v)
{
	/*
	 * Some BIOSes (at least the author's) hang on reboot if the CPU is
	 * still in VMX root mode.
	 *
	 * Intel TXT also requires VMX to be off on all CPUs when the system
	 * shuts down.
	 */
	printk(KERN_INFO "kvm: exiting hardware virtualization\n");
	kvm_rebooting = true;
	on_each_cpu(hardware_disable, NULL, 1);
	return NOTIFY_OK;
}

static struct notifier_block kvm_reboot_notifier = {
	.notifier_call = kvm_reboot,
	.priority = 0,
};

void kvm_io_bus_init(struct kvm_io_bus *bus)
{
	memset(bus, 0, sizeof(*bus));
}

void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
	int i;

	for (i = 0; i < bus->dev_count; i++) {
		struct kvm_io_device *pos = bus->devs[i];

		kvm_iodevice_destructor(pos);
	}
}

/* kvm_io_bus_write - called under kvm->slots_lock */
int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr,
		     int len, const void *val)
{
	int i;

	for (i = 0; i < bus->dev_count; i++)
		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
			return 0;
	return -EOPNOTSUPP;
}

/* kvm_io_bus_read - called under kvm->slots_lock */
int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len, void *val)
{
	int i;

	for (i = 0; i < bus->dev_count; i++)
		if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
			return 0;
	return -EOPNOTSUPP;
}

int kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus,
			    struct kvm_io_device *dev)
{
	int ret;

	down_write(&kvm->slots_lock);
	ret = __kvm_io_bus_register_dev(bus, dev);
	up_write(&kvm->slots_lock);

	return ret;
}

/* An unlocked version.  The caller must hold slots_lock for writing. */
int __kvm_io_bus_register_dev(struct kvm_io_bus *bus,
			      struct kvm_io_device *dev)
{
	if (bus->dev_count > NR_IOBUS_DEVS - 1)
		return -ENOSPC;

	bus->devs[bus->dev_count++] = dev;

	return 0;
}

void kvm_io_bus_unregister_dev(struct kvm *kvm,
			       struct kvm_io_bus *bus,
			       struct kvm_io_device *dev)
{
	down_write(&kvm->slots_lock);
	__kvm_io_bus_unregister_dev(bus, dev);
	up_write(&kvm->slots_lock);
}

/* An unlocked version.  The caller must hold slots_lock for writing. */
void __kvm_io_bus_unregister_dev(struct kvm_io_bus *bus,
				 struct kvm_io_device *dev)
{
	int i;

	for (i = 0; i < bus->dev_count; i++)
		if (bus->devs[i] == dev) {
			bus->devs[i] = bus->devs[--bus->dev_count];
			break;
		}
}
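/*
 * Example (hedged sketch): what a minimal in-kernel device on this bus could
 * look like.  The kvm_io_device_ops layout and kvm_iodevice_init() helper are
 * assumed to match this tree's iodev.h, and DUMMY_GPA/dummy_ops are made-up
 * names.  The callbacks return 0 when they claim the access and -EOPNOTSUPP
 * otherwise, which is what the dispatch loops in kvm_io_bus_write() and
 * kvm_io_bus_read() above rely on.
 *
 *	struct dummy_dev {
 *		struct kvm_io_device dev;
 *		u32 reg;
 *	};
 *
 *	static int dummy_write(struct kvm_io_device *this, gpa_t addr,
 *			       int len, const void *val)
 *	{
 *		struct dummy_dev *d = container_of(this, struct dummy_dev, dev);
 *
 *		if (addr != DUMMY_GPA || len != 4)
 *			return -EOPNOTSUPP;	// not ours, keep scanning the bus
 *		d->reg = *(const u32 *)val;
 *		return 0;
 *	}
 *
 *	// Fill a struct kvm_io_device_ops with dummy_write (plus a read and
 *	// destructor callback), then hook the device up:
 *	kvm_iodevice_init(&d->dev, &dummy_ops);
 *	kvm_io_bus_register_dev(kvm, &kvm->mmio_bus, &d->dev);
 */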
static struct notifier_block kvm_cpu_notifier = {
	.notifier_call = kvm_cpu_hotplug,
	.priority = 20, /* must be > scheduler priority */
};

static int vm_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;

	*val = 0;
	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		*val += *(u32 *)((void *)kvm + offset);
	spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");

static int vcpu_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	int i;

	*val = 0;
	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		kvm_for_each_vcpu(i, vcpu, kvm)
			*val += *(u32 *)((void *)vcpu + offset);

	spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");

static const struct file_operations *stat_fops[] = {
	[KVM_STAT_VCPU] = &vcpu_stat_fops,
	[KVM_STAT_VM]   = &vm_stat_fops,
};

static void kvm_init_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
	for (p = debugfs_entries; p->name; ++p)
		p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
						(void *)(long)p->offset,
						stat_fops[p->kind]);
}

static void kvm_exit_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	for (p = debugfs_entries; p->name; ++p)
		debugfs_remove(p->dentry);
	debugfs_remove(kvm_debugfs_dir);
}
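/*
 * Example (hedged sketch): the debugfs plumbing above only works because the
 * architecture code provides a debugfs_entries[] table whose .offset is an
 * offsetof() into struct kvm or struct kvm_vcpu and whose .kind selects one
 * of the stat_fops entries.  Roughly, following the pattern used by the x86
 * arch code (the stat names here are illustrative):
 *
 *	#define VM_STAT(x)   offsetof(struct kvm, stat.x),      KVM_STAT_VM
 *	#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 *
 *	struct kvm_stats_debugfs_item debugfs_entries[] = {
 *		{ "pf_fixed",   VCPU_STAT(pf_fixed) },
 *		{ "io_exits",   VCPU_STAT(io_exits) },
 *		{ "mmu_flooded", VM_STAT(mmu_flooded) },
 *		{ NULL }
 *	};
 *
 * vm_stat_get()/vcpu_stat_get() then sum the u32 counter at that offset
 * across every VM (and vcpu) on vm_list, so each debugfs file reports a
 * system-wide total.
 */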
static int kvm_suspend(struct sys_device *dev, pm_message_t state)
{
	hardware_disable(NULL);
	return 0;
}

static int kvm_resume(struct sys_device *dev)
{
	hardware_enable(NULL);
	return 0;
}

static struct sysdev_class kvm_sysdev_class = {
	.name = "kvm",
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

static struct sys_device kvm_sysdev = {
	.id = 0,
	.cls = &kvm_sysdev_class,
};

struct page *bad_page;
pfn_t bad_pfn;

static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
	return container_of(pn, struct kvm_vcpu, preempt_notifier);
}

static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
			  struct task_struct *next)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_put(vcpu);
}

int kvm_init(void *opaque, unsigned int vcpu_size,
	     struct module *module)
{
	int r;
	int cpu;

	kvm_init_debug();

	r = kvm_arch_init(opaque);
	if (r)
		goto out_fail;

	bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (bad_page == NULL) {
		r = -ENOMEM;
		goto out;
	}

	bad_pfn = page_to_pfn(bad_page);

	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
		r = -ENOMEM;
		goto out_free_0;
	}

	r = kvm_arch_hardware_setup();
	if (r < 0)
		goto out_free_0a;

	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu,
				kvm_arch_check_processor_compat,
				&r, 1);
		if (r < 0)
			goto out_free_1;
	}

	on_each_cpu(hardware_enable, NULL, 1);
	r = register_cpu_notifier(&kvm_cpu_notifier);
	if (r)
		goto out_free_2;
	register_reboot_notifier(&kvm_reboot_notifier);

	r = sysdev_class_register(&kvm_sysdev_class);
	if (r)
		goto out_free_3;

	r = sysdev_register(&kvm_sysdev);
	if (r)
		goto out_free_4;

	/* A kmem cache lets us meet the alignment requirements of fx_save. */
	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
					   __alignof__(struct kvm_vcpu),
					   0, NULL);
	if (!kvm_vcpu_cache) {
		r = -ENOMEM;
		goto out_free_5;
	}

	kvm_chardev_ops.owner = module;
	kvm_vm_fops.owner = module;
	kvm_vcpu_fops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
		printk(KERN_ERR "kvm: misc device register failed\n");
		goto out_free;
	}

	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

	return 0;

out_free:
	kmem_cache_destroy(kvm_vcpu_cache);
out_free_5:
	sysdev_unregister(&kvm_sysdev);
out_free_4:
	sysdev_class_unregister(&kvm_sysdev_class);
out_free_3:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_2:
	on_each_cpu(hardware_disable, NULL, 1);
out_free_1:
	kvm_arch_hardware_unsetup();
out_free_0a:
	free_cpumask_var(cpus_hardware_enabled);
out_free_0:
	__free_page(bad_page);
out:
	kvm_arch_exit();
out_fail:
	kvm_exit_debug();
	return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

void kvm_exit(void)
{
	tracepoint_synchronize_unregister();
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	sysdev_unregister(&kvm_sysdev);
	sysdev_class_unregister(&kvm_sysdev_class);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
	on_each_cpu(hardware_disable, NULL, 1);
	kvm_arch_hardware_unsetup();
	kvm_arch_exit();
	kvm_exit_debug();
	free_cpumask_var(cpus_hardware_enabled);
	__free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);
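/*
 * Example (hedged sketch): how an architecture module is expected to use the
 * two entry points above.  The names vmx_init/vmx_x86_ops/vcpu_vmx mirror the
 * kvm-intel module, but the exact arch glue lives outside this file.
 *
 *	static int __init vmx_init(void)
 *	{
 *		// opaque arch ops, per-vcpu allocation size, module owner
 *		return kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
 *				THIS_MODULE);
 *	}
 *
 *	static void __exit vmx_exit(void)
 *	{
 *		kvm_exit();
 *	}
 *
 *	module_init(vmx_init);
 *	module_exit(vmx_exit);
 */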