1 /* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * This module enables machines with Intel VT-x extensions to run virtual 5 * machines without emulation or binary translation. 6 * 7 * Copyright (C) 2006 Qumranet, Inc. 8 * 9 * Authors: 10 * Avi Kivity <avi@qumranet.com> 11 * Yaniv Kamay <yaniv@qumranet.com> 12 * 13 * This work is licensed under the terms of the GNU GPL, version 2. See 14 * the COPYING file in the top-level directory. 15 * 16 */ 17 18 #include "iodev.h" 19 20 #include <linux/kvm_host.h> 21 #include <linux/kvm.h> 22 #include <linux/module.h> 23 #include <linux/errno.h> 24 #include <linux/percpu.h> 25 #include <linux/gfp.h> 26 #include <linux/mm.h> 27 #include <linux/miscdevice.h> 28 #include <linux/vmalloc.h> 29 #include <linux/reboot.h> 30 #include <linux/debugfs.h> 31 #include <linux/highmem.h> 32 #include <linux/file.h> 33 #include <linux/sysdev.h> 34 #include <linux/cpu.h> 35 #include <linux/sched.h> 36 #include <linux/cpumask.h> 37 #include <linux/smp.h> 38 #include <linux/anon_inodes.h> 39 #include <linux/profile.h> 40 #include <linux/kvm_para.h> 41 #include <linux/pagemap.h> 42 #include <linux/mman.h> 43 #include <linux/swap.h> 44 #include <linux/bitops.h> 45 #include <linux/spinlock.h> 46 47 #include <asm/processor.h> 48 #include <asm/io.h> 49 #include <asm/uaccess.h> 50 #include <asm/pgtable.h> 51 52 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 53 #include "coalesced_mmio.h" 54 #endif 55 56 #ifdef KVM_CAP_DEVICE_ASSIGNMENT 57 #include <linux/pci.h> 58 #include <linux/interrupt.h> 59 #include "irq.h" 60 #endif 61 62 #define CREATE_TRACE_POINTS 63 #include <trace/events/kvm.h> 64 65 MODULE_AUTHOR("Qumranet"); 66 MODULE_LICENSE("GPL"); 67 68 /* 69 * Ordering of locks: 70 * 71 * kvm->slots_lock --> kvm->lock --> kvm->irq_lock 72 */ 73 74 DEFINE_SPINLOCK(kvm_lock); 75 LIST_HEAD(vm_list); 76 77 static cpumask_var_t cpus_hardware_enabled; 78 79 struct kmem_cache *kvm_vcpu_cache; 80 EXPORT_SYMBOL_GPL(kvm_vcpu_cache); 81 82 static __read_mostly struct preempt_ops kvm_preempt_ops; 83 84 struct dentry *kvm_debugfs_dir; 85 86 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 87 unsigned long arg); 88 89 static bool kvm_rebooting; 90 91 static bool largepages_enabled = true; 92 93 #ifdef KVM_CAP_DEVICE_ASSIGNMENT 94 static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, 95 int assigned_dev_id) 96 { 97 struct list_head *ptr; 98 struct kvm_assigned_dev_kernel *match; 99 100 list_for_each(ptr, head) { 101 match = list_entry(ptr, struct kvm_assigned_dev_kernel, list); 102 if (match->assigned_dev_id == assigned_dev_id) 103 return match; 104 } 105 return NULL; 106 } 107 108 static int find_index_from_host_irq(struct kvm_assigned_dev_kernel 109 *assigned_dev, int irq) 110 { 111 int i, index; 112 struct msix_entry *host_msix_entries; 113 114 host_msix_entries = assigned_dev->host_msix_entries; 115 116 index = -1; 117 for (i = 0; i < assigned_dev->entries_nr; i++) 118 if (irq == host_msix_entries[i].vector) { 119 index = i; 120 break; 121 } 122 if (index < 0) { 123 printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); 124 return 0; 125 } 126 127 return index; 128 } 129 130 static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) 131 { 132 struct kvm_assigned_dev_kernel *assigned_dev; 133 struct kvm *kvm; 134 int i; 135 136 assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, 137 interrupt_work); 138 kvm = assigned_dev->kvm; 139 140 mutex_lock(&kvm->irq_lock); 141 
spin_lock_irq(&assigned_dev->assigned_dev_lock); 142 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { 143 struct kvm_guest_msix_entry *guest_entries = 144 assigned_dev->guest_msix_entries; 145 for (i = 0; i < assigned_dev->entries_nr; i++) { 146 if (!(guest_entries[i].flags & 147 KVM_ASSIGNED_MSIX_PENDING)) 148 continue; 149 guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING; 150 kvm_set_irq(assigned_dev->kvm, 151 assigned_dev->irq_source_id, 152 guest_entries[i].vector, 1); 153 } 154 } else 155 kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, 156 assigned_dev->guest_irq, 1); 157 158 spin_unlock_irq(&assigned_dev->assigned_dev_lock); 159 mutex_unlock(&assigned_dev->kvm->irq_lock); 160 } 161 162 static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) 163 { 164 unsigned long flags; 165 struct kvm_assigned_dev_kernel *assigned_dev = 166 (struct kvm_assigned_dev_kernel *) dev_id; 167 168 spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags); 169 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { 170 int index = find_index_from_host_irq(assigned_dev, irq); 171 if (index < 0) 172 goto out; 173 assigned_dev->guest_msix_entries[index].flags |= 174 KVM_ASSIGNED_MSIX_PENDING; 175 } 176 177 schedule_work(&assigned_dev->interrupt_work); 178 179 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { 180 disable_irq_nosync(irq); 181 assigned_dev->host_irq_disabled = true; 182 } 183 184 out: 185 spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags); 186 return IRQ_HANDLED; 187 } 188 189 /* Ack the irq line for an assigned device */ 190 static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) 191 { 192 struct kvm_assigned_dev_kernel *dev; 193 unsigned long flags; 194 195 if (kian->gsi == -1) 196 return; 197 198 dev = container_of(kian, struct kvm_assigned_dev_kernel, 199 ack_notifier); 200 201 kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0); 202 203 /* The guest irq may be shared so this ack may be 204 * from another device. 205 */ 206 spin_lock_irqsave(&dev->assigned_dev_lock, flags); 207 if (dev->host_irq_disabled) { 208 enable_irq(dev->host_irq); 209 dev->host_irq_disabled = false; 210 } 211 spin_unlock_irqrestore(&dev->assigned_dev_lock, flags); 212 } 213 214 static void deassign_guest_irq(struct kvm *kvm, 215 struct kvm_assigned_dev_kernel *assigned_dev) 216 { 217 kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); 218 assigned_dev->ack_notifier.gsi = -1; 219 220 if (assigned_dev->irq_source_id != -1) 221 kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); 222 assigned_dev->irq_source_id = -1; 223 assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK); 224 } 225 226 /* The function implicit hold kvm->lock mutex due to cancel_work_sync() */ 227 static void deassign_host_irq(struct kvm *kvm, 228 struct kvm_assigned_dev_kernel *assigned_dev) 229 { 230 /* 231 * In kvm_free_device_irq, cancel_work_sync return true if: 232 * 1. work is scheduled, and then cancelled. 233 * 2. work callback is executed. 234 * 235 * The first one ensured that the irq is disabled and no more events 236 * would happen. But for the second one, the irq may be enabled (e.g. 237 * for MSI). So we disable irq here to prevent further events. 238 * 239 * Notice this maybe result in nested disable if the interrupt type is 240 * INTx, but it's OK for we are going to free it. 
241 * 242 * If this function is a part of VM destroy, please ensure that till 243 * now, the kvm state is still legal for probably we also have to wait 244 * interrupt_work done. 245 */ 246 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { 247 int i; 248 for (i = 0; i < assigned_dev->entries_nr; i++) 249 disable_irq_nosync(assigned_dev-> 250 host_msix_entries[i].vector); 251 252 cancel_work_sync(&assigned_dev->interrupt_work); 253 254 for (i = 0; i < assigned_dev->entries_nr; i++) 255 free_irq(assigned_dev->host_msix_entries[i].vector, 256 (void *)assigned_dev); 257 258 assigned_dev->entries_nr = 0; 259 kfree(assigned_dev->host_msix_entries); 260 kfree(assigned_dev->guest_msix_entries); 261 pci_disable_msix(assigned_dev->dev); 262 } else { 263 /* Deal with MSI and INTx */ 264 disable_irq_nosync(assigned_dev->host_irq); 265 cancel_work_sync(&assigned_dev->interrupt_work); 266 267 free_irq(assigned_dev->host_irq, (void *)assigned_dev); 268 269 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) 270 pci_disable_msi(assigned_dev->dev); 271 } 272 273 assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK); 274 } 275 276 static int kvm_deassign_irq(struct kvm *kvm, 277 struct kvm_assigned_dev_kernel *assigned_dev, 278 unsigned long irq_requested_type) 279 { 280 unsigned long guest_irq_type, host_irq_type; 281 282 if (!irqchip_in_kernel(kvm)) 283 return -EINVAL; 284 /* no irq assignment to deassign */ 285 if (!assigned_dev->irq_requested_type) 286 return -ENXIO; 287 288 host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK; 289 guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK; 290 291 if (host_irq_type) 292 deassign_host_irq(kvm, assigned_dev); 293 if (guest_irq_type) 294 deassign_guest_irq(kvm, assigned_dev); 295 296 return 0; 297 } 298 299 static void kvm_free_assigned_irq(struct kvm *kvm, 300 struct kvm_assigned_dev_kernel *assigned_dev) 301 { 302 kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); 303 } 304 305 static void kvm_free_assigned_device(struct kvm *kvm, 306 struct kvm_assigned_dev_kernel 307 *assigned_dev) 308 { 309 kvm_free_assigned_irq(kvm, assigned_dev); 310 311 pci_reset_function(assigned_dev->dev); 312 313 pci_release_regions(assigned_dev->dev); 314 pci_disable_device(assigned_dev->dev); 315 pci_dev_put(assigned_dev->dev); 316 317 list_del(&assigned_dev->list); 318 kfree(assigned_dev); 319 } 320 321 void kvm_free_all_assigned_devices(struct kvm *kvm) 322 { 323 struct list_head *ptr, *ptr2; 324 struct kvm_assigned_dev_kernel *assigned_dev; 325 326 list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { 327 assigned_dev = list_entry(ptr, 328 struct kvm_assigned_dev_kernel, 329 list); 330 331 kvm_free_assigned_device(kvm, assigned_dev); 332 } 333 } 334 335 static int assigned_device_enable_host_intx(struct kvm *kvm, 336 struct kvm_assigned_dev_kernel *dev) 337 { 338 dev->host_irq = dev->dev->irq; 339 /* Even though this is PCI, we don't want to use shared 340 * interrupts. Sharing host devices with guest-assigned devices 341 * on the same interrupt line is not a happy situation: there 342 * are going to be long delays in accepting, acking, etc. 
343 */ 344 if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 345 0, "kvm_assigned_intx_device", (void *)dev)) 346 return -EIO; 347 return 0; 348 } 349 350 #ifdef __KVM_HAVE_MSI 351 static int assigned_device_enable_host_msi(struct kvm *kvm, 352 struct kvm_assigned_dev_kernel *dev) 353 { 354 int r; 355 356 if (!dev->dev->msi_enabled) { 357 r = pci_enable_msi(dev->dev); 358 if (r) 359 return r; 360 } 361 362 dev->host_irq = dev->dev->irq; 363 if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0, 364 "kvm_assigned_msi_device", (void *)dev)) { 365 pci_disable_msi(dev->dev); 366 return -EIO; 367 } 368 369 return 0; 370 } 371 #endif 372 373 #ifdef __KVM_HAVE_MSIX 374 static int assigned_device_enable_host_msix(struct kvm *kvm, 375 struct kvm_assigned_dev_kernel *dev) 376 { 377 int i, r = -EINVAL; 378 379 /* host_msix_entries and guest_msix_entries should have been 380 * initialized */ 381 if (dev->entries_nr == 0) 382 return r; 383 384 r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr); 385 if (r) 386 return r; 387 388 for (i = 0; i < dev->entries_nr; i++) { 389 r = request_irq(dev->host_msix_entries[i].vector, 390 kvm_assigned_dev_intr, 0, 391 "kvm_assigned_msix_device", 392 (void *)dev); 393 /* FIXME: free requested_irq's on failure */ 394 if (r) 395 return r; 396 } 397 398 return 0; 399 } 400 401 #endif 402 403 static int assigned_device_enable_guest_intx(struct kvm *kvm, 404 struct kvm_assigned_dev_kernel *dev, 405 struct kvm_assigned_irq *irq) 406 { 407 dev->guest_irq = irq->guest_irq; 408 dev->ack_notifier.gsi = irq->guest_irq; 409 return 0; 410 } 411 412 #ifdef __KVM_HAVE_MSI 413 static int assigned_device_enable_guest_msi(struct kvm *kvm, 414 struct kvm_assigned_dev_kernel *dev, 415 struct kvm_assigned_irq *irq) 416 { 417 dev->guest_irq = irq->guest_irq; 418 dev->ack_notifier.gsi = -1; 419 dev->host_irq_disabled = false; 420 return 0; 421 } 422 #endif 423 #ifdef __KVM_HAVE_MSIX 424 static int assigned_device_enable_guest_msix(struct kvm *kvm, 425 struct kvm_assigned_dev_kernel *dev, 426 struct kvm_assigned_irq *irq) 427 { 428 dev->guest_irq = irq->guest_irq; 429 dev->ack_notifier.gsi = -1; 430 dev->host_irq_disabled = false; 431 return 0; 432 } 433 #endif 434 435 static int assign_host_irq(struct kvm *kvm, 436 struct kvm_assigned_dev_kernel *dev, 437 __u32 host_irq_type) 438 { 439 int r = -EEXIST; 440 441 if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) 442 return r; 443 444 switch (host_irq_type) { 445 case KVM_DEV_IRQ_HOST_INTX: 446 r = assigned_device_enable_host_intx(kvm, dev); 447 break; 448 #ifdef __KVM_HAVE_MSI 449 case KVM_DEV_IRQ_HOST_MSI: 450 r = assigned_device_enable_host_msi(kvm, dev); 451 break; 452 #endif 453 #ifdef __KVM_HAVE_MSIX 454 case KVM_DEV_IRQ_HOST_MSIX: 455 r = assigned_device_enable_host_msix(kvm, dev); 456 break; 457 #endif 458 default: 459 r = -EINVAL; 460 } 461 462 if (!r) 463 dev->irq_requested_type |= host_irq_type; 464 465 return r; 466 } 467 468 static int assign_guest_irq(struct kvm *kvm, 469 struct kvm_assigned_dev_kernel *dev, 470 struct kvm_assigned_irq *irq, 471 unsigned long guest_irq_type) 472 { 473 int id; 474 int r = -EEXIST; 475 476 if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK) 477 return r; 478 479 id = kvm_request_irq_source_id(kvm); 480 if (id < 0) 481 return id; 482 483 dev->irq_source_id = id; 484 485 switch (guest_irq_type) { 486 case KVM_DEV_IRQ_GUEST_INTX: 487 r = assigned_device_enable_guest_intx(kvm, dev, irq); 488 break; 489 #ifdef __KVM_HAVE_MSI 490 case KVM_DEV_IRQ_GUEST_MSI: 491 r = 
assigned_device_enable_guest_msi(kvm, dev, irq); 492 break; 493 #endif 494 #ifdef __KVM_HAVE_MSIX 495 case KVM_DEV_IRQ_GUEST_MSIX: 496 r = assigned_device_enable_guest_msix(kvm, dev, irq); 497 break; 498 #endif 499 default: 500 r = -EINVAL; 501 } 502 503 if (!r) { 504 dev->irq_requested_type |= guest_irq_type; 505 kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); 506 } else 507 kvm_free_irq_source_id(kvm, dev->irq_source_id); 508 509 return r; 510 } 511 512 /* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ 513 static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, 514 struct kvm_assigned_irq *assigned_irq) 515 { 516 int r = -EINVAL; 517 struct kvm_assigned_dev_kernel *match; 518 unsigned long host_irq_type, guest_irq_type; 519 520 if (!capable(CAP_SYS_RAWIO)) 521 return -EPERM; 522 523 if (!irqchip_in_kernel(kvm)) 524 return r; 525 526 mutex_lock(&kvm->lock); 527 r = -ENODEV; 528 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 529 assigned_irq->assigned_dev_id); 530 if (!match) 531 goto out; 532 533 host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK); 534 guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK); 535 536 r = -EINVAL; 537 /* can only assign one type at a time */ 538 if (hweight_long(host_irq_type) > 1) 539 goto out; 540 if (hweight_long(guest_irq_type) > 1) 541 goto out; 542 if (host_irq_type == 0 && guest_irq_type == 0) 543 goto out; 544 545 r = 0; 546 if (host_irq_type) 547 r = assign_host_irq(kvm, match, host_irq_type); 548 if (r) 549 goto out; 550 551 if (guest_irq_type) 552 r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type); 553 out: 554 mutex_unlock(&kvm->lock); 555 return r; 556 } 557 558 static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, 559 struct kvm_assigned_irq 560 *assigned_irq) 561 { 562 int r = -ENODEV; 563 struct kvm_assigned_dev_kernel *match; 564 565 mutex_lock(&kvm->lock); 566 567 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 568 assigned_irq->assigned_dev_id); 569 if (!match) 570 goto out; 571 572 r = kvm_deassign_irq(kvm, match, assigned_irq->flags); 573 out: 574 mutex_unlock(&kvm->lock); 575 return r; 576 } 577 578 static int kvm_vm_ioctl_assign_device(struct kvm *kvm, 579 struct kvm_assigned_pci_dev *assigned_dev) 580 { 581 int r = 0; 582 struct kvm_assigned_dev_kernel *match; 583 struct pci_dev *dev; 584 585 down_read(&kvm->slots_lock); 586 mutex_lock(&kvm->lock); 587 588 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 589 assigned_dev->assigned_dev_id); 590 if (match) { 591 /* device already assigned */ 592 r = -EEXIST; 593 goto out; 594 } 595 596 match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); 597 if (match == NULL) { 598 printk(KERN_INFO "%s: Couldn't allocate memory\n", 599 __func__); 600 r = -ENOMEM; 601 goto out; 602 } 603 dev = pci_get_bus_and_slot(assigned_dev->busnr, 604 assigned_dev->devfn); 605 if (!dev) { 606 printk(KERN_INFO "%s: host device not found\n", __func__); 607 r = -EINVAL; 608 goto out_free; 609 } 610 if (pci_enable_device(dev)) { 611 printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); 612 r = -EBUSY; 613 goto out_put; 614 } 615 r = pci_request_regions(dev, "kvm_assigned_device"); 616 if (r) { 617 printk(KERN_INFO "%s: Could not get access to device regions\n", 618 __func__); 619 goto out_disable; 620 } 621 622 pci_reset_function(dev); 623 624 match->assigned_dev_id = assigned_dev->assigned_dev_id; 625 match->host_busnr = assigned_dev->busnr; 626 match->host_devfn = assigned_dev->devfn; 627 match->flags = 
assigned_dev->flags; 628 match->dev = dev; 629 spin_lock_init(&match->assigned_dev_lock); 630 match->irq_source_id = -1; 631 match->kvm = kvm; 632 match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; 633 INIT_WORK(&match->interrupt_work, 634 kvm_assigned_dev_interrupt_work_handler); 635 636 list_add(&match->list, &kvm->arch.assigned_dev_head); 637 638 if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { 639 if (!kvm->arch.iommu_domain) { 640 r = kvm_iommu_map_guest(kvm); 641 if (r) 642 goto out_list_del; 643 } 644 r = kvm_assign_device(kvm, match); 645 if (r) 646 goto out_list_del; 647 } 648 649 out: 650 mutex_unlock(&kvm->lock); 651 up_read(&kvm->slots_lock); 652 return r; 653 out_list_del: 654 list_del(&match->list); 655 pci_release_regions(dev); 656 out_disable: 657 pci_disable_device(dev); 658 out_put: 659 pci_dev_put(dev); 660 out_free: 661 kfree(match); 662 mutex_unlock(&kvm->lock); 663 up_read(&kvm->slots_lock); 664 return r; 665 } 666 #endif 667 668 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT 669 static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, 670 struct kvm_assigned_pci_dev *assigned_dev) 671 { 672 int r = 0; 673 struct kvm_assigned_dev_kernel *match; 674 675 mutex_lock(&kvm->lock); 676 677 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 678 assigned_dev->assigned_dev_id); 679 if (!match) { 680 printk(KERN_INFO "%s: device hasn't been assigned before, " 681 "so cannot be deassigned\n", __func__); 682 r = -EINVAL; 683 goto out; 684 } 685 686 if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) 687 kvm_deassign_device(kvm, match); 688 689 kvm_free_assigned_device(kvm, match); 690 691 out: 692 mutex_unlock(&kvm->lock); 693 return r; 694 } 695 #endif 696 697 inline int kvm_is_mmio_pfn(pfn_t pfn) 698 { 699 if (pfn_valid(pfn)) { 700 struct page *page = compound_head(pfn_to_page(pfn)); 701 return PageReserved(page); 702 } 703 704 return true; 705 } 706 707 /* 708 * Switches to specified vcpu, until a matching vcpu_put() 709 */ 710 void vcpu_load(struct kvm_vcpu *vcpu) 711 { 712 int cpu; 713 714 mutex_lock(&vcpu->mutex); 715 cpu = get_cpu(); 716 preempt_notifier_register(&vcpu->preempt_notifier); 717 kvm_arch_vcpu_load(vcpu, cpu); 718 put_cpu(); 719 } 720 721 void vcpu_put(struct kvm_vcpu *vcpu) 722 { 723 preempt_disable(); 724 kvm_arch_vcpu_put(vcpu); 725 preempt_notifier_unregister(&vcpu->preempt_notifier); 726 preempt_enable(); 727 mutex_unlock(&vcpu->mutex); 728 } 729 730 static void ack_flush(void *_completed) 731 { 732 } 733 734 static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) 735 { 736 int i, cpu, me; 737 cpumask_var_t cpus; 738 bool called = true; 739 struct kvm_vcpu *vcpu; 740 741 if (alloc_cpumask_var(&cpus, GFP_ATOMIC)) 742 cpumask_clear(cpus); 743 744 spin_lock(&kvm->requests_lock); 745 me = smp_processor_id(); 746 kvm_for_each_vcpu(i, vcpu, kvm) { 747 if (test_and_set_bit(req, &vcpu->requests)) 748 continue; 749 cpu = vcpu->cpu; 750 if (cpus != NULL && cpu != -1 && cpu != me) 751 cpumask_set_cpu(cpu, cpus); 752 } 753 if (unlikely(cpus == NULL)) 754 smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1); 755 else if (!cpumask_empty(cpus)) 756 smp_call_function_many(cpus, ack_flush, NULL, 1); 757 else 758 called = false; 759 spin_unlock(&kvm->requests_lock); 760 free_cpumask_var(cpus); 761 return called; 762 } 763 764 void kvm_flush_remote_tlbs(struct kvm *kvm) 765 { 766 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 767 ++kvm->stat.remote_tlb_flush; 768 } 769 770 void kvm_reload_remote_mmus(struct kvm *kvm) 771 { 772 
make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 773 } 774 775 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 776 { 777 struct page *page; 778 int r; 779 780 mutex_init(&vcpu->mutex); 781 vcpu->cpu = -1; 782 vcpu->kvm = kvm; 783 vcpu->vcpu_id = id; 784 init_waitqueue_head(&vcpu->wq); 785 786 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 787 if (!page) { 788 r = -ENOMEM; 789 goto fail; 790 } 791 vcpu->run = page_address(page); 792 793 r = kvm_arch_vcpu_init(vcpu); 794 if (r < 0) 795 goto fail_free_run; 796 return 0; 797 798 fail_free_run: 799 free_page((unsigned long)vcpu->run); 800 fail: 801 return r; 802 } 803 EXPORT_SYMBOL_GPL(kvm_vcpu_init); 804 805 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) 806 { 807 kvm_arch_vcpu_uninit(vcpu); 808 free_page((unsigned long)vcpu->run); 809 } 810 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); 811 812 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 813 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) 814 { 815 return container_of(mn, struct kvm, mmu_notifier); 816 } 817 818 static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, 819 struct mm_struct *mm, 820 unsigned long address) 821 { 822 struct kvm *kvm = mmu_notifier_to_kvm(mn); 823 int need_tlb_flush; 824 825 /* 826 * When ->invalidate_page runs, the linux pte has been zapped 827 * already but the page is still allocated until 828 * ->invalidate_page returns. So if we increase the sequence 829 * here the kvm page fault will notice if the spte can't be 830 * established because the page is going to be freed. If 831 * instead the kvm page fault establishes the spte before 832 * ->invalidate_page runs, kvm_unmap_hva will release it 833 * before returning. 834 * 835 * The sequence increase only need to be seen at spin_unlock 836 * time, and not at spin_lock time. 837 * 838 * Increasing the sequence after the spin_unlock would be 839 * unsafe because the kvm page fault could then establish the 840 * pte after kvm_unmap_hva returned, without noticing the page 841 * is going to be freed. 842 */ 843 spin_lock(&kvm->mmu_lock); 844 kvm->mmu_notifier_seq++; 845 need_tlb_flush = kvm_unmap_hva(kvm, address); 846 spin_unlock(&kvm->mmu_lock); 847 848 /* we've to flush the tlb before the pages can be freed */ 849 if (need_tlb_flush) 850 kvm_flush_remote_tlbs(kvm); 851 852 } 853 854 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, 855 struct mm_struct *mm, 856 unsigned long start, 857 unsigned long end) 858 { 859 struct kvm *kvm = mmu_notifier_to_kvm(mn); 860 int need_tlb_flush = 0; 861 862 spin_lock(&kvm->mmu_lock); 863 /* 864 * The count increase must become visible at unlock time as no 865 * spte can be established without taking the mmu_lock and 866 * count is also read inside the mmu_lock critical section. 867 */ 868 kvm->mmu_notifier_count++; 869 for (; start < end; start += PAGE_SIZE) 870 need_tlb_flush |= kvm_unmap_hva(kvm, start); 871 spin_unlock(&kvm->mmu_lock); 872 873 /* we've to flush the tlb before the pages can be freed */ 874 if (need_tlb_flush) 875 kvm_flush_remote_tlbs(kvm); 876 } 877 878 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 879 struct mm_struct *mm, 880 unsigned long start, 881 unsigned long end) 882 { 883 struct kvm *kvm = mmu_notifier_to_kvm(mn); 884 885 spin_lock(&kvm->mmu_lock); 886 /* 887 * This sequence increase will notify the kvm page fault that 888 * the page that is going to be mapped in the spte could have 889 * been freed. 
890 */ 891 kvm->mmu_notifier_seq++; 892 /* 893 * The above sequence increase must be visible before the 894 * below count decrease but both values are read by the kvm 895 * page fault under mmu_lock spinlock so we don't need to add 896 * a smb_wmb() here in between the two. 897 */ 898 kvm->mmu_notifier_count--; 899 spin_unlock(&kvm->mmu_lock); 900 901 BUG_ON(kvm->mmu_notifier_count < 0); 902 } 903 904 static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, 905 struct mm_struct *mm, 906 unsigned long address) 907 { 908 struct kvm *kvm = mmu_notifier_to_kvm(mn); 909 int young; 910 911 spin_lock(&kvm->mmu_lock); 912 young = kvm_age_hva(kvm, address); 913 spin_unlock(&kvm->mmu_lock); 914 915 if (young) 916 kvm_flush_remote_tlbs(kvm); 917 918 return young; 919 } 920 921 static void kvm_mmu_notifier_release(struct mmu_notifier *mn, 922 struct mm_struct *mm) 923 { 924 struct kvm *kvm = mmu_notifier_to_kvm(mn); 925 kvm_arch_flush_shadow(kvm); 926 } 927 928 static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { 929 .invalidate_page = kvm_mmu_notifier_invalidate_page, 930 .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, 931 .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, 932 .clear_flush_young = kvm_mmu_notifier_clear_flush_young, 933 .release = kvm_mmu_notifier_release, 934 }; 935 #endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */ 936 937 static struct kvm *kvm_create_vm(void) 938 { 939 struct kvm *kvm = kvm_arch_create_vm(); 940 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 941 struct page *page; 942 #endif 943 944 if (IS_ERR(kvm)) 945 goto out; 946 #ifdef CONFIG_HAVE_KVM_IRQCHIP 947 INIT_LIST_HEAD(&kvm->irq_routing); 948 INIT_HLIST_HEAD(&kvm->mask_notifier_list); 949 #endif 950 951 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 952 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 953 if (!page) { 954 kfree(kvm); 955 return ERR_PTR(-ENOMEM); 956 } 957 kvm->coalesced_mmio_ring = 958 (struct kvm_coalesced_mmio_ring *)page_address(page); 959 #endif 960 961 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 962 { 963 int err; 964 kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops; 965 err = mmu_notifier_register(&kvm->mmu_notifier, current->mm); 966 if (err) { 967 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 968 put_page(page); 969 #endif 970 kfree(kvm); 971 return ERR_PTR(err); 972 } 973 } 974 #endif 975 976 kvm->mm = current->mm; 977 atomic_inc(&kvm->mm->mm_count); 978 spin_lock_init(&kvm->mmu_lock); 979 spin_lock_init(&kvm->requests_lock); 980 kvm_io_bus_init(&kvm->pio_bus); 981 kvm_eventfd_init(kvm); 982 mutex_init(&kvm->lock); 983 mutex_init(&kvm->irq_lock); 984 kvm_io_bus_init(&kvm->mmio_bus); 985 init_rwsem(&kvm->slots_lock); 986 atomic_set(&kvm->users_count, 1); 987 spin_lock(&kvm_lock); 988 list_add(&kvm->vm_list, &vm_list); 989 spin_unlock(&kvm_lock); 990 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 991 kvm_coalesced_mmio_init(kvm); 992 #endif 993 out: 994 return kvm; 995 } 996 997 /* 998 * Free any memory in @free but not in @dont. 
999 */ 1000 static void kvm_free_physmem_slot(struct kvm_memory_slot *free, 1001 struct kvm_memory_slot *dont) 1002 { 1003 int i; 1004 1005 if (!dont || free->rmap != dont->rmap) 1006 vfree(free->rmap); 1007 1008 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 1009 vfree(free->dirty_bitmap); 1010 1011 1012 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 1013 if (!dont || free->lpage_info[i] != dont->lpage_info[i]) { 1014 vfree(free->lpage_info[i]); 1015 free->lpage_info[i] = NULL; 1016 } 1017 } 1018 1019 free->npages = 0; 1020 free->dirty_bitmap = NULL; 1021 free->rmap = NULL; 1022 } 1023 1024 void kvm_free_physmem(struct kvm *kvm) 1025 { 1026 int i; 1027 1028 for (i = 0; i < kvm->nmemslots; ++i) 1029 kvm_free_physmem_slot(&kvm->memslots[i], NULL); 1030 } 1031 1032 static void kvm_destroy_vm(struct kvm *kvm) 1033 { 1034 struct mm_struct *mm = kvm->mm; 1035 1036 kvm_arch_sync_events(kvm); 1037 spin_lock(&kvm_lock); 1038 list_del(&kvm->vm_list); 1039 spin_unlock(&kvm_lock); 1040 kvm_free_irq_routing(kvm); 1041 kvm_io_bus_destroy(&kvm->pio_bus); 1042 kvm_io_bus_destroy(&kvm->mmio_bus); 1043 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1044 if (kvm->coalesced_mmio_ring != NULL) 1045 free_page((unsigned long)kvm->coalesced_mmio_ring); 1046 #endif 1047 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 1048 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 1049 #else 1050 kvm_arch_flush_shadow(kvm); 1051 #endif 1052 kvm_arch_destroy_vm(kvm); 1053 mmdrop(mm); 1054 } 1055 1056 void kvm_get_kvm(struct kvm *kvm) 1057 { 1058 atomic_inc(&kvm->users_count); 1059 } 1060 EXPORT_SYMBOL_GPL(kvm_get_kvm); 1061 1062 void kvm_put_kvm(struct kvm *kvm) 1063 { 1064 if (atomic_dec_and_test(&kvm->users_count)) 1065 kvm_destroy_vm(kvm); 1066 } 1067 EXPORT_SYMBOL_GPL(kvm_put_kvm); 1068 1069 1070 static int kvm_vm_release(struct inode *inode, struct file *filp) 1071 { 1072 struct kvm *kvm = filp->private_data; 1073 1074 kvm_irqfd_release(kvm); 1075 1076 kvm_put_kvm(kvm); 1077 return 0; 1078 } 1079 1080 /* 1081 * Allocate some memory and give it an address in the guest physical address 1082 * space. 1083 * 1084 * Discontiguous memory is allowed, mostly for framebuffers. 1085 * 1086 * Must be called holding mmap_sem for write. 1087 */ 1088 int __kvm_set_memory_region(struct kvm *kvm, 1089 struct kvm_userspace_memory_region *mem, 1090 int user_alloc) 1091 { 1092 int r; 1093 gfn_t base_gfn; 1094 unsigned long npages; 1095 unsigned long i; 1096 struct kvm_memory_slot *memslot; 1097 struct kvm_memory_slot old, new; 1098 1099 r = -EINVAL; 1100 /* General sanity checks */ 1101 if (mem->memory_size & (PAGE_SIZE - 1)) 1102 goto out; 1103 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 1104 goto out; 1105 if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1))) 1106 goto out; 1107 if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) 1108 goto out; 1109 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 1110 goto out; 1111 1112 memslot = &kvm->memslots[mem->slot]; 1113 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 1114 npages = mem->memory_size >> PAGE_SHIFT; 1115 1116 if (!npages) 1117 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; 1118 1119 new = old = *memslot; 1120 1121 new.base_gfn = base_gfn; 1122 new.npages = npages; 1123 new.flags = mem->flags; 1124 1125 /* Disallow changing a memory slot's size. 
*/ 1126 r = -EINVAL; 1127 if (npages && old.npages && npages != old.npages) 1128 goto out_free; 1129 1130 /* Check for overlaps */ 1131 r = -EEXIST; 1132 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 1133 struct kvm_memory_slot *s = &kvm->memslots[i]; 1134 1135 if (s == memslot || !s->npages) 1136 continue; 1137 if (!((base_gfn + npages <= s->base_gfn) || 1138 (base_gfn >= s->base_gfn + s->npages))) 1139 goto out_free; 1140 } 1141 1142 /* Free page dirty bitmap if unneeded */ 1143 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 1144 new.dirty_bitmap = NULL; 1145 1146 r = -ENOMEM; 1147 1148 /* Allocate if a slot is being created */ 1149 #ifndef CONFIG_S390 1150 if (npages && !new.rmap) { 1151 new.rmap = vmalloc(npages * sizeof(struct page *)); 1152 1153 if (!new.rmap) 1154 goto out_free; 1155 1156 memset(new.rmap, 0, npages * sizeof(*new.rmap)); 1157 1158 new.user_alloc = user_alloc; 1159 /* 1160 * hva_to_rmmap() serialzies with the mmu_lock and to be 1161 * safe it has to ignore memslots with !user_alloc && 1162 * !userspace_addr. 1163 */ 1164 if (user_alloc) 1165 new.userspace_addr = mem->userspace_addr; 1166 else 1167 new.userspace_addr = 0; 1168 } 1169 if (!npages) 1170 goto skip_lpage; 1171 1172 for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { 1173 unsigned long ugfn; 1174 unsigned long j; 1175 int lpages; 1176 int level = i + 2; 1177 1178 /* Avoid unused variable warning if no large pages */ 1179 (void)level; 1180 1181 if (new.lpage_info[i]) 1182 continue; 1183 1184 lpages = 1 + (base_gfn + npages - 1) / 1185 KVM_PAGES_PER_HPAGE(level); 1186 lpages -= base_gfn / KVM_PAGES_PER_HPAGE(level); 1187 1188 new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i])); 1189 1190 if (!new.lpage_info[i]) 1191 goto out_free; 1192 1193 memset(new.lpage_info[i], 0, 1194 lpages * sizeof(*new.lpage_info[i])); 1195 1196 if (base_gfn % KVM_PAGES_PER_HPAGE(level)) 1197 new.lpage_info[i][0].write_count = 1; 1198 if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE(level)) 1199 new.lpage_info[i][lpages - 1].write_count = 1; 1200 ugfn = new.userspace_addr >> PAGE_SHIFT; 1201 /* 1202 * If the gfn and userspace address are not aligned wrt each 1203 * other, or if explicitly asked to, disable large page 1204 * support for this slot 1205 */ 1206 if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || 1207 !largepages_enabled) 1208 for (j = 0; j < lpages; ++j) 1209 new.lpage_info[i][j].write_count = 1; 1210 } 1211 1212 skip_lpage: 1213 1214 /* Allocate page dirty bitmap if needed */ 1215 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 1216 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; 1217 1218 new.dirty_bitmap = vmalloc(dirty_bytes); 1219 if (!new.dirty_bitmap) 1220 goto out_free; 1221 memset(new.dirty_bitmap, 0, dirty_bytes); 1222 if (old.npages) 1223 kvm_arch_flush_shadow(kvm); 1224 } 1225 #else /* not defined CONFIG_S390 */ 1226 new.user_alloc = user_alloc; 1227 if (user_alloc) 1228 new.userspace_addr = mem->userspace_addr; 1229 #endif /* not defined CONFIG_S390 */ 1230 1231 if (!npages) 1232 kvm_arch_flush_shadow(kvm); 1233 1234 spin_lock(&kvm->mmu_lock); 1235 if (mem->slot >= kvm->nmemslots) 1236 kvm->nmemslots = mem->slot + 1; 1237 1238 *memslot = new; 1239 spin_unlock(&kvm->mmu_lock); 1240 1241 r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc); 1242 if (r) { 1243 spin_lock(&kvm->mmu_lock); 1244 *memslot = old; 1245 spin_unlock(&kvm->mmu_lock); 1246 goto out_free; 1247 } 1248 1249 kvm_free_physmem_slot(&old, npages ? 
&new : NULL); 1250 /* Slot deletion case: we have to update the current slot */ 1251 spin_lock(&kvm->mmu_lock); 1252 if (!npages) 1253 *memslot = old; 1254 spin_unlock(&kvm->mmu_lock); 1255 #ifdef CONFIG_DMAR 1256 /* map the pages in iommu page table */ 1257 r = kvm_iommu_map_pages(kvm, base_gfn, npages); 1258 if (r) 1259 goto out; 1260 #endif 1261 return 0; 1262 1263 out_free: 1264 kvm_free_physmem_slot(&new, &old); 1265 out: 1266 return r; 1267 1268 } 1269 EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 1270 1271 int kvm_set_memory_region(struct kvm *kvm, 1272 struct kvm_userspace_memory_region *mem, 1273 int user_alloc) 1274 { 1275 int r; 1276 1277 down_write(&kvm->slots_lock); 1278 r = __kvm_set_memory_region(kvm, mem, user_alloc); 1279 up_write(&kvm->slots_lock); 1280 return r; 1281 } 1282 EXPORT_SYMBOL_GPL(kvm_set_memory_region); 1283 1284 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 1285 struct 1286 kvm_userspace_memory_region *mem, 1287 int user_alloc) 1288 { 1289 if (mem->slot >= KVM_MEMORY_SLOTS) 1290 return -EINVAL; 1291 return kvm_set_memory_region(kvm, mem, user_alloc); 1292 } 1293 1294 int kvm_get_dirty_log(struct kvm *kvm, 1295 struct kvm_dirty_log *log, int *is_dirty) 1296 { 1297 struct kvm_memory_slot *memslot; 1298 int r, i; 1299 int n; 1300 unsigned long any = 0; 1301 1302 r = -EINVAL; 1303 if (log->slot >= KVM_MEMORY_SLOTS) 1304 goto out; 1305 1306 memslot = &kvm->memslots[log->slot]; 1307 r = -ENOENT; 1308 if (!memslot->dirty_bitmap) 1309 goto out; 1310 1311 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 1312 1313 for (i = 0; !any && i < n/sizeof(long); ++i) 1314 any = memslot->dirty_bitmap[i]; 1315 1316 r = -EFAULT; 1317 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 1318 goto out; 1319 1320 if (any) 1321 *is_dirty = 1; 1322 1323 r = 0; 1324 out: 1325 return r; 1326 } 1327 1328 void kvm_disable_largepages(void) 1329 { 1330 largepages_enabled = false; 1331 } 1332 EXPORT_SYMBOL_GPL(kvm_disable_largepages); 1333 1334 int is_error_page(struct page *page) 1335 { 1336 return page == bad_page; 1337 } 1338 EXPORT_SYMBOL_GPL(is_error_page); 1339 1340 int is_error_pfn(pfn_t pfn) 1341 { 1342 return pfn == bad_pfn; 1343 } 1344 EXPORT_SYMBOL_GPL(is_error_pfn); 1345 1346 static inline unsigned long bad_hva(void) 1347 { 1348 return PAGE_OFFSET; 1349 } 1350 1351 int kvm_is_error_hva(unsigned long addr) 1352 { 1353 return addr == bad_hva(); 1354 } 1355 EXPORT_SYMBOL_GPL(kvm_is_error_hva); 1356 1357 struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn) 1358 { 1359 int i; 1360 1361 for (i = 0; i < kvm->nmemslots; ++i) { 1362 struct kvm_memory_slot *memslot = &kvm->memslots[i]; 1363 1364 if (gfn >= memslot->base_gfn 1365 && gfn < memslot->base_gfn + memslot->npages) 1366 return memslot; 1367 } 1368 return NULL; 1369 } 1370 EXPORT_SYMBOL_GPL(gfn_to_memslot_unaliased); 1371 1372 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 1373 { 1374 gfn = unalias_gfn(kvm, gfn); 1375 return gfn_to_memslot_unaliased(kvm, gfn); 1376 } 1377 1378 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) 1379 { 1380 int i; 1381 1382 gfn = unalias_gfn(kvm, gfn); 1383 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 1384 struct kvm_memory_slot *memslot = &kvm->memslots[i]; 1385 1386 if (gfn >= memslot->base_gfn 1387 && gfn < memslot->base_gfn + memslot->npages) 1388 return 1; 1389 } 1390 return 0; 1391 } 1392 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); 1393 1394 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 1395 { 1396 struct kvm_memory_slot *slot; 1397 1398 gfn 
= unalias_gfn(kvm, gfn); 1399 slot = gfn_to_memslot_unaliased(kvm, gfn); 1400 if (!slot) 1401 return bad_hva(); 1402 return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); 1403 } 1404 EXPORT_SYMBOL_GPL(gfn_to_hva); 1405 1406 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 1407 { 1408 struct page *page[1]; 1409 unsigned long addr; 1410 int npages; 1411 pfn_t pfn; 1412 1413 might_sleep(); 1414 1415 addr = gfn_to_hva(kvm, gfn); 1416 if (kvm_is_error_hva(addr)) { 1417 get_page(bad_page); 1418 return page_to_pfn(bad_page); 1419 } 1420 1421 npages = get_user_pages_fast(addr, 1, 1, page); 1422 1423 if (unlikely(npages != 1)) { 1424 struct vm_area_struct *vma; 1425 1426 down_read(¤t->mm->mmap_sem); 1427 vma = find_vma(current->mm, addr); 1428 1429 if (vma == NULL || addr < vma->vm_start || 1430 !(vma->vm_flags & VM_PFNMAP)) { 1431 up_read(¤t->mm->mmap_sem); 1432 get_page(bad_page); 1433 return page_to_pfn(bad_page); 1434 } 1435 1436 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 1437 up_read(¤t->mm->mmap_sem); 1438 BUG_ON(!kvm_is_mmio_pfn(pfn)); 1439 } else 1440 pfn = page_to_pfn(page[0]); 1441 1442 return pfn; 1443 } 1444 1445 EXPORT_SYMBOL_GPL(gfn_to_pfn); 1446 1447 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1448 { 1449 pfn_t pfn; 1450 1451 pfn = gfn_to_pfn(kvm, gfn); 1452 if (!kvm_is_mmio_pfn(pfn)) 1453 return pfn_to_page(pfn); 1454 1455 WARN_ON(kvm_is_mmio_pfn(pfn)); 1456 1457 get_page(bad_page); 1458 return bad_page; 1459 } 1460 1461 EXPORT_SYMBOL_GPL(gfn_to_page); 1462 1463 void kvm_release_page_clean(struct page *page) 1464 { 1465 kvm_release_pfn_clean(page_to_pfn(page)); 1466 } 1467 EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1468 1469 void kvm_release_pfn_clean(pfn_t pfn) 1470 { 1471 if (!kvm_is_mmio_pfn(pfn)) 1472 put_page(pfn_to_page(pfn)); 1473 } 1474 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 1475 1476 void kvm_release_page_dirty(struct page *page) 1477 { 1478 kvm_release_pfn_dirty(page_to_pfn(page)); 1479 } 1480 EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1481 1482 void kvm_release_pfn_dirty(pfn_t pfn) 1483 { 1484 kvm_set_pfn_dirty(pfn); 1485 kvm_release_pfn_clean(pfn); 1486 } 1487 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); 1488 1489 void kvm_set_page_dirty(struct page *page) 1490 { 1491 kvm_set_pfn_dirty(page_to_pfn(page)); 1492 } 1493 EXPORT_SYMBOL_GPL(kvm_set_page_dirty); 1494 1495 void kvm_set_pfn_dirty(pfn_t pfn) 1496 { 1497 if (!kvm_is_mmio_pfn(pfn)) { 1498 struct page *page = pfn_to_page(pfn); 1499 if (!PageReserved(page)) 1500 SetPageDirty(page); 1501 } 1502 } 1503 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 1504 1505 void kvm_set_pfn_accessed(pfn_t pfn) 1506 { 1507 if (!kvm_is_mmio_pfn(pfn)) 1508 mark_page_accessed(pfn_to_page(pfn)); 1509 } 1510 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 1511 1512 void kvm_get_pfn(pfn_t pfn) 1513 { 1514 if (!kvm_is_mmio_pfn(pfn)) 1515 get_page(pfn_to_page(pfn)); 1516 } 1517 EXPORT_SYMBOL_GPL(kvm_get_pfn); 1518 1519 static int next_segment(unsigned long len, int offset) 1520 { 1521 if (len > PAGE_SIZE - offset) 1522 return PAGE_SIZE - offset; 1523 else 1524 return len; 1525 } 1526 1527 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 1528 int len) 1529 { 1530 int r; 1531 unsigned long addr; 1532 1533 addr = gfn_to_hva(kvm, gfn); 1534 if (kvm_is_error_hva(addr)) 1535 return -EFAULT; 1536 r = copy_from_user(data, (void __user *)addr + offset, len); 1537 if (r) 1538 return -EFAULT; 1539 return 0; 1540 } 1541 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 1542 1543 int kvm_read_guest(struct kvm *kvm, 
gpa_t gpa, void *data, unsigned long len) 1544 { 1545 gfn_t gfn = gpa >> PAGE_SHIFT; 1546 int seg; 1547 int offset = offset_in_page(gpa); 1548 int ret; 1549 1550 while ((seg = next_segment(len, offset)) != 0) { 1551 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 1552 if (ret < 0) 1553 return ret; 1554 offset = 0; 1555 len -= seg; 1556 data += seg; 1557 ++gfn; 1558 } 1559 return 0; 1560 } 1561 EXPORT_SYMBOL_GPL(kvm_read_guest); 1562 1563 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 1564 unsigned long len) 1565 { 1566 int r; 1567 unsigned long addr; 1568 gfn_t gfn = gpa >> PAGE_SHIFT; 1569 int offset = offset_in_page(gpa); 1570 1571 addr = gfn_to_hva(kvm, gfn); 1572 if (kvm_is_error_hva(addr)) 1573 return -EFAULT; 1574 pagefault_disable(); 1575 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 1576 pagefault_enable(); 1577 if (r) 1578 return -EFAULT; 1579 return 0; 1580 } 1581 EXPORT_SYMBOL(kvm_read_guest_atomic); 1582 1583 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, 1584 int offset, int len) 1585 { 1586 int r; 1587 unsigned long addr; 1588 1589 addr = gfn_to_hva(kvm, gfn); 1590 if (kvm_is_error_hva(addr)) 1591 return -EFAULT; 1592 r = copy_to_user((void __user *)addr + offset, data, len); 1593 if (r) 1594 return -EFAULT; 1595 mark_page_dirty(kvm, gfn); 1596 return 0; 1597 } 1598 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 1599 1600 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 1601 unsigned long len) 1602 { 1603 gfn_t gfn = gpa >> PAGE_SHIFT; 1604 int seg; 1605 int offset = offset_in_page(gpa); 1606 int ret; 1607 1608 while ((seg = next_segment(len, offset)) != 0) { 1609 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 1610 if (ret < 0) 1611 return ret; 1612 offset = 0; 1613 len -= seg; 1614 data += seg; 1615 ++gfn; 1616 } 1617 return 0; 1618 } 1619 1620 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 1621 { 1622 return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); 1623 } 1624 EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 1625 1626 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 1627 { 1628 gfn_t gfn = gpa >> PAGE_SHIFT; 1629 int seg; 1630 int offset = offset_in_page(gpa); 1631 int ret; 1632 1633 while ((seg = next_segment(len, offset)) != 0) { 1634 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 1635 if (ret < 0) 1636 return ret; 1637 offset = 0; 1638 len -= seg; 1639 ++gfn; 1640 } 1641 return 0; 1642 } 1643 EXPORT_SYMBOL_GPL(kvm_clear_guest); 1644 1645 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 1646 { 1647 struct kvm_memory_slot *memslot; 1648 1649 gfn = unalias_gfn(kvm, gfn); 1650 memslot = gfn_to_memslot_unaliased(kvm, gfn); 1651 if (memslot && memslot->dirty_bitmap) { 1652 unsigned long rel_gfn = gfn - memslot->base_gfn; 1653 1654 /* avoid RMW */ 1655 if (!test_bit(rel_gfn, memslot->dirty_bitmap)) 1656 set_bit(rel_gfn, memslot->dirty_bitmap); 1657 } 1658 } 1659 1660 /* 1661 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 
1662 */ 1663 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 1664 { 1665 DEFINE_WAIT(wait); 1666 1667 for (;;) { 1668 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 1669 1670 if (kvm_arch_vcpu_runnable(vcpu)) { 1671 set_bit(KVM_REQ_UNHALT, &vcpu->requests); 1672 break; 1673 } 1674 if (kvm_cpu_has_pending_timer(vcpu)) 1675 break; 1676 if (signal_pending(current)) 1677 break; 1678 1679 vcpu_put(vcpu); 1680 schedule(); 1681 vcpu_load(vcpu); 1682 } 1683 1684 finish_wait(&vcpu->wq, &wait); 1685 } 1686 1687 void kvm_resched(struct kvm_vcpu *vcpu) 1688 { 1689 if (!need_resched()) 1690 return; 1691 cond_resched(); 1692 } 1693 EXPORT_SYMBOL_GPL(kvm_resched); 1694 1695 static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1696 { 1697 struct kvm_vcpu *vcpu = vma->vm_file->private_data; 1698 struct page *page; 1699 1700 if (vmf->pgoff == 0) 1701 page = virt_to_page(vcpu->run); 1702 #ifdef CONFIG_X86 1703 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 1704 page = virt_to_page(vcpu->arch.pio_data); 1705 #endif 1706 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1707 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 1708 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 1709 #endif 1710 else 1711 return VM_FAULT_SIGBUS; 1712 get_page(page); 1713 vmf->page = page; 1714 return 0; 1715 } 1716 1717 static struct vm_operations_struct kvm_vcpu_vm_ops = { 1718 .fault = kvm_vcpu_fault, 1719 }; 1720 1721 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 1722 { 1723 vma->vm_ops = &kvm_vcpu_vm_ops; 1724 return 0; 1725 } 1726 1727 static int kvm_vcpu_release(struct inode *inode, struct file *filp) 1728 { 1729 struct kvm_vcpu *vcpu = filp->private_data; 1730 1731 kvm_put_kvm(vcpu->kvm); 1732 return 0; 1733 } 1734 1735 static struct file_operations kvm_vcpu_fops = { 1736 .release = kvm_vcpu_release, 1737 .unlocked_ioctl = kvm_vcpu_ioctl, 1738 .compat_ioctl = kvm_vcpu_ioctl, 1739 .mmap = kvm_vcpu_mmap, 1740 }; 1741 1742 /* 1743 * Allocates an inode for the vcpu. 1744 */ 1745 static int create_vcpu_fd(struct kvm_vcpu *vcpu) 1746 { 1747 return anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0); 1748 } 1749 1750 /* 1751 * Creates some virtual cpus. Good luck creating more than one. 
1752 */ 1753 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) 1754 { 1755 int r; 1756 struct kvm_vcpu *vcpu, *v; 1757 1758 vcpu = kvm_arch_vcpu_create(kvm, id); 1759 if (IS_ERR(vcpu)) 1760 return PTR_ERR(vcpu); 1761 1762 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 1763 1764 r = kvm_arch_vcpu_setup(vcpu); 1765 if (r) 1766 return r; 1767 1768 mutex_lock(&kvm->lock); 1769 if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) { 1770 r = -EINVAL; 1771 goto vcpu_destroy; 1772 } 1773 1774 kvm_for_each_vcpu(r, v, kvm) 1775 if (v->vcpu_id == id) { 1776 r = -EEXIST; 1777 goto vcpu_destroy; 1778 } 1779 1780 BUG_ON(kvm->vcpus[atomic_read(&kvm->online_vcpus)]); 1781 1782 /* Now it's all set up, let userspace reach it */ 1783 kvm_get_kvm(kvm); 1784 r = create_vcpu_fd(vcpu); 1785 if (r < 0) { 1786 kvm_put_kvm(kvm); 1787 goto vcpu_destroy; 1788 } 1789 1790 kvm->vcpus[atomic_read(&kvm->online_vcpus)] = vcpu; 1791 smp_wmb(); 1792 atomic_inc(&kvm->online_vcpus); 1793 1794 #ifdef CONFIG_KVM_APIC_ARCHITECTURE 1795 if (kvm->bsp_vcpu_id == id) 1796 kvm->bsp_vcpu = vcpu; 1797 #endif 1798 mutex_unlock(&kvm->lock); 1799 return r; 1800 1801 vcpu_destroy: 1802 mutex_unlock(&kvm->lock); 1803 kvm_arch_vcpu_destroy(vcpu); 1804 return r; 1805 } 1806 1807 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 1808 { 1809 if (sigset) { 1810 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 1811 vcpu->sigset_active = 1; 1812 vcpu->sigset = *sigset; 1813 } else 1814 vcpu->sigset_active = 0; 1815 return 0; 1816 } 1817 1818 #ifdef __KVM_HAVE_MSIX 1819 static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, 1820 struct kvm_assigned_msix_nr *entry_nr) 1821 { 1822 int r = 0; 1823 struct kvm_assigned_dev_kernel *adev; 1824 1825 mutex_lock(&kvm->lock); 1826 1827 adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 1828 entry_nr->assigned_dev_id); 1829 if (!adev) { 1830 r = -EINVAL; 1831 goto msix_nr_out; 1832 } 1833 1834 if (adev->entries_nr == 0) { 1835 adev->entries_nr = entry_nr->entry_nr; 1836 if (adev->entries_nr == 0 || 1837 adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) { 1838 r = -EINVAL; 1839 goto msix_nr_out; 1840 } 1841 1842 adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) * 1843 entry_nr->entry_nr, 1844 GFP_KERNEL); 1845 if (!adev->host_msix_entries) { 1846 r = -ENOMEM; 1847 goto msix_nr_out; 1848 } 1849 adev->guest_msix_entries = kzalloc( 1850 sizeof(struct kvm_guest_msix_entry) * 1851 entry_nr->entry_nr, GFP_KERNEL); 1852 if (!adev->guest_msix_entries) { 1853 kfree(adev->host_msix_entries); 1854 r = -ENOMEM; 1855 goto msix_nr_out; 1856 } 1857 } else /* Not allowed set MSI-X number twice */ 1858 r = -EINVAL; 1859 msix_nr_out: 1860 mutex_unlock(&kvm->lock); 1861 return r; 1862 } 1863 1864 static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, 1865 struct kvm_assigned_msix_entry *entry) 1866 { 1867 int r = 0, i; 1868 struct kvm_assigned_dev_kernel *adev; 1869 1870 mutex_lock(&kvm->lock); 1871 1872 adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 1873 entry->assigned_dev_id); 1874 1875 if (!adev) { 1876 r = -EINVAL; 1877 goto msix_entry_out; 1878 } 1879 1880 for (i = 0; i < adev->entries_nr; i++) 1881 if (adev->guest_msix_entries[i].vector == 0 || 1882 adev->guest_msix_entries[i].entry == entry->entry) { 1883 adev->guest_msix_entries[i].entry = entry->entry; 1884 adev->guest_msix_entries[i].vector = entry->gsi; 1885 adev->host_msix_entries[i].entry = entry->entry; 1886 break; 1887 } 1888 if (i == adev->entries_nr) { 1889 r = -ENOSPC; 
1890 goto msix_entry_out; 1891 } 1892 1893 msix_entry_out: 1894 mutex_unlock(&kvm->lock); 1895 1896 return r; 1897 } 1898 #endif 1899 1900 static long kvm_vcpu_ioctl(struct file *filp, 1901 unsigned int ioctl, unsigned long arg) 1902 { 1903 struct kvm_vcpu *vcpu = filp->private_data; 1904 void __user *argp = (void __user *)arg; 1905 int r; 1906 struct kvm_fpu *fpu = NULL; 1907 struct kvm_sregs *kvm_sregs = NULL; 1908 1909 if (vcpu->kvm->mm != current->mm) 1910 return -EIO; 1911 switch (ioctl) { 1912 case KVM_RUN: 1913 r = -EINVAL; 1914 if (arg) 1915 goto out; 1916 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 1917 break; 1918 case KVM_GET_REGS: { 1919 struct kvm_regs *kvm_regs; 1920 1921 r = -ENOMEM; 1922 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 1923 if (!kvm_regs) 1924 goto out; 1925 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 1926 if (r) 1927 goto out_free1; 1928 r = -EFAULT; 1929 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 1930 goto out_free1; 1931 r = 0; 1932 out_free1: 1933 kfree(kvm_regs); 1934 break; 1935 } 1936 case KVM_SET_REGS: { 1937 struct kvm_regs *kvm_regs; 1938 1939 r = -ENOMEM; 1940 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 1941 if (!kvm_regs) 1942 goto out; 1943 r = -EFAULT; 1944 if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs))) 1945 goto out_free2; 1946 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 1947 if (r) 1948 goto out_free2; 1949 r = 0; 1950 out_free2: 1951 kfree(kvm_regs); 1952 break; 1953 } 1954 case KVM_GET_SREGS: { 1955 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 1956 r = -ENOMEM; 1957 if (!kvm_sregs) 1958 goto out; 1959 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 1960 if (r) 1961 goto out; 1962 r = -EFAULT; 1963 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 1964 goto out; 1965 r = 0; 1966 break; 1967 } 1968 case KVM_SET_SREGS: { 1969 kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 1970 r = -ENOMEM; 1971 if (!kvm_sregs) 1972 goto out; 1973 r = -EFAULT; 1974 if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs))) 1975 goto out; 1976 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 1977 if (r) 1978 goto out; 1979 r = 0; 1980 break; 1981 } 1982 case KVM_GET_MP_STATE: { 1983 struct kvm_mp_state mp_state; 1984 1985 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 1986 if (r) 1987 goto out; 1988 r = -EFAULT; 1989 if (copy_to_user(argp, &mp_state, sizeof mp_state)) 1990 goto out; 1991 r = 0; 1992 break; 1993 } 1994 case KVM_SET_MP_STATE: { 1995 struct kvm_mp_state mp_state; 1996 1997 r = -EFAULT; 1998 if (copy_from_user(&mp_state, argp, sizeof mp_state)) 1999 goto out; 2000 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 2001 if (r) 2002 goto out; 2003 r = 0; 2004 break; 2005 } 2006 case KVM_TRANSLATE: { 2007 struct kvm_translation tr; 2008 2009 r = -EFAULT; 2010 if (copy_from_user(&tr, argp, sizeof tr)) 2011 goto out; 2012 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 2013 if (r) 2014 goto out; 2015 r = -EFAULT; 2016 if (copy_to_user(argp, &tr, sizeof tr)) 2017 goto out; 2018 r = 0; 2019 break; 2020 } 2021 case KVM_SET_GUEST_DEBUG: { 2022 struct kvm_guest_debug dbg; 2023 2024 r = -EFAULT; 2025 if (copy_from_user(&dbg, argp, sizeof dbg)) 2026 goto out; 2027 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 2028 if (r) 2029 goto out; 2030 r = 0; 2031 break; 2032 } 2033 case KVM_SET_SIGNAL_MASK: { 2034 struct kvm_signal_mask __user *sigmask_arg = argp; 2035 struct kvm_signal_mask kvm_sigmask; 2036 sigset_t sigset, *p; 2037 2038 p = 
NULL; 2039 if (argp) { 2040 r = -EFAULT; 2041 if (copy_from_user(&kvm_sigmask, argp, 2042 sizeof kvm_sigmask)) 2043 goto out; 2044 r = -EINVAL; 2045 if (kvm_sigmask.len != sizeof sigset) 2046 goto out; 2047 r = -EFAULT; 2048 if (copy_from_user(&sigset, sigmask_arg->sigset, 2049 sizeof sigset)) 2050 goto out; 2051 p = &sigset; 2052 } 2053 r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 2054 break; 2055 } 2056 case KVM_GET_FPU: { 2057 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 2058 r = -ENOMEM; 2059 if (!fpu) 2060 goto out; 2061 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); 2062 if (r) 2063 goto out; 2064 r = -EFAULT; 2065 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) 2066 goto out; 2067 r = 0; 2068 break; 2069 } 2070 case KVM_SET_FPU: { 2071 fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 2072 r = -ENOMEM; 2073 if (!fpu) 2074 goto out; 2075 r = -EFAULT; 2076 if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu))) 2077 goto out; 2078 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 2079 if (r) 2080 goto out; 2081 r = 0; 2082 break; 2083 } 2084 default: 2085 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 2086 } 2087 out: 2088 kfree(fpu); 2089 kfree(kvm_sregs); 2090 return r; 2091 } 2092 2093 static long kvm_vm_ioctl(struct file *filp, 2094 unsigned int ioctl, unsigned long arg) 2095 { 2096 struct kvm *kvm = filp->private_data; 2097 void __user *argp = (void __user *)arg; 2098 int r; 2099 2100 if (kvm->mm != current->mm) 2101 return -EIO; 2102 switch (ioctl) { 2103 case KVM_CREATE_VCPU: 2104 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 2105 if (r < 0) 2106 goto out; 2107 break; 2108 case KVM_SET_USER_MEMORY_REGION: { 2109 struct kvm_userspace_memory_region kvm_userspace_mem; 2110 2111 r = -EFAULT; 2112 if (copy_from_user(&kvm_userspace_mem, argp, 2113 sizeof kvm_userspace_mem)) 2114 goto out; 2115 2116 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1); 2117 if (r) 2118 goto out; 2119 break; 2120 } 2121 case KVM_GET_DIRTY_LOG: { 2122 struct kvm_dirty_log log; 2123 2124 r = -EFAULT; 2125 if (copy_from_user(&log, argp, sizeof log)) 2126 goto out; 2127 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2128 if (r) 2129 goto out; 2130 break; 2131 } 2132 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2133 case KVM_REGISTER_COALESCED_MMIO: { 2134 struct kvm_coalesced_mmio_zone zone; 2135 r = -EFAULT; 2136 if (copy_from_user(&zone, argp, sizeof zone)) 2137 goto out; 2138 r = -ENXIO; 2139 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 2140 if (r) 2141 goto out; 2142 r = 0; 2143 break; 2144 } 2145 case KVM_UNREGISTER_COALESCED_MMIO: { 2146 struct kvm_coalesced_mmio_zone zone; 2147 r = -EFAULT; 2148 if (copy_from_user(&zone, argp, sizeof zone)) 2149 goto out; 2150 r = -ENXIO; 2151 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 2152 if (r) 2153 goto out; 2154 r = 0; 2155 break; 2156 } 2157 #endif 2158 #ifdef KVM_CAP_DEVICE_ASSIGNMENT 2159 case KVM_ASSIGN_PCI_DEVICE: { 2160 struct kvm_assigned_pci_dev assigned_dev; 2161 2162 r = -EFAULT; 2163 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) 2164 goto out; 2165 r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); 2166 if (r) 2167 goto out; 2168 break; 2169 } 2170 case KVM_ASSIGN_IRQ: { 2171 r = -EOPNOTSUPP; 2172 break; 2173 } 2174 #ifdef KVM_CAP_ASSIGN_DEV_IRQ 2175 case KVM_ASSIGN_DEV_IRQ: { 2176 struct kvm_assigned_irq assigned_irq; 2177 2178 r = -EFAULT; 2179 if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) 2180 goto out; 2181 r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); 2182 if (r) 2183 goto out; 2184 break; 2185 
static long kvm_vm_ioctl(struct file *filp,
			 unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;

	if (kvm->mm != current->mm)
		return -EIO;
	switch (ioctl) {
	case KVM_CREATE_VCPU:
		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
		if (r < 0)
			goto out;
		break;
	case KVM_SET_USER_MEMORY_REGION: {
		struct kvm_userspace_memory_region kvm_userspace_mem;

		r = -EFAULT;
		if (copy_from_user(&kvm_userspace_mem, argp,
				   sizeof kvm_userspace_mem))
			goto out;

		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
		if (r)
			goto out;
		break;
	}
	case KVM_GET_DIRTY_LOG: {
		struct kvm_dirty_log log;

		r = -EFAULT;
		if (copy_from_user(&log, argp, sizeof log))
			goto out;
		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
		if (r)
			goto out;
		break;
	}
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	case KVM_REGISTER_COALESCED_MMIO: {
		struct kvm_coalesced_mmio_zone zone;
		r = -EFAULT;
		if (copy_from_user(&zone, argp, sizeof zone))
			goto out;
		r = -ENXIO;
		r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_UNREGISTER_COALESCED_MMIO: {
		struct kvm_coalesced_mmio_zone zone;
		r = -EFAULT;
		if (copy_from_user(&zone, argp, sizeof zone))
			goto out;
		r = -ENXIO;
		r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone);
		if (r)
			goto out;
		r = 0;
		break;
	}
#endif
#ifdef KVM_CAP_DEVICE_ASSIGNMENT
	case KVM_ASSIGN_PCI_DEVICE: {
		struct kvm_assigned_pci_dev assigned_dev;

		r = -EFAULT;
		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
			goto out;
		r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
		if (r)
			goto out;
		break;
	}
	case KVM_ASSIGN_IRQ: {
		r = -EOPNOTSUPP;
		break;
	}
#ifdef KVM_CAP_ASSIGN_DEV_IRQ
	case KVM_ASSIGN_DEV_IRQ: {
		struct kvm_assigned_irq assigned_irq;

		r = -EFAULT;
		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
			goto out;
		r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
		if (r)
			goto out;
		break;
	}
	case KVM_DEASSIGN_DEV_IRQ: {
		struct kvm_assigned_irq assigned_irq;

		r = -EFAULT;
		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
			goto out;
		r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
		if (r)
			goto out;
		break;
	}
#endif
#endif
#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
	case KVM_DEASSIGN_PCI_DEVICE: {
		struct kvm_assigned_pci_dev assigned_dev;

		r = -EFAULT;
		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
			goto out;
		r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
		if (r)
			goto out;
		break;
	}
#endif
#ifdef KVM_CAP_IRQ_ROUTING
	case KVM_SET_GSI_ROUTING: {
		struct kvm_irq_routing routing;
		struct kvm_irq_routing __user *urouting;
		struct kvm_irq_routing_entry *entries;

		r = -EFAULT;
		if (copy_from_user(&routing, argp, sizeof(routing)))
			goto out;
		r = -EINVAL;
		if (routing.nr >= KVM_MAX_IRQ_ROUTES)
			goto out;
		if (routing.flags)
			goto out;
		r = -ENOMEM;
		entries = vmalloc(routing.nr * sizeof(*entries));
		if (!entries)
			goto out;
		r = -EFAULT;
		urouting = argp;
		if (copy_from_user(entries, urouting->entries,
				   routing.nr * sizeof(*entries)))
			goto out_free_irq_routing;
		r = kvm_set_irq_routing(kvm, entries, routing.nr,
					routing.flags);
	out_free_irq_routing:
		vfree(entries);
		break;
	}
#endif /* KVM_CAP_IRQ_ROUTING */
#ifdef __KVM_HAVE_MSIX
	case KVM_ASSIGN_SET_MSIX_NR: {
		struct kvm_assigned_msix_nr entry_nr;
		r = -EFAULT;
		if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
			goto out;
		r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
		if (r)
			goto out;
		break;
	}
	case KVM_ASSIGN_SET_MSIX_ENTRY: {
		struct kvm_assigned_msix_entry entry;
		r = -EFAULT;
		if (copy_from_user(&entry, argp, sizeof entry))
			goto out;
		r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
		if (r)
			goto out;
		break;
	}
#endif
	case KVM_IRQFD: {
		struct kvm_irqfd data;

		r = -EFAULT;
		if (copy_from_user(&data, argp, sizeof data))
			goto out;
		r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
		break;
	}
	case KVM_IOEVENTFD: {
		struct kvm_ioeventfd data;

		r = -EFAULT;
		if (copy_from_user(&data, argp, sizeof data))
			goto out;
		r = kvm_ioeventfd(kvm, &data);
		break;
	}
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
	case KVM_SET_BOOT_CPU_ID:
		r = 0;
		mutex_lock(&kvm->lock);
		if (atomic_read(&kvm->online_vcpus) != 0)
			r = -EBUSY;
		else
			kvm->bsp_vcpu_id = arg;
		mutex_unlock(&kvm->lock);
		break;
#endif
	default:
		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
	}
out:
	return r;
}

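/*
 * kvm_vm_fault() below backs mmap() of the VM file descriptor: the page
 * offset of the mapping is interpreted as a guest frame number, resolved
 * through the memory slots and pinned with get_user_pages().  Rough
 * illustrative sketch (not part of this file), assuming vm_fd and gfn come
 * from the caller:
 *
 *	void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
 *		       vm_fd, (off_t)gfn * 4096);
 *
 * Touching memory outside any registered slot makes the fault handler
 * return VM_FAULT_SIGBUS, i.e. the process receives SIGBUS.
 */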
static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page[1];
	unsigned long addr;
	int npages;
	gfn_t gfn = vmf->pgoff;
	struct kvm *kvm = vma->vm_file->private_data;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return VM_FAULT_SIGBUS;

	npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page,
				NULL);
	if (unlikely(npages != 1))
		return VM_FAULT_SIGBUS;

	vmf->page = page[0];
	return 0;
}

static struct vm_operations_struct kvm_vm_vm_ops = {
	.fault = kvm_vm_fault,
};

static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vm_vm_ops;
	return 0;
}

static struct file_operations kvm_vm_fops = {
	.release        = kvm_vm_release,
	.unlocked_ioctl = kvm_vm_ioctl,
	.compat_ioctl   = kvm_vm_ioctl,
	.mmap           = kvm_vm_mmap,
};

static int kvm_dev_ioctl_create_vm(void)
{
	int fd;
	struct kvm *kvm;

	kvm = kvm_create_vm();
	if (IS_ERR(kvm))
		return PTR_ERR(kvm);
	fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, 0);
	if (fd < 0)
		kvm_put_kvm(kvm);

	return fd;
}

static long kvm_dev_ioctl_check_extension_generic(long arg)
{
	switch (arg) {
	case KVM_CAP_USER_MEMORY:
	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
	case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
#ifdef CONFIG_KVM_APIC_ARCHITECTURE
	case KVM_CAP_SET_BOOT_CPU_ID:
#endif
		return 1;
#ifdef CONFIG_HAVE_KVM_IRQCHIP
	case KVM_CAP_IRQ_ROUTING:
		return KVM_MAX_IRQ_ROUTES;
#endif
	default:
		break;
	}
	return kvm_dev_ioctl_check_extension(arg);
}

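/*
 * kvm_dev_ioctl() below serves the system-level ioctls on /dev/kvm itself.
 * Illustrative userspace sketch (not part of this file) of the usual
 * bring-up sequence:
 *
 *	int kvm_fd = open("/dev/kvm", O_RDWR);
 *	if (ioctl(kvm_fd, KVM_GET_API_VERSION, 0) != KVM_API_VERSION)
 *		errx(1, "unsupported KVM API version");
 *	int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
 *
 * The returned vm_fd is the anonymous inode set up by
 * kvm_dev_ioctl_create_vm() above and is then driven through kvm_vm_ioctl().
 */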
static long kvm_dev_ioctl(struct file *filp,
			  unsigned int ioctl, unsigned long arg)
{
	long r = -EINVAL;

	switch (ioctl) {
	case KVM_GET_API_VERSION:
		r = -EINVAL;
		if (arg)
			goto out;
		r = KVM_API_VERSION;
		break;
	case KVM_CREATE_VM:
		r = -EINVAL;
		if (arg)
			goto out;
		r = kvm_dev_ioctl_create_vm();
		break;
	case KVM_CHECK_EXTENSION:
		r = kvm_dev_ioctl_check_extension_generic(arg);
		break;
	case KVM_GET_VCPU_MMAP_SIZE:
		r = -EINVAL;
		if (arg)
			goto out;
		r = PAGE_SIZE;    /* struct kvm_run */
#ifdef CONFIG_X86
		r += PAGE_SIZE;   /* pio data page */
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
		r += PAGE_SIZE;   /* coalesced mmio ring page */
#endif
		break;
	case KVM_TRACE_ENABLE:
	case KVM_TRACE_PAUSE:
	case KVM_TRACE_DISABLE:
		r = -EOPNOTSUPP;
		break;
	default:
		return kvm_arch_dev_ioctl(filp, ioctl, arg);
	}
out:
	return r;
}

static struct file_operations kvm_chardev_ops = {
	.unlocked_ioctl = kvm_dev_ioctl,
	.compat_ioctl   = kvm_dev_ioctl,
};

static struct miscdevice kvm_dev = {
	KVM_MINOR,
	"kvm",
	&kvm_chardev_ops,
};

static void hardware_enable(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
		return;
	cpumask_set_cpu(cpu, cpus_hardware_enabled);
	kvm_arch_hardware_enable(NULL);
}

static void hardware_disable(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
		return;
	cpumask_clear_cpu(cpu, cpus_hardware_enabled);
	kvm_arch_hardware_disable(NULL);
}

static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
			   void *v)
{
	int cpu = (long)v;

	val &= ~CPU_TASKS_FROZEN;
	switch (val) {
	case CPU_DYING:
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
		hardware_disable(NULL);
		break;
	case CPU_UP_CANCELED:
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
		smp_call_function_single(cpu, hardware_disable, NULL, 1);
		break;
	case CPU_ONLINE:
		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
		       cpu);
		smp_call_function_single(cpu, hardware_enable, NULL, 1);
		break;
	}
	return NOTIFY_OK;
}

asmlinkage void kvm_handle_fault_on_reboot(void)
{
	if (kvm_rebooting)
		/* spin while reset goes on */
		while (true)
			;
	/* Fault while not rebooting.  We want the trace. */
	BUG();
}
EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot);

static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
		      void *v)
{
	/*
	 * Some (well, at least mine) BIOSes hang on reboot if the CPU is
	 * still in VMX root mode.
	 *
	 * Also, Intel TXT requires VMX to be off on all CPUs when the
	 * system shuts down.
	 */
	printk(KERN_INFO "kvm: exiting hardware virtualization\n");
	kvm_rebooting = true;
	on_each_cpu(hardware_disable, NULL, 1);
	return NOTIFY_OK;
}

static struct notifier_block kvm_reboot_notifier = {
	.notifier_call = kvm_reboot,
	.priority = 0,
};

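/*
 * The kvm_io_bus helpers below manage the lists of in-kernel I/O devices
 * (for example coalesced MMIO and ioeventfd) that may claim a guest PIO or
 * MMIO access before the exit is handed to userspace.  The buses are
 * walked under kvm->slots_lock; a handler that accepts the access returns
 * 0, and -EOPNOTSUPP tells the caller that no in-kernel device matched.
 */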
void kvm_io_bus_init(struct kvm_io_bus *bus)
{
	memset(bus, 0, sizeof(*bus));
}

void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
	int i;

	for (i = 0; i < bus->dev_count; i++) {
		struct kvm_io_device *pos = bus->devs[i];

		kvm_iodevice_destructor(pos);
	}
}

/* kvm_io_bus_write - called under kvm->slots_lock */
int kvm_io_bus_write(struct kvm_io_bus *bus, gpa_t addr,
		     int len, const void *val)
{
	int i;
	for (i = 0; i < bus->dev_count; i++)
		if (!kvm_iodevice_write(bus->devs[i], addr, len, val))
			return 0;
	return -EOPNOTSUPP;
}

/* kvm_io_bus_read - called under kvm->slots_lock */
int kvm_io_bus_read(struct kvm_io_bus *bus, gpa_t addr, int len, void *val)
{
	int i;
	for (i = 0; i < bus->dev_count; i++)
		if (!kvm_iodevice_read(bus->devs[i], addr, len, val))
			return 0;
	return -EOPNOTSUPP;
}

int kvm_io_bus_register_dev(struct kvm *kvm, struct kvm_io_bus *bus,
			    struct kvm_io_device *dev)
{
	int ret;

	down_write(&kvm->slots_lock);
	ret = __kvm_io_bus_register_dev(bus, dev);
	up_write(&kvm->slots_lock);

	return ret;
}

/* An unlocked version. Caller must have write lock on slots_lock. */
int __kvm_io_bus_register_dev(struct kvm_io_bus *bus,
			      struct kvm_io_device *dev)
{
	if (bus->dev_count > NR_IOBUS_DEVS-1)
		return -ENOSPC;

	bus->devs[bus->dev_count++] = dev;

	return 0;
}

void kvm_io_bus_unregister_dev(struct kvm *kvm,
			       struct kvm_io_bus *bus,
			       struct kvm_io_device *dev)
{
	down_write(&kvm->slots_lock);
	__kvm_io_bus_unregister_dev(bus, dev);
	up_write(&kvm->slots_lock);
}

/* An unlocked version. Caller must have write lock on slots_lock. */
void __kvm_io_bus_unregister_dev(struct kvm_io_bus *bus,
				 struct kvm_io_device *dev)
{
	int i;

	for (i = 0; i < bus->dev_count; i++)
		if (bus->devs[i] == dev) {
			bus->devs[i] = bus->devs[--bus->dev_count];
			break;
		}
}

static struct notifier_block kvm_cpu_notifier = {
	.notifier_call = kvm_cpu_hotplug,
	.priority = 20, /* must be > scheduler priority */
};

static int vm_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;

	*val = 0;
	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		*val += *(u32 *)((void *)kvm + offset);
	spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");

static int vcpu_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	int i;

	*val = 0;
	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		kvm_for_each_vcpu(i, vcpu, kvm)
			*val += *(u32 *)((void *)vcpu + offset);

	spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");

static struct file_operations *stat_fops[] = {
	[KVM_STAT_VCPU] = &vcpu_stat_fops,
	[KVM_STAT_VM]   = &vm_stat_fops,
};

static void kvm_init_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
	for (p = debugfs_entries; p->name; ++p)
		p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
						(void *)(long)p->offset,
						stat_fops[p->kind]);
}

static void kvm_exit_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	for (p = debugfs_entries; p->name; ++p)
		debugfs_remove(p->dentry);
	debugfs_remove(kvm_debugfs_dir);
}

static int kvm_suspend(struct sys_device *dev, pm_message_t state)
{
	hardware_disable(NULL);
	return 0;
}

static int kvm_resume(struct sys_device *dev)
{
	hardware_enable(NULL);
	return 0;
}

static struct sysdev_class kvm_sysdev_class = {
	.name = "kvm",
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

static struct sys_device kvm_sysdev = {
	.id = 0,
	.cls = &kvm_sysdev_class,
};

struct page *bad_page;
pfn_t bad_pfn;

static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
	return container_of(pn, struct kvm_vcpu, preempt_notifier);
}

static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
			  struct task_struct *next)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_put(vcpu);
}

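/*
 * kvm_init() is called from the arch module's init path with the arch
 * opaque pointer, the size of the arch vcpu structure and the owning
 * module; illustrative example, roughly what the x86 VMX module passes:
 *
 *	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
 *
 * Set-up order: debugfs stats, arch init, the shared bad_page, hardware
 * setup, a processor compatibility check on every online CPU, enabling
 * virtualization everywhere, then the hotplug/reboot notifiers, the
 * sysdev, the vcpu slab cache and finally the /dev/kvm misc device.
 * Failures unwind in reverse through the out_free_* labels below.
 */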
int kvm_init(void *opaque, unsigned int vcpu_size,
	     struct module *module)
{
	int r;
	int cpu;

	kvm_init_debug();

	r = kvm_arch_init(opaque);
	if (r)
		goto out_fail;

	bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (bad_page == NULL) {
		r = -ENOMEM;
		goto out;
	}

	bad_pfn = page_to_pfn(bad_page);

	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
		r = -ENOMEM;
		goto out_free_0;
	}

	r = kvm_arch_hardware_setup();
	if (r < 0)
		goto out_free_0a;

	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu,
				kvm_arch_check_processor_compat,
				&r, 1);
		if (r < 0)
			goto out_free_1;
	}

	on_each_cpu(hardware_enable, NULL, 1);
	r = register_cpu_notifier(&kvm_cpu_notifier);
	if (r)
		goto out_free_2;
	register_reboot_notifier(&kvm_reboot_notifier);

	r = sysdev_class_register(&kvm_sysdev_class);
	if (r)
		goto out_free_3;

	r = sysdev_register(&kvm_sysdev);
	if (r)
		goto out_free_4;

	/* A kmem cache lets us meet the alignment requirements of fx_save. */
	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
					   __alignof__(struct kvm_vcpu),
					   0, NULL);
	if (!kvm_vcpu_cache) {
		r = -ENOMEM;
		goto out_free_5;
	}

	kvm_chardev_ops.owner = module;
	kvm_vm_fops.owner = module;
	kvm_vcpu_fops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
		printk(KERN_ERR "kvm: misc device register failed\n");
		goto out_free;
	}

	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

	return 0;

out_free:
	kmem_cache_destroy(kvm_vcpu_cache);
out_free_5:
	sysdev_unregister(&kvm_sysdev);
out_free_4:
	sysdev_class_unregister(&kvm_sysdev_class);
out_free_3:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_2:
	on_each_cpu(hardware_disable, NULL, 1);
out_free_1:
	kvm_arch_hardware_unsetup();
out_free_0a:
	free_cpumask_var(cpus_hardware_enabled);
out_free_0:
	__free_page(bad_page);
out:
	kvm_arch_exit();
out_fail:
	kvm_exit_debug();
	return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

void kvm_exit(void)
{
	tracepoint_synchronize_unregister();
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	sysdev_unregister(&kvm_sysdev);
	sysdev_class_unregister(&kvm_sysdev_class);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
	on_each_cpu(hardware_disable, NULL, 1);
	kvm_arch_hardware_unsetup();
	kvm_arch_exit();
	kvm_exit_debug();
	free_cpumask_var(cpus_hardware_enabled);
	__free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);