1 /* 2 * Kernel-based Virtual Machine driver for Linux 3 * 4 * This module enables machines with Intel VT-x extensions to run virtual 5 * machines without emulation or binary translation. 6 * 7 * Copyright (C) 2006 Qumranet, Inc. 8 * 9 * Authors: 10 * Avi Kivity <avi@qumranet.com> 11 * Yaniv Kamay <yaniv@qumranet.com> 12 * 13 * This work is licensed under the terms of the GNU GPL, version 2. See 14 * the COPYING file in the top-level directory. 15 * 16 */ 17 18 #include "iodev.h" 19 20 #include <linux/kvm_host.h> 21 #include <linux/kvm.h> 22 #include <linux/module.h> 23 #include <linux/errno.h> 24 #include <linux/percpu.h> 25 #include <linux/gfp.h> 26 #include <linux/mm.h> 27 #include <linux/miscdevice.h> 28 #include <linux/vmalloc.h> 29 #include <linux/reboot.h> 30 #include <linux/debugfs.h> 31 #include <linux/highmem.h> 32 #include <linux/file.h> 33 #include <linux/sysdev.h> 34 #include <linux/cpu.h> 35 #include <linux/sched.h> 36 #include <linux/cpumask.h> 37 #include <linux/smp.h> 38 #include <linux/anon_inodes.h> 39 #include <linux/profile.h> 40 #include <linux/kvm_para.h> 41 #include <linux/pagemap.h> 42 #include <linux/mman.h> 43 #include <linux/swap.h> 44 #include <linux/bitops.h> 45 #include <linux/spinlock.h> 46 47 #include <asm/processor.h> 48 #include <asm/io.h> 49 #include <asm/uaccess.h> 50 #include <asm/pgtable.h> 51 52 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 53 #include "coalesced_mmio.h" 54 #endif 55 56 #ifdef KVM_CAP_DEVICE_ASSIGNMENT 57 #include <linux/pci.h> 58 #include <linux/interrupt.h> 59 #include "irq.h" 60 #endif 61 62 MODULE_AUTHOR("Qumranet"); 63 MODULE_LICENSE("GPL"); 64 65 DEFINE_SPINLOCK(kvm_lock); 66 LIST_HEAD(vm_list); 67 68 static cpumask_var_t cpus_hardware_enabled; 69 70 struct kmem_cache *kvm_vcpu_cache; 71 EXPORT_SYMBOL_GPL(kvm_vcpu_cache); 72 73 static __read_mostly struct preempt_ops kvm_preempt_ops; 74 75 struct dentry *kvm_debugfs_dir; 76 77 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl, 78 unsigned long arg); 79 80 static bool kvm_rebooting; 81 82 #ifdef KVM_CAP_DEVICE_ASSIGNMENT 83 static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, 84 int assigned_dev_id) 85 { 86 struct list_head *ptr; 87 struct kvm_assigned_dev_kernel *match; 88 89 list_for_each(ptr, head) { 90 match = list_entry(ptr, struct kvm_assigned_dev_kernel, list); 91 if (match->assigned_dev_id == assigned_dev_id) 92 return match; 93 } 94 return NULL; 95 } 96 97 static int find_index_from_host_irq(struct kvm_assigned_dev_kernel 98 *assigned_dev, int irq) 99 { 100 int i, index; 101 struct msix_entry *host_msix_entries; 102 103 host_msix_entries = assigned_dev->host_msix_entries; 104 105 index = -1; 106 for (i = 0; i < assigned_dev->entries_nr; i++) 107 if (irq == host_msix_entries[i].vector) { 108 index = i; 109 break; 110 } 111 if (index < 0) { 112 printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); 113 return 0; 114 } 115 116 return index; 117 } 118 119 static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) 120 { 121 struct kvm_assigned_dev_kernel *assigned_dev; 122 struct kvm *kvm; 123 int irq, i; 124 125 assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, 126 interrupt_work); 127 kvm = assigned_dev->kvm; 128 129 /* This is taken to safely inject irq inside the guest. 
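	 * (Descriptive note, added: the handler runs from a workqueue, i.e.
	 * in process context, so sleeping on kvm->lock below is safe.)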
	 * When the interrupt injection (or the ioapic code) uses a
	 * finer-grained lock, update this
	 */
	mutex_lock(&kvm->lock);
	spin_lock_irq(&assigned_dev->assigned_dev_lock);
	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
		struct kvm_guest_msix_entry *guest_entries =
			assigned_dev->guest_msix_entries;
		for (i = 0; i < assigned_dev->entries_nr; i++) {
			if (!(guest_entries[i].flags &
					KVM_ASSIGNED_MSIX_PENDING))
				continue;
			guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING;
			kvm_set_irq(assigned_dev->kvm,
				    assigned_dev->irq_source_id,
				    guest_entries[i].vector, 1);
			irq = assigned_dev->host_msix_entries[i].vector;
			if (irq != 0)
				enable_irq(irq);
			assigned_dev->host_irq_disabled = false;
		}
	} else {
		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
			    assigned_dev->guest_irq, 1);
		if (assigned_dev->irq_requested_type &
				KVM_DEV_IRQ_GUEST_MSI) {
			enable_irq(assigned_dev->host_irq);
			assigned_dev->host_irq_disabled = false;
		}
	}

	spin_unlock_irq(&assigned_dev->assigned_dev_lock);
	mutex_unlock(&assigned_dev->kvm->lock);
}

static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
{
	unsigned long flags;
	struct kvm_assigned_dev_kernel *assigned_dev =
		(struct kvm_assigned_dev_kernel *) dev_id;

	spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags);
	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
		int index = find_index_from_host_irq(assigned_dev, irq);
		if (index < 0)
			goto out;
		assigned_dev->guest_msix_entries[index].flags |=
			KVM_ASSIGNED_MSIX_PENDING;
	}

	schedule_work(&assigned_dev->interrupt_work);

	disable_irq_nosync(irq);
	assigned_dev->host_irq_disabled = true;

out:
	spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
	return IRQ_HANDLED;
}

/* Ack the irq line for an assigned device */
static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
{
	struct kvm_assigned_dev_kernel *dev;
	unsigned long flags;

	if (kian->gsi == -1)
		return;

	dev = container_of(kian, struct kvm_assigned_dev_kernel,
			   ack_notifier);

	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);

	/* The guest irq may be shared so this ack may be
	 * from another device.
	 */
	spin_lock_irqsave(&dev->assigned_dev_lock, flags);
	if (dev->host_irq_disabled) {
		enable_irq(dev->host_irq);
		dev->host_irq_disabled = false;
	}
	spin_unlock_irqrestore(&dev->assigned_dev_lock, flags);
}

static void deassign_guest_irq(struct kvm *kvm,
			       struct kvm_assigned_dev_kernel *assigned_dev)
{
	kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier);
	assigned_dev->ack_notifier.gsi = -1;

	if (assigned_dev->irq_source_id != -1)
		kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
	assigned_dev->irq_source_id = -1;
	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
}

/* The function implicitly holds the kvm->lock mutex due to cancel_work_sync() */
static void deassign_host_irq(struct kvm *kvm,
			      struct kvm_assigned_dev_kernel *assigned_dev)
{
	/*
	 * In kvm_free_device_irq, cancel_work_sync() returns true if:
	 * 1. the work was scheduled and has been cancelled, or
	 * 2. the work callback has already run.
	 *
	 * The first case guarantees that the irq is disabled and no more
	 * events will arrive.  In the second case the irq may have been
	 * re-enabled (e.g. for MSI), so we disable it here to prevent
	 * further events.
	 *
	 * Note that this may result in a nested disable if the interrupt
	 * type is INTx, which is fine because we are about to free it.
	 *
	 * If this function is called as part of VM destruction, make sure
	 * the kvm state is still valid at this point, since we may also
	 * have to wait for interrupt_work to finish.
	 */
	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
		int i;
		for (i = 0; i < assigned_dev->entries_nr; i++)
			disable_irq_nosync(assigned_dev->
					   host_msix_entries[i].vector);

		cancel_work_sync(&assigned_dev->interrupt_work);

		for (i = 0; i < assigned_dev->entries_nr; i++)
			free_irq(assigned_dev->host_msix_entries[i].vector,
				 (void *)assigned_dev);

		assigned_dev->entries_nr = 0;
		kfree(assigned_dev->host_msix_entries);
		kfree(assigned_dev->guest_msix_entries);
		pci_disable_msix(assigned_dev->dev);
	} else {
		/* Deal with MSI and INTx */
		disable_irq_nosync(assigned_dev->host_irq);
		cancel_work_sync(&assigned_dev->interrupt_work);

		free_irq(assigned_dev->host_irq, (void *)assigned_dev);

		if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
			pci_disable_msi(assigned_dev->dev);
	}

	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
}

static int kvm_deassign_irq(struct kvm *kvm,
			    struct kvm_assigned_dev_kernel *assigned_dev,
			    unsigned long irq_requested_type)
{
	unsigned long guest_irq_type, host_irq_type;

	if (!irqchip_in_kernel(kvm))
		return -EINVAL;
	/* no irq assignment to deassign */
	if (!assigned_dev->irq_requested_type)
		return -ENXIO;

	host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
	guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;

	if (host_irq_type)
		deassign_host_irq(kvm, assigned_dev);
	if (guest_irq_type)
		deassign_guest_irq(kvm, assigned_dev);

	return 0;
}

static void kvm_free_assigned_irq(struct kvm *kvm,
				  struct kvm_assigned_dev_kernel *assigned_dev)
{
	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
}

static void kvm_free_assigned_device(struct kvm *kvm,
				     struct kvm_assigned_dev_kernel
				     *assigned_dev)
{
	kvm_free_assigned_irq(kvm, assigned_dev);

	pci_reset_function(assigned_dev->dev);

	pci_release_regions(assigned_dev->dev);
	pci_disable_device(assigned_dev->dev);
	pci_dev_put(assigned_dev->dev);

	list_del(&assigned_dev->list);
	kfree(assigned_dev);
}

void kvm_free_all_assigned_devices(struct kvm *kvm)
{
	struct list_head *ptr, *ptr2;
	struct kvm_assigned_dev_kernel *assigned_dev;

	list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
		assigned_dev = list_entry(ptr,
					  struct kvm_assigned_dev_kernel,
					  list);

		kvm_free_assigned_device(kvm, assigned_dev);
	}
}

static int assigned_device_enable_host_intx(struct kvm *kvm,
					    struct kvm_assigned_dev_kernel *dev)
{
	dev->host_irq = dev->dev->irq;
	/* Even though this is PCI, we don't want to use shared
	 * interrupts. Sharing host devices with guest-assigned devices
	 * on the same interrupt line is not a happy situation: there
	 * are going to be long delays in accepting, acking, etc.
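	 * Passing 0 instead of IRQF_SHARED below makes the line exclusive;
	 * request_irq() will simply fail if the irq is already in use.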
344 */ 345 if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 346 0, "kvm_assigned_intx_device", (void *)dev)) 347 return -EIO; 348 return 0; 349 } 350 351 #ifdef __KVM_HAVE_MSI 352 static int assigned_device_enable_host_msi(struct kvm *kvm, 353 struct kvm_assigned_dev_kernel *dev) 354 { 355 int r; 356 357 if (!dev->dev->msi_enabled) { 358 r = pci_enable_msi(dev->dev); 359 if (r) 360 return r; 361 } 362 363 dev->host_irq = dev->dev->irq; 364 if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0, 365 "kvm_assigned_msi_device", (void *)dev)) { 366 pci_disable_msi(dev->dev); 367 return -EIO; 368 } 369 370 return 0; 371 } 372 #endif 373 374 #ifdef __KVM_HAVE_MSIX 375 static int assigned_device_enable_host_msix(struct kvm *kvm, 376 struct kvm_assigned_dev_kernel *dev) 377 { 378 int i, r = -EINVAL; 379 380 /* host_msix_entries and guest_msix_entries should have been 381 * initialized */ 382 if (dev->entries_nr == 0) 383 return r; 384 385 r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr); 386 if (r) 387 return r; 388 389 for (i = 0; i < dev->entries_nr; i++) { 390 r = request_irq(dev->host_msix_entries[i].vector, 391 kvm_assigned_dev_intr, 0, 392 "kvm_assigned_msix_device", 393 (void *)dev); 394 /* FIXME: free requested_irq's on failure */ 395 if (r) 396 return r; 397 } 398 399 return 0; 400 } 401 402 #endif 403 404 static int assigned_device_enable_guest_intx(struct kvm *kvm, 405 struct kvm_assigned_dev_kernel *dev, 406 struct kvm_assigned_irq *irq) 407 { 408 dev->guest_irq = irq->guest_irq; 409 dev->ack_notifier.gsi = irq->guest_irq; 410 return 0; 411 } 412 413 #ifdef __KVM_HAVE_MSI 414 static int assigned_device_enable_guest_msi(struct kvm *kvm, 415 struct kvm_assigned_dev_kernel *dev, 416 struct kvm_assigned_irq *irq) 417 { 418 dev->guest_irq = irq->guest_irq; 419 dev->ack_notifier.gsi = -1; 420 return 0; 421 } 422 #endif 423 #ifdef __KVM_HAVE_MSIX 424 static int assigned_device_enable_guest_msix(struct kvm *kvm, 425 struct kvm_assigned_dev_kernel *dev, 426 struct kvm_assigned_irq *irq) 427 { 428 dev->guest_irq = irq->guest_irq; 429 dev->ack_notifier.gsi = -1; 430 return 0; 431 } 432 #endif 433 434 static int assign_host_irq(struct kvm *kvm, 435 struct kvm_assigned_dev_kernel *dev, 436 __u32 host_irq_type) 437 { 438 int r = -EEXIST; 439 440 if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) 441 return r; 442 443 switch (host_irq_type) { 444 case KVM_DEV_IRQ_HOST_INTX: 445 r = assigned_device_enable_host_intx(kvm, dev); 446 break; 447 #ifdef __KVM_HAVE_MSI 448 case KVM_DEV_IRQ_HOST_MSI: 449 r = assigned_device_enable_host_msi(kvm, dev); 450 break; 451 #endif 452 #ifdef __KVM_HAVE_MSIX 453 case KVM_DEV_IRQ_HOST_MSIX: 454 r = assigned_device_enable_host_msix(kvm, dev); 455 break; 456 #endif 457 default: 458 r = -EINVAL; 459 } 460 461 if (!r) 462 dev->irq_requested_type |= host_irq_type; 463 464 return r; 465 } 466 467 static int assign_guest_irq(struct kvm *kvm, 468 struct kvm_assigned_dev_kernel *dev, 469 struct kvm_assigned_irq *irq, 470 unsigned long guest_irq_type) 471 { 472 int id; 473 int r = -EEXIST; 474 475 if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK) 476 return r; 477 478 id = kvm_request_irq_source_id(kvm); 479 if (id < 0) 480 return id; 481 482 dev->irq_source_id = id; 483 484 switch (guest_irq_type) { 485 case KVM_DEV_IRQ_GUEST_INTX: 486 r = assigned_device_enable_guest_intx(kvm, dev, irq); 487 break; 488 #ifdef __KVM_HAVE_MSI 489 case KVM_DEV_IRQ_GUEST_MSI: 490 r = assigned_device_enable_guest_msi(kvm, dev, irq); 491 break; 492 #endif 493 #ifdef 
__KVM_HAVE_MSIX 494 case KVM_DEV_IRQ_GUEST_MSIX: 495 r = assigned_device_enable_guest_msix(kvm, dev, irq); 496 break; 497 #endif 498 default: 499 r = -EINVAL; 500 } 501 502 if (!r) { 503 dev->irq_requested_type |= guest_irq_type; 504 kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); 505 } else 506 kvm_free_irq_source_id(kvm, dev->irq_source_id); 507 508 return r; 509 } 510 511 /* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ 512 static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, 513 struct kvm_assigned_irq *assigned_irq) 514 { 515 int r = -EINVAL; 516 struct kvm_assigned_dev_kernel *match; 517 unsigned long host_irq_type, guest_irq_type; 518 519 if (!capable(CAP_SYS_RAWIO)) 520 return -EPERM; 521 522 if (!irqchip_in_kernel(kvm)) 523 return r; 524 525 mutex_lock(&kvm->lock); 526 r = -ENODEV; 527 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 528 assigned_irq->assigned_dev_id); 529 if (!match) 530 goto out; 531 532 host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK); 533 guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK); 534 535 r = -EINVAL; 536 /* can only assign one type at a time */ 537 if (hweight_long(host_irq_type) > 1) 538 goto out; 539 if (hweight_long(guest_irq_type) > 1) 540 goto out; 541 if (host_irq_type == 0 && guest_irq_type == 0) 542 goto out; 543 544 r = 0; 545 if (host_irq_type) 546 r = assign_host_irq(kvm, match, host_irq_type); 547 if (r) 548 goto out; 549 550 if (guest_irq_type) 551 r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type); 552 out: 553 mutex_unlock(&kvm->lock); 554 return r; 555 } 556 557 static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, 558 struct kvm_assigned_irq 559 *assigned_irq) 560 { 561 int r = -ENODEV; 562 struct kvm_assigned_dev_kernel *match; 563 564 mutex_lock(&kvm->lock); 565 566 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 567 assigned_irq->assigned_dev_id); 568 if (!match) 569 goto out; 570 571 r = kvm_deassign_irq(kvm, match, assigned_irq->flags); 572 out: 573 mutex_unlock(&kvm->lock); 574 return r; 575 } 576 577 static int kvm_vm_ioctl_assign_device(struct kvm *kvm, 578 struct kvm_assigned_pci_dev *assigned_dev) 579 { 580 int r = 0; 581 struct kvm_assigned_dev_kernel *match; 582 struct pci_dev *dev; 583 584 down_read(&kvm->slots_lock); 585 mutex_lock(&kvm->lock); 586 587 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 588 assigned_dev->assigned_dev_id); 589 if (match) { 590 /* device already assigned */ 591 r = -EEXIST; 592 goto out; 593 } 594 595 match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); 596 if (match == NULL) { 597 printk(KERN_INFO "%s: Couldn't allocate memory\n", 598 __func__); 599 r = -ENOMEM; 600 goto out; 601 } 602 dev = pci_get_bus_and_slot(assigned_dev->busnr, 603 assigned_dev->devfn); 604 if (!dev) { 605 printk(KERN_INFO "%s: host device not found\n", __func__); 606 r = -EINVAL; 607 goto out_free; 608 } 609 if (pci_enable_device(dev)) { 610 printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); 611 r = -EBUSY; 612 goto out_put; 613 } 614 r = pci_request_regions(dev, "kvm_assigned_device"); 615 if (r) { 616 printk(KERN_INFO "%s: Could not get access to device regions\n", 617 __func__); 618 goto out_disable; 619 } 620 621 pci_reset_function(dev); 622 623 match->assigned_dev_id = assigned_dev->assigned_dev_id; 624 match->host_busnr = assigned_dev->busnr; 625 match->host_devfn = assigned_dev->devfn; 626 match->flags = assigned_dev->flags; 627 match->dev = dev; 628 spin_lock_init(&match->assigned_dev_lock); 629 
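	/* No irq source id has been allocated for this device yet. */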
match->irq_source_id = -1; 630 match->kvm = kvm; 631 match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; 632 INIT_WORK(&match->interrupt_work, 633 kvm_assigned_dev_interrupt_work_handler); 634 635 list_add(&match->list, &kvm->arch.assigned_dev_head); 636 637 if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { 638 if (!kvm->arch.iommu_domain) { 639 r = kvm_iommu_map_guest(kvm); 640 if (r) 641 goto out_list_del; 642 } 643 r = kvm_assign_device(kvm, match); 644 if (r) 645 goto out_list_del; 646 } 647 648 out: 649 mutex_unlock(&kvm->lock); 650 up_read(&kvm->slots_lock); 651 return r; 652 out_list_del: 653 list_del(&match->list); 654 pci_release_regions(dev); 655 out_disable: 656 pci_disable_device(dev); 657 out_put: 658 pci_dev_put(dev); 659 out_free: 660 kfree(match); 661 mutex_unlock(&kvm->lock); 662 up_read(&kvm->slots_lock); 663 return r; 664 } 665 #endif 666 667 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT 668 static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, 669 struct kvm_assigned_pci_dev *assigned_dev) 670 { 671 int r = 0; 672 struct kvm_assigned_dev_kernel *match; 673 674 mutex_lock(&kvm->lock); 675 676 match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 677 assigned_dev->assigned_dev_id); 678 if (!match) { 679 printk(KERN_INFO "%s: device hasn't been assigned before, " 680 "so cannot be deassigned\n", __func__); 681 r = -EINVAL; 682 goto out; 683 } 684 685 if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) 686 kvm_deassign_device(kvm, match); 687 688 kvm_free_assigned_device(kvm, match); 689 690 out: 691 mutex_unlock(&kvm->lock); 692 return r; 693 } 694 #endif 695 696 static inline int valid_vcpu(int n) 697 { 698 return likely(n >= 0 && n < KVM_MAX_VCPUS); 699 } 700 701 inline int kvm_is_mmio_pfn(pfn_t pfn) 702 { 703 if (pfn_valid(pfn)) { 704 struct page *page = compound_head(pfn_to_page(pfn)); 705 return PageReserved(page); 706 } 707 708 return true; 709 } 710 711 /* 712 * Switches to specified vcpu, until a matching vcpu_put() 713 */ 714 void vcpu_load(struct kvm_vcpu *vcpu) 715 { 716 int cpu; 717 718 mutex_lock(&vcpu->mutex); 719 cpu = get_cpu(); 720 preempt_notifier_register(&vcpu->preempt_notifier); 721 kvm_arch_vcpu_load(vcpu, cpu); 722 put_cpu(); 723 } 724 725 void vcpu_put(struct kvm_vcpu *vcpu) 726 { 727 preempt_disable(); 728 kvm_arch_vcpu_put(vcpu); 729 preempt_notifier_unregister(&vcpu->preempt_notifier); 730 preempt_enable(); 731 mutex_unlock(&vcpu->mutex); 732 } 733 734 static void ack_flush(void *_completed) 735 { 736 } 737 738 static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) 739 { 740 int i, cpu, me; 741 cpumask_var_t cpus; 742 bool called = true; 743 struct kvm_vcpu *vcpu; 744 745 if (alloc_cpumask_var(&cpus, GFP_ATOMIC)) 746 cpumask_clear(cpus); 747 748 me = get_cpu(); 749 for (i = 0; i < KVM_MAX_VCPUS; ++i) { 750 vcpu = kvm->vcpus[i]; 751 if (!vcpu) 752 continue; 753 if (test_and_set_bit(req, &vcpu->requests)) 754 continue; 755 cpu = vcpu->cpu; 756 if (cpus != NULL && cpu != -1 && cpu != me) 757 cpumask_set_cpu(cpu, cpus); 758 } 759 if (unlikely(cpus == NULL)) 760 smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1); 761 else if (!cpumask_empty(cpus)) 762 smp_call_function_many(cpus, ack_flush, NULL, 1); 763 else 764 called = false; 765 put_cpu(); 766 free_cpumask_var(cpus); 767 return called; 768 } 769 770 void kvm_flush_remote_tlbs(struct kvm *kvm) 771 { 772 if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH)) 773 ++kvm->stat.remote_tlb_flush; 774 } 775 776 void kvm_reload_remote_mmus(struct kvm *kvm) 777 { 778 
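	/* Post KVM_REQ_MMU_RELOAD to every vcpu and kick the ones that are
	 * currently running. */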
make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 779 } 780 781 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 782 { 783 struct page *page; 784 int r; 785 786 mutex_init(&vcpu->mutex); 787 vcpu->cpu = -1; 788 vcpu->kvm = kvm; 789 vcpu->vcpu_id = id; 790 init_waitqueue_head(&vcpu->wq); 791 792 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 793 if (!page) { 794 r = -ENOMEM; 795 goto fail; 796 } 797 vcpu->run = page_address(page); 798 799 r = kvm_arch_vcpu_init(vcpu); 800 if (r < 0) 801 goto fail_free_run; 802 return 0; 803 804 fail_free_run: 805 free_page((unsigned long)vcpu->run); 806 fail: 807 return r; 808 } 809 EXPORT_SYMBOL_GPL(kvm_vcpu_init); 810 811 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) 812 { 813 kvm_arch_vcpu_uninit(vcpu); 814 free_page((unsigned long)vcpu->run); 815 } 816 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit); 817 818 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 819 static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) 820 { 821 return container_of(mn, struct kvm, mmu_notifier); 822 } 823 824 static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, 825 struct mm_struct *mm, 826 unsigned long address) 827 { 828 struct kvm *kvm = mmu_notifier_to_kvm(mn); 829 int need_tlb_flush; 830 831 /* 832 * When ->invalidate_page runs, the linux pte has been zapped 833 * already but the page is still allocated until 834 * ->invalidate_page returns. So if we increase the sequence 835 * here the kvm page fault will notice if the spte can't be 836 * established because the page is going to be freed. If 837 * instead the kvm page fault establishes the spte before 838 * ->invalidate_page runs, kvm_unmap_hva will release it 839 * before returning. 840 * 841 * The sequence increase only need to be seen at spin_unlock 842 * time, and not at spin_lock time. 843 * 844 * Increasing the sequence after the spin_unlock would be 845 * unsafe because the kvm page fault could then establish the 846 * pte after kvm_unmap_hva returned, without noticing the page 847 * is going to be freed. 848 */ 849 spin_lock(&kvm->mmu_lock); 850 kvm->mmu_notifier_seq++; 851 need_tlb_flush = kvm_unmap_hva(kvm, address); 852 spin_unlock(&kvm->mmu_lock); 853 854 /* we've to flush the tlb before the pages can be freed */ 855 if (need_tlb_flush) 856 kvm_flush_remote_tlbs(kvm); 857 858 } 859 860 static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, 861 struct mm_struct *mm, 862 unsigned long start, 863 unsigned long end) 864 { 865 struct kvm *kvm = mmu_notifier_to_kvm(mn); 866 int need_tlb_flush = 0; 867 868 spin_lock(&kvm->mmu_lock); 869 /* 870 * The count increase must become visible at unlock time as no 871 * spte can be established without taking the mmu_lock and 872 * count is also read inside the mmu_lock critical section. 873 */ 874 kvm->mmu_notifier_count++; 875 for (; start < end; start += PAGE_SIZE) 876 need_tlb_flush |= kvm_unmap_hva(kvm, start); 877 spin_unlock(&kvm->mmu_lock); 878 879 /* we've to flush the tlb before the pages can be freed */ 880 if (need_tlb_flush) 881 kvm_flush_remote_tlbs(kvm); 882 } 883 884 static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, 885 struct mm_struct *mm, 886 unsigned long start, 887 unsigned long end) 888 { 889 struct kvm *kvm = mmu_notifier_to_kvm(mn); 890 891 spin_lock(&kvm->mmu_lock); 892 /* 893 * This sequence increase will notify the kvm page fault that 894 * the page that is going to be mapped in the spte could have 895 * been freed. 
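	 * (The page fault path samples mmu_notifier_seq before and after
	 * looking up the page, and retries if it changed in between.)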
	 */
	kvm->mmu_notifier_seq++;
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease, but both values are read by the kvm
	 * page fault under the mmu_lock spinlock, so we don't need to
	 * add an smp_wmb() here in between the two.
	 */
	kvm->mmu_notifier_count--;
	spin_unlock(&kvm->mmu_lock);

	BUG_ON(kvm->mmu_notifier_count < 0);
}

static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young;

	spin_lock(&kvm->mmu_lock);
	young = kvm_age_hva(kvm, address);
	spin_unlock(&kvm->mmu_lock);

	if (young)
		kvm_flush_remote_tlbs(kvm);

	return young;
}

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	kvm_arch_flush_shadow(kvm);
}

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
	.release		= kvm_mmu_notifier_release,
};
#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */

static struct kvm *kvm_create_vm(void)
{
	struct kvm *kvm = kvm_arch_create_vm();
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	struct page *page;
#endif

	if (IS_ERR(kvm))
		goto out;
#ifdef CONFIG_HAVE_KVM_IRQCHIP
	INIT_LIST_HEAD(&kvm->irq_routing);
	INIT_HLIST_HEAD(&kvm->mask_notifier_list);
#endif

#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		kfree(kvm);
		return ERR_PTR(-ENOMEM);
	}
	kvm->coalesced_mmio_ring =
			(struct kvm_coalesced_mmio_ring *)page_address(page);
#endif

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	{
		int err;
		kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
		err = mmu_notifier_register(&kvm->mmu_notifier, current->mm);
		if (err) {
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
			put_page(page);
#endif
			kfree(kvm);
			return ERR_PTR(err);
		}
	}
#endif

	kvm->mm = current->mm;
	atomic_inc(&kvm->mm->mm_count);
	spin_lock_init(&kvm->mmu_lock);
	kvm_io_bus_init(&kvm->pio_bus);
	mutex_init(&kvm->lock);
	kvm_io_bus_init(&kvm->mmio_bus);
	init_rwsem(&kvm->slots_lock);
	atomic_set(&kvm->users_count, 1);
	spin_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	spin_unlock(&kvm_lock);
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	kvm_coalesced_mmio_init(kvm);
#endif
out:
	return kvm;
}

/*
 * Free any memory in @free but not in @dont.
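 * Fields shared with @dont (rmap, dirty_bitmap, lpage_info) are left
 * untouched so the surviving slot can keep using them.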
1002 */ 1003 static void kvm_free_physmem_slot(struct kvm_memory_slot *free, 1004 struct kvm_memory_slot *dont) 1005 { 1006 if (!dont || free->rmap != dont->rmap) 1007 vfree(free->rmap); 1008 1009 if (!dont || free->dirty_bitmap != dont->dirty_bitmap) 1010 vfree(free->dirty_bitmap); 1011 1012 if (!dont || free->lpage_info != dont->lpage_info) 1013 vfree(free->lpage_info); 1014 1015 free->npages = 0; 1016 free->dirty_bitmap = NULL; 1017 free->rmap = NULL; 1018 free->lpage_info = NULL; 1019 } 1020 1021 void kvm_free_physmem(struct kvm *kvm) 1022 { 1023 int i; 1024 1025 for (i = 0; i < kvm->nmemslots; ++i) 1026 kvm_free_physmem_slot(&kvm->memslots[i], NULL); 1027 } 1028 1029 static void kvm_destroy_vm(struct kvm *kvm) 1030 { 1031 struct mm_struct *mm = kvm->mm; 1032 1033 kvm_arch_sync_events(kvm); 1034 spin_lock(&kvm_lock); 1035 list_del(&kvm->vm_list); 1036 spin_unlock(&kvm_lock); 1037 kvm_free_irq_routing(kvm); 1038 kvm_io_bus_destroy(&kvm->pio_bus); 1039 kvm_io_bus_destroy(&kvm->mmio_bus); 1040 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1041 if (kvm->coalesced_mmio_ring != NULL) 1042 free_page((unsigned long)kvm->coalesced_mmio_ring); 1043 #endif 1044 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 1045 mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); 1046 #else 1047 kvm_arch_flush_shadow(kvm); 1048 #endif 1049 kvm_arch_destroy_vm(kvm); 1050 mmdrop(mm); 1051 } 1052 1053 void kvm_get_kvm(struct kvm *kvm) 1054 { 1055 atomic_inc(&kvm->users_count); 1056 } 1057 EXPORT_SYMBOL_GPL(kvm_get_kvm); 1058 1059 void kvm_put_kvm(struct kvm *kvm) 1060 { 1061 if (atomic_dec_and_test(&kvm->users_count)) 1062 kvm_destroy_vm(kvm); 1063 } 1064 EXPORT_SYMBOL_GPL(kvm_put_kvm); 1065 1066 1067 static int kvm_vm_release(struct inode *inode, struct file *filp) 1068 { 1069 struct kvm *kvm = filp->private_data; 1070 1071 kvm_put_kvm(kvm); 1072 return 0; 1073 } 1074 1075 /* 1076 * Allocate some memory and give it an address in the guest physical address 1077 * space. 1078 * 1079 * Discontiguous memory is allowed, mostly for framebuffers. 1080 * 1081 * Must be called holding mmap_sem for write. 1082 */ 1083 int __kvm_set_memory_region(struct kvm *kvm, 1084 struct kvm_userspace_memory_region *mem, 1085 int user_alloc) 1086 { 1087 int r; 1088 gfn_t base_gfn; 1089 unsigned long npages, ugfn; 1090 unsigned long largepages, i; 1091 struct kvm_memory_slot *memslot; 1092 struct kvm_memory_slot old, new; 1093 1094 r = -EINVAL; 1095 /* General sanity checks */ 1096 if (mem->memory_size & (PAGE_SIZE - 1)) 1097 goto out; 1098 if (mem->guest_phys_addr & (PAGE_SIZE - 1)) 1099 goto out; 1100 if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1))) 1101 goto out; 1102 if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS) 1103 goto out; 1104 if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr) 1105 goto out; 1106 1107 memslot = &kvm->memslots[mem->slot]; 1108 base_gfn = mem->guest_phys_addr >> PAGE_SHIFT; 1109 npages = mem->memory_size >> PAGE_SHIFT; 1110 1111 if (!npages) 1112 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES; 1113 1114 new = old = *memslot; 1115 1116 new.base_gfn = base_gfn; 1117 new.npages = npages; 1118 new.flags = mem->flags; 1119 1120 /* Disallow changing a memory slot's size. 
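	 * (npages == 0 means the slot is being deleted, which is allowed;
	 * creating a new slot, where old.npages == 0, is also fine.)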
*/ 1121 r = -EINVAL; 1122 if (npages && old.npages && npages != old.npages) 1123 goto out_free; 1124 1125 /* Check for overlaps */ 1126 r = -EEXIST; 1127 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 1128 struct kvm_memory_slot *s = &kvm->memslots[i]; 1129 1130 if (s == memslot || !s->npages) 1131 continue; 1132 if (!((base_gfn + npages <= s->base_gfn) || 1133 (base_gfn >= s->base_gfn + s->npages))) 1134 goto out_free; 1135 } 1136 1137 /* Free page dirty bitmap if unneeded */ 1138 if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES)) 1139 new.dirty_bitmap = NULL; 1140 1141 r = -ENOMEM; 1142 1143 /* Allocate if a slot is being created */ 1144 #ifndef CONFIG_S390 1145 if (npages && !new.rmap) { 1146 new.rmap = vmalloc(npages * sizeof(struct page *)); 1147 1148 if (!new.rmap) 1149 goto out_free; 1150 1151 memset(new.rmap, 0, npages * sizeof(*new.rmap)); 1152 1153 new.user_alloc = user_alloc; 1154 /* 1155 * hva_to_rmmap() serialzies with the mmu_lock and to be 1156 * safe it has to ignore memslots with !user_alloc && 1157 * !userspace_addr. 1158 */ 1159 if (user_alloc) 1160 new.userspace_addr = mem->userspace_addr; 1161 else 1162 new.userspace_addr = 0; 1163 } 1164 if (npages && !new.lpage_info) { 1165 largepages = 1 + (base_gfn + npages - 1) / KVM_PAGES_PER_HPAGE; 1166 largepages -= base_gfn / KVM_PAGES_PER_HPAGE; 1167 1168 new.lpage_info = vmalloc(largepages * sizeof(*new.lpage_info)); 1169 1170 if (!new.lpage_info) 1171 goto out_free; 1172 1173 memset(new.lpage_info, 0, largepages * sizeof(*new.lpage_info)); 1174 1175 if (base_gfn % KVM_PAGES_PER_HPAGE) 1176 new.lpage_info[0].write_count = 1; 1177 if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE) 1178 new.lpage_info[largepages-1].write_count = 1; 1179 ugfn = new.userspace_addr >> PAGE_SHIFT; 1180 /* 1181 * If the gfn and userspace address are not aligned wrt each 1182 * other, disable large page support for this slot 1183 */ 1184 if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE - 1)) 1185 for (i = 0; i < largepages; ++i) 1186 new.lpage_info[i].write_count = 1; 1187 } 1188 1189 /* Allocate page dirty bitmap if needed */ 1190 if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) { 1191 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8; 1192 1193 new.dirty_bitmap = vmalloc(dirty_bytes); 1194 if (!new.dirty_bitmap) 1195 goto out_free; 1196 memset(new.dirty_bitmap, 0, dirty_bytes); 1197 } 1198 #endif /* not defined CONFIG_S390 */ 1199 1200 if (!npages) 1201 kvm_arch_flush_shadow(kvm); 1202 1203 spin_lock(&kvm->mmu_lock); 1204 if (mem->slot >= kvm->nmemslots) 1205 kvm->nmemslots = mem->slot + 1; 1206 1207 *memslot = new; 1208 spin_unlock(&kvm->mmu_lock); 1209 1210 r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc); 1211 if (r) { 1212 spin_lock(&kvm->mmu_lock); 1213 *memslot = old; 1214 spin_unlock(&kvm->mmu_lock); 1215 goto out_free; 1216 } 1217 1218 kvm_free_physmem_slot(&old, npages ? 
&new : NULL); 1219 /* Slot deletion case: we have to update the current slot */ 1220 spin_lock(&kvm->mmu_lock); 1221 if (!npages) 1222 *memslot = old; 1223 spin_unlock(&kvm->mmu_lock); 1224 #ifdef CONFIG_DMAR 1225 /* map the pages in iommu page table */ 1226 r = kvm_iommu_map_pages(kvm, base_gfn, npages); 1227 if (r) 1228 goto out; 1229 #endif 1230 return 0; 1231 1232 out_free: 1233 kvm_free_physmem_slot(&new, &old); 1234 out: 1235 return r; 1236 1237 } 1238 EXPORT_SYMBOL_GPL(__kvm_set_memory_region); 1239 1240 int kvm_set_memory_region(struct kvm *kvm, 1241 struct kvm_userspace_memory_region *mem, 1242 int user_alloc) 1243 { 1244 int r; 1245 1246 down_write(&kvm->slots_lock); 1247 r = __kvm_set_memory_region(kvm, mem, user_alloc); 1248 up_write(&kvm->slots_lock); 1249 return r; 1250 } 1251 EXPORT_SYMBOL_GPL(kvm_set_memory_region); 1252 1253 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, 1254 struct 1255 kvm_userspace_memory_region *mem, 1256 int user_alloc) 1257 { 1258 if (mem->slot >= KVM_MEMORY_SLOTS) 1259 return -EINVAL; 1260 return kvm_set_memory_region(kvm, mem, user_alloc); 1261 } 1262 1263 int kvm_get_dirty_log(struct kvm *kvm, 1264 struct kvm_dirty_log *log, int *is_dirty) 1265 { 1266 struct kvm_memory_slot *memslot; 1267 int r, i; 1268 int n; 1269 unsigned long any = 0; 1270 1271 r = -EINVAL; 1272 if (log->slot >= KVM_MEMORY_SLOTS) 1273 goto out; 1274 1275 memslot = &kvm->memslots[log->slot]; 1276 r = -ENOENT; 1277 if (!memslot->dirty_bitmap) 1278 goto out; 1279 1280 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 1281 1282 for (i = 0; !any && i < n/sizeof(long); ++i) 1283 any = memslot->dirty_bitmap[i]; 1284 1285 r = -EFAULT; 1286 if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) 1287 goto out; 1288 1289 if (any) 1290 *is_dirty = 1; 1291 1292 r = 0; 1293 out: 1294 return r; 1295 } 1296 1297 int is_error_page(struct page *page) 1298 { 1299 return page == bad_page; 1300 } 1301 EXPORT_SYMBOL_GPL(is_error_page); 1302 1303 int is_error_pfn(pfn_t pfn) 1304 { 1305 return pfn == bad_pfn; 1306 } 1307 EXPORT_SYMBOL_GPL(is_error_pfn); 1308 1309 static inline unsigned long bad_hva(void) 1310 { 1311 return PAGE_OFFSET; 1312 } 1313 1314 int kvm_is_error_hva(unsigned long addr) 1315 { 1316 return addr == bad_hva(); 1317 } 1318 EXPORT_SYMBOL_GPL(kvm_is_error_hva); 1319 1320 struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn) 1321 { 1322 int i; 1323 1324 for (i = 0; i < kvm->nmemslots; ++i) { 1325 struct kvm_memory_slot *memslot = &kvm->memslots[i]; 1326 1327 if (gfn >= memslot->base_gfn 1328 && gfn < memslot->base_gfn + memslot->npages) 1329 return memslot; 1330 } 1331 return NULL; 1332 } 1333 EXPORT_SYMBOL_GPL(gfn_to_memslot_unaliased); 1334 1335 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 1336 { 1337 gfn = unalias_gfn(kvm, gfn); 1338 return gfn_to_memslot_unaliased(kvm, gfn); 1339 } 1340 1341 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) 1342 { 1343 int i; 1344 1345 gfn = unalias_gfn(kvm, gfn); 1346 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 1347 struct kvm_memory_slot *memslot = &kvm->memslots[i]; 1348 1349 if (gfn >= memslot->base_gfn 1350 && gfn < memslot->base_gfn + memslot->npages) 1351 return 1; 1352 } 1353 return 0; 1354 } 1355 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); 1356 1357 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn) 1358 { 1359 struct kvm_memory_slot *slot; 1360 1361 gfn = unalias_gfn(kvm, gfn); 1362 slot = gfn_to_memslot_unaliased(kvm, gfn); 1363 if (!slot) 1364 return bad_hva(); 1365 return 
(slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE); 1366 } 1367 EXPORT_SYMBOL_GPL(gfn_to_hva); 1368 1369 pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn) 1370 { 1371 struct page *page[1]; 1372 unsigned long addr; 1373 int npages; 1374 pfn_t pfn; 1375 1376 might_sleep(); 1377 1378 addr = gfn_to_hva(kvm, gfn); 1379 if (kvm_is_error_hva(addr)) { 1380 get_page(bad_page); 1381 return page_to_pfn(bad_page); 1382 } 1383 1384 npages = get_user_pages_fast(addr, 1, 1, page); 1385 1386 if (unlikely(npages != 1)) { 1387 struct vm_area_struct *vma; 1388 1389 down_read(¤t->mm->mmap_sem); 1390 vma = find_vma(current->mm, addr); 1391 1392 if (vma == NULL || addr < vma->vm_start || 1393 !(vma->vm_flags & VM_PFNMAP)) { 1394 up_read(¤t->mm->mmap_sem); 1395 get_page(bad_page); 1396 return page_to_pfn(bad_page); 1397 } 1398 1399 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 1400 up_read(¤t->mm->mmap_sem); 1401 BUG_ON(!kvm_is_mmio_pfn(pfn)); 1402 } else 1403 pfn = page_to_pfn(page[0]); 1404 1405 return pfn; 1406 } 1407 1408 EXPORT_SYMBOL_GPL(gfn_to_pfn); 1409 1410 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) 1411 { 1412 pfn_t pfn; 1413 1414 pfn = gfn_to_pfn(kvm, gfn); 1415 if (!kvm_is_mmio_pfn(pfn)) 1416 return pfn_to_page(pfn); 1417 1418 WARN_ON(kvm_is_mmio_pfn(pfn)); 1419 1420 get_page(bad_page); 1421 return bad_page; 1422 } 1423 1424 EXPORT_SYMBOL_GPL(gfn_to_page); 1425 1426 void kvm_release_page_clean(struct page *page) 1427 { 1428 kvm_release_pfn_clean(page_to_pfn(page)); 1429 } 1430 EXPORT_SYMBOL_GPL(kvm_release_page_clean); 1431 1432 void kvm_release_pfn_clean(pfn_t pfn) 1433 { 1434 if (!kvm_is_mmio_pfn(pfn)) 1435 put_page(pfn_to_page(pfn)); 1436 } 1437 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); 1438 1439 void kvm_release_page_dirty(struct page *page) 1440 { 1441 kvm_release_pfn_dirty(page_to_pfn(page)); 1442 } 1443 EXPORT_SYMBOL_GPL(kvm_release_page_dirty); 1444 1445 void kvm_release_pfn_dirty(pfn_t pfn) 1446 { 1447 kvm_set_pfn_dirty(pfn); 1448 kvm_release_pfn_clean(pfn); 1449 } 1450 EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); 1451 1452 void kvm_set_page_dirty(struct page *page) 1453 { 1454 kvm_set_pfn_dirty(page_to_pfn(page)); 1455 } 1456 EXPORT_SYMBOL_GPL(kvm_set_page_dirty); 1457 1458 void kvm_set_pfn_dirty(pfn_t pfn) 1459 { 1460 if (!kvm_is_mmio_pfn(pfn)) { 1461 struct page *page = pfn_to_page(pfn); 1462 if (!PageReserved(page)) 1463 SetPageDirty(page); 1464 } 1465 } 1466 EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); 1467 1468 void kvm_set_pfn_accessed(pfn_t pfn) 1469 { 1470 if (!kvm_is_mmio_pfn(pfn)) 1471 mark_page_accessed(pfn_to_page(pfn)); 1472 } 1473 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); 1474 1475 void kvm_get_pfn(pfn_t pfn) 1476 { 1477 if (!kvm_is_mmio_pfn(pfn)) 1478 get_page(pfn_to_page(pfn)); 1479 } 1480 EXPORT_SYMBOL_GPL(kvm_get_pfn); 1481 1482 static int next_segment(unsigned long len, int offset) 1483 { 1484 if (len > PAGE_SIZE - offset) 1485 return PAGE_SIZE - offset; 1486 else 1487 return len; 1488 } 1489 1490 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, 1491 int len) 1492 { 1493 int r; 1494 unsigned long addr; 1495 1496 addr = gfn_to_hva(kvm, gfn); 1497 if (kvm_is_error_hva(addr)) 1498 return -EFAULT; 1499 r = copy_from_user(data, (void __user *)addr + offset, len); 1500 if (r) 1501 return -EFAULT; 1502 return 0; 1503 } 1504 EXPORT_SYMBOL_GPL(kvm_read_guest_page); 1505 1506 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len) 1507 { 1508 gfn_t gfn = gpa >> PAGE_SHIFT; 1509 int seg; 1510 int offset = 
offset_in_page(gpa); 1511 int ret; 1512 1513 while ((seg = next_segment(len, offset)) != 0) { 1514 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg); 1515 if (ret < 0) 1516 return ret; 1517 offset = 0; 1518 len -= seg; 1519 data += seg; 1520 ++gfn; 1521 } 1522 return 0; 1523 } 1524 EXPORT_SYMBOL_GPL(kvm_read_guest); 1525 1526 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, 1527 unsigned long len) 1528 { 1529 int r; 1530 unsigned long addr; 1531 gfn_t gfn = gpa >> PAGE_SHIFT; 1532 int offset = offset_in_page(gpa); 1533 1534 addr = gfn_to_hva(kvm, gfn); 1535 if (kvm_is_error_hva(addr)) 1536 return -EFAULT; 1537 pagefault_disable(); 1538 r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len); 1539 pagefault_enable(); 1540 if (r) 1541 return -EFAULT; 1542 return 0; 1543 } 1544 EXPORT_SYMBOL(kvm_read_guest_atomic); 1545 1546 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, 1547 int offset, int len) 1548 { 1549 int r; 1550 unsigned long addr; 1551 1552 addr = gfn_to_hva(kvm, gfn); 1553 if (kvm_is_error_hva(addr)) 1554 return -EFAULT; 1555 r = copy_to_user((void __user *)addr + offset, data, len); 1556 if (r) 1557 return -EFAULT; 1558 mark_page_dirty(kvm, gfn); 1559 return 0; 1560 } 1561 EXPORT_SYMBOL_GPL(kvm_write_guest_page); 1562 1563 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, 1564 unsigned long len) 1565 { 1566 gfn_t gfn = gpa >> PAGE_SHIFT; 1567 int seg; 1568 int offset = offset_in_page(gpa); 1569 int ret; 1570 1571 while ((seg = next_segment(len, offset)) != 0) { 1572 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg); 1573 if (ret < 0) 1574 return ret; 1575 offset = 0; 1576 len -= seg; 1577 data += seg; 1578 ++gfn; 1579 } 1580 return 0; 1581 } 1582 1583 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) 1584 { 1585 return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len); 1586 } 1587 EXPORT_SYMBOL_GPL(kvm_clear_guest_page); 1588 1589 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len) 1590 { 1591 gfn_t gfn = gpa >> PAGE_SHIFT; 1592 int seg; 1593 int offset = offset_in_page(gpa); 1594 int ret; 1595 1596 while ((seg = next_segment(len, offset)) != 0) { 1597 ret = kvm_clear_guest_page(kvm, gfn, offset, seg); 1598 if (ret < 0) 1599 return ret; 1600 offset = 0; 1601 len -= seg; 1602 ++gfn; 1603 } 1604 return 0; 1605 } 1606 EXPORT_SYMBOL_GPL(kvm_clear_guest); 1607 1608 void mark_page_dirty(struct kvm *kvm, gfn_t gfn) 1609 { 1610 struct kvm_memory_slot *memslot; 1611 1612 gfn = unalias_gfn(kvm, gfn); 1613 memslot = gfn_to_memslot_unaliased(kvm, gfn); 1614 if (memslot && memslot->dirty_bitmap) { 1615 unsigned long rel_gfn = gfn - memslot->base_gfn; 1616 1617 /* avoid RMW */ 1618 if (!test_bit(rel_gfn, memslot->dirty_bitmap)) 1619 set_bit(rel_gfn, memslot->dirty_bitmap); 1620 } 1621 } 1622 1623 /* 1624 * The vCPU has executed a HLT instruction with in-kernel mode enabled. 
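 * Sleep on the vcpu wait queue until an interrupt, a pending timer, or
 * a signal makes the vcpu runnable again.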
1625 */ 1626 void kvm_vcpu_block(struct kvm_vcpu *vcpu) 1627 { 1628 DEFINE_WAIT(wait); 1629 1630 for (;;) { 1631 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 1632 1633 if ((kvm_arch_interrupt_allowed(vcpu) && 1634 kvm_cpu_has_interrupt(vcpu)) || 1635 kvm_arch_vcpu_runnable(vcpu)) { 1636 set_bit(KVM_REQ_UNHALT, &vcpu->requests); 1637 break; 1638 } 1639 if (kvm_cpu_has_pending_timer(vcpu)) 1640 break; 1641 if (signal_pending(current)) 1642 break; 1643 1644 vcpu_put(vcpu); 1645 schedule(); 1646 vcpu_load(vcpu); 1647 } 1648 1649 finish_wait(&vcpu->wq, &wait); 1650 } 1651 1652 void kvm_resched(struct kvm_vcpu *vcpu) 1653 { 1654 if (!need_resched()) 1655 return; 1656 cond_resched(); 1657 } 1658 EXPORT_SYMBOL_GPL(kvm_resched); 1659 1660 static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1661 { 1662 struct kvm_vcpu *vcpu = vma->vm_file->private_data; 1663 struct page *page; 1664 1665 if (vmf->pgoff == 0) 1666 page = virt_to_page(vcpu->run); 1667 #ifdef CONFIG_X86 1668 else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) 1669 page = virt_to_page(vcpu->arch.pio_data); 1670 #endif 1671 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 1672 else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) 1673 page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); 1674 #endif 1675 else 1676 return VM_FAULT_SIGBUS; 1677 get_page(page); 1678 vmf->page = page; 1679 return 0; 1680 } 1681 1682 static struct vm_operations_struct kvm_vcpu_vm_ops = { 1683 .fault = kvm_vcpu_fault, 1684 }; 1685 1686 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma) 1687 { 1688 vma->vm_ops = &kvm_vcpu_vm_ops; 1689 return 0; 1690 } 1691 1692 static int kvm_vcpu_release(struct inode *inode, struct file *filp) 1693 { 1694 struct kvm_vcpu *vcpu = filp->private_data; 1695 1696 kvm_put_kvm(vcpu->kvm); 1697 return 0; 1698 } 1699 1700 static struct file_operations kvm_vcpu_fops = { 1701 .release = kvm_vcpu_release, 1702 .unlocked_ioctl = kvm_vcpu_ioctl, 1703 .compat_ioctl = kvm_vcpu_ioctl, 1704 .mmap = kvm_vcpu_mmap, 1705 }; 1706 1707 /* 1708 * Allocates an inode for the vcpu. 1709 */ 1710 static int create_vcpu_fd(struct kvm_vcpu *vcpu) 1711 { 1712 int fd = anon_inode_getfd("kvm-vcpu", &kvm_vcpu_fops, vcpu, 0); 1713 if (fd < 0) 1714 kvm_put_kvm(vcpu->kvm); 1715 return fd; 1716 } 1717 1718 /* 1719 * Creates some virtual cpus. Good luck creating more than one. 
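 * The vcpu id n comes from userspace and must be in [0, KVM_MAX_VCPUS).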
1720 */ 1721 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n) 1722 { 1723 int r; 1724 struct kvm_vcpu *vcpu; 1725 1726 if (!valid_vcpu(n)) 1727 return -EINVAL; 1728 1729 vcpu = kvm_arch_vcpu_create(kvm, n); 1730 if (IS_ERR(vcpu)) 1731 return PTR_ERR(vcpu); 1732 1733 preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); 1734 1735 r = kvm_arch_vcpu_setup(vcpu); 1736 if (r) 1737 return r; 1738 1739 mutex_lock(&kvm->lock); 1740 if (kvm->vcpus[n]) { 1741 r = -EEXIST; 1742 goto vcpu_destroy; 1743 } 1744 kvm->vcpus[n] = vcpu; 1745 mutex_unlock(&kvm->lock); 1746 1747 /* Now it's all set up, let userspace reach it */ 1748 kvm_get_kvm(kvm); 1749 r = create_vcpu_fd(vcpu); 1750 if (r < 0) 1751 goto unlink; 1752 return r; 1753 1754 unlink: 1755 mutex_lock(&kvm->lock); 1756 kvm->vcpus[n] = NULL; 1757 vcpu_destroy: 1758 mutex_unlock(&kvm->lock); 1759 kvm_arch_vcpu_destroy(vcpu); 1760 return r; 1761 } 1762 1763 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset) 1764 { 1765 if (sigset) { 1766 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP)); 1767 vcpu->sigset_active = 1; 1768 vcpu->sigset = *sigset; 1769 } else 1770 vcpu->sigset_active = 0; 1771 return 0; 1772 } 1773 1774 #ifdef __KVM_HAVE_MSIX 1775 static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, 1776 struct kvm_assigned_msix_nr *entry_nr) 1777 { 1778 int r = 0; 1779 struct kvm_assigned_dev_kernel *adev; 1780 1781 mutex_lock(&kvm->lock); 1782 1783 adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 1784 entry_nr->assigned_dev_id); 1785 if (!adev) { 1786 r = -EINVAL; 1787 goto msix_nr_out; 1788 } 1789 1790 if (adev->entries_nr == 0) { 1791 adev->entries_nr = entry_nr->entry_nr; 1792 if (adev->entries_nr == 0 || 1793 adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) { 1794 r = -EINVAL; 1795 goto msix_nr_out; 1796 } 1797 1798 adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) * 1799 entry_nr->entry_nr, 1800 GFP_KERNEL); 1801 if (!adev->host_msix_entries) { 1802 r = -ENOMEM; 1803 goto msix_nr_out; 1804 } 1805 adev->guest_msix_entries = kzalloc( 1806 sizeof(struct kvm_guest_msix_entry) * 1807 entry_nr->entry_nr, GFP_KERNEL); 1808 if (!adev->guest_msix_entries) { 1809 kfree(adev->host_msix_entries); 1810 r = -ENOMEM; 1811 goto msix_nr_out; 1812 } 1813 } else /* Not allowed set MSI-X number twice */ 1814 r = -EINVAL; 1815 msix_nr_out: 1816 mutex_unlock(&kvm->lock); 1817 return r; 1818 } 1819 1820 static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, 1821 struct kvm_assigned_msix_entry *entry) 1822 { 1823 int r = 0, i; 1824 struct kvm_assigned_dev_kernel *adev; 1825 1826 mutex_lock(&kvm->lock); 1827 1828 adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, 1829 entry->assigned_dev_id); 1830 1831 if (!adev) { 1832 r = -EINVAL; 1833 goto msix_entry_out; 1834 } 1835 1836 for (i = 0; i < adev->entries_nr; i++) 1837 if (adev->guest_msix_entries[i].vector == 0 || 1838 adev->guest_msix_entries[i].entry == entry->entry) { 1839 adev->guest_msix_entries[i].entry = entry->entry; 1840 adev->guest_msix_entries[i].vector = entry->gsi; 1841 adev->host_msix_entries[i].entry = entry->entry; 1842 break; 1843 } 1844 if (i == adev->entries_nr) { 1845 r = -ENOSPC; 1846 goto msix_entry_out; 1847 } 1848 1849 msix_entry_out: 1850 mutex_unlock(&kvm->lock); 1851 1852 return r; 1853 } 1854 #endif 1855 1856 static long kvm_vcpu_ioctl(struct file *filp, 1857 unsigned int ioctl, unsigned long arg) 1858 { 1859 struct kvm_vcpu *vcpu = filp->private_data; 1860 void __user *argp = (void __user *)arg; 1861 int r; 1862 
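	/* fpu and kvm_sregs are allocated on demand below and freed at
	 * "out:", so they must start out NULL. */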
struct kvm_fpu *fpu = NULL; 1863 struct kvm_sregs *kvm_sregs = NULL; 1864 1865 if (vcpu->kvm->mm != current->mm) 1866 return -EIO; 1867 switch (ioctl) { 1868 case KVM_RUN: 1869 r = -EINVAL; 1870 if (arg) 1871 goto out; 1872 r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); 1873 break; 1874 case KVM_GET_REGS: { 1875 struct kvm_regs *kvm_regs; 1876 1877 r = -ENOMEM; 1878 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 1879 if (!kvm_regs) 1880 goto out; 1881 r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); 1882 if (r) 1883 goto out_free1; 1884 r = -EFAULT; 1885 if (copy_to_user(argp, kvm_regs, sizeof(struct kvm_regs))) 1886 goto out_free1; 1887 r = 0; 1888 out_free1: 1889 kfree(kvm_regs); 1890 break; 1891 } 1892 case KVM_SET_REGS: { 1893 struct kvm_regs *kvm_regs; 1894 1895 r = -ENOMEM; 1896 kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); 1897 if (!kvm_regs) 1898 goto out; 1899 r = -EFAULT; 1900 if (copy_from_user(kvm_regs, argp, sizeof(struct kvm_regs))) 1901 goto out_free2; 1902 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 1903 if (r) 1904 goto out_free2; 1905 r = 0; 1906 out_free2: 1907 kfree(kvm_regs); 1908 break; 1909 } 1910 case KVM_GET_SREGS: { 1911 kvm_sregs = kzalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 1912 r = -ENOMEM; 1913 if (!kvm_sregs) 1914 goto out; 1915 r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); 1916 if (r) 1917 goto out; 1918 r = -EFAULT; 1919 if (copy_to_user(argp, kvm_sregs, sizeof(struct kvm_sregs))) 1920 goto out; 1921 r = 0; 1922 break; 1923 } 1924 case KVM_SET_SREGS: { 1925 kvm_sregs = kmalloc(sizeof(struct kvm_sregs), GFP_KERNEL); 1926 r = -ENOMEM; 1927 if (!kvm_sregs) 1928 goto out; 1929 r = -EFAULT; 1930 if (copy_from_user(kvm_sregs, argp, sizeof(struct kvm_sregs))) 1931 goto out; 1932 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 1933 if (r) 1934 goto out; 1935 r = 0; 1936 break; 1937 } 1938 case KVM_GET_MP_STATE: { 1939 struct kvm_mp_state mp_state; 1940 1941 r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); 1942 if (r) 1943 goto out; 1944 r = -EFAULT; 1945 if (copy_to_user(argp, &mp_state, sizeof mp_state)) 1946 goto out; 1947 r = 0; 1948 break; 1949 } 1950 case KVM_SET_MP_STATE: { 1951 struct kvm_mp_state mp_state; 1952 1953 r = -EFAULT; 1954 if (copy_from_user(&mp_state, argp, sizeof mp_state)) 1955 goto out; 1956 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 1957 if (r) 1958 goto out; 1959 r = 0; 1960 break; 1961 } 1962 case KVM_TRANSLATE: { 1963 struct kvm_translation tr; 1964 1965 r = -EFAULT; 1966 if (copy_from_user(&tr, argp, sizeof tr)) 1967 goto out; 1968 r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); 1969 if (r) 1970 goto out; 1971 r = -EFAULT; 1972 if (copy_to_user(argp, &tr, sizeof tr)) 1973 goto out; 1974 r = 0; 1975 break; 1976 } 1977 case KVM_SET_GUEST_DEBUG: { 1978 struct kvm_guest_debug dbg; 1979 1980 r = -EFAULT; 1981 if (copy_from_user(&dbg, argp, sizeof dbg)) 1982 goto out; 1983 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 1984 if (r) 1985 goto out; 1986 r = 0; 1987 break; 1988 } 1989 case KVM_SET_SIGNAL_MASK: { 1990 struct kvm_signal_mask __user *sigmask_arg = argp; 1991 struct kvm_signal_mask kvm_sigmask; 1992 sigset_t sigset, *p; 1993 1994 p = NULL; 1995 if (argp) { 1996 r = -EFAULT; 1997 if (copy_from_user(&kvm_sigmask, argp, 1998 sizeof kvm_sigmask)) 1999 goto out; 2000 r = -EINVAL; 2001 if (kvm_sigmask.len != sizeof sigset) 2002 goto out; 2003 r = -EFAULT; 2004 if (copy_from_user(&sigset, sigmask_arg->sigset, 2005 sizeof sigset)) 2006 goto out; 2007 p = &sigset; 2008 } 2009 r = 
kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 2010 break; 2011 } 2012 case KVM_GET_FPU: { 2013 fpu = kzalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 2014 r = -ENOMEM; 2015 if (!fpu) 2016 goto out; 2017 r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); 2018 if (r) 2019 goto out; 2020 r = -EFAULT; 2021 if (copy_to_user(argp, fpu, sizeof(struct kvm_fpu))) 2022 goto out; 2023 r = 0; 2024 break; 2025 } 2026 case KVM_SET_FPU: { 2027 fpu = kmalloc(sizeof(struct kvm_fpu), GFP_KERNEL); 2028 r = -ENOMEM; 2029 if (!fpu) 2030 goto out; 2031 r = -EFAULT; 2032 if (copy_from_user(fpu, argp, sizeof(struct kvm_fpu))) 2033 goto out; 2034 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 2035 if (r) 2036 goto out; 2037 r = 0; 2038 break; 2039 } 2040 default: 2041 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 2042 } 2043 out: 2044 kfree(fpu); 2045 kfree(kvm_sregs); 2046 return r; 2047 } 2048 2049 static long kvm_vm_ioctl(struct file *filp, 2050 unsigned int ioctl, unsigned long arg) 2051 { 2052 struct kvm *kvm = filp->private_data; 2053 void __user *argp = (void __user *)arg; 2054 int r; 2055 2056 if (kvm->mm != current->mm) 2057 return -EIO; 2058 switch (ioctl) { 2059 case KVM_CREATE_VCPU: 2060 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 2061 if (r < 0) 2062 goto out; 2063 break; 2064 case KVM_SET_USER_MEMORY_REGION: { 2065 struct kvm_userspace_memory_region kvm_userspace_mem; 2066 2067 r = -EFAULT; 2068 if (copy_from_user(&kvm_userspace_mem, argp, 2069 sizeof kvm_userspace_mem)) 2070 goto out; 2071 2072 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1); 2073 if (r) 2074 goto out; 2075 break; 2076 } 2077 case KVM_GET_DIRTY_LOG: { 2078 struct kvm_dirty_log log; 2079 2080 r = -EFAULT; 2081 if (copy_from_user(&log, argp, sizeof log)) 2082 goto out; 2083 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2084 if (r) 2085 goto out; 2086 break; 2087 } 2088 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET 2089 case KVM_REGISTER_COALESCED_MMIO: { 2090 struct kvm_coalesced_mmio_zone zone; 2091 r = -EFAULT; 2092 if (copy_from_user(&zone, argp, sizeof zone)) 2093 goto out; 2094 r = -ENXIO; 2095 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 2096 if (r) 2097 goto out; 2098 r = 0; 2099 break; 2100 } 2101 case KVM_UNREGISTER_COALESCED_MMIO: { 2102 struct kvm_coalesced_mmio_zone zone; 2103 r = -EFAULT; 2104 if (copy_from_user(&zone, argp, sizeof zone)) 2105 goto out; 2106 r = -ENXIO; 2107 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 2108 if (r) 2109 goto out; 2110 r = 0; 2111 break; 2112 } 2113 #endif 2114 #ifdef KVM_CAP_DEVICE_ASSIGNMENT 2115 case KVM_ASSIGN_PCI_DEVICE: { 2116 struct kvm_assigned_pci_dev assigned_dev; 2117 2118 r = -EFAULT; 2119 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) 2120 goto out; 2121 r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); 2122 if (r) 2123 goto out; 2124 break; 2125 } 2126 case KVM_ASSIGN_IRQ: { 2127 r = -EOPNOTSUPP; 2128 break; 2129 } 2130 #ifdef KVM_CAP_ASSIGN_DEV_IRQ 2131 case KVM_ASSIGN_DEV_IRQ: { 2132 struct kvm_assigned_irq assigned_irq; 2133 2134 r = -EFAULT; 2135 if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) 2136 goto out; 2137 r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); 2138 if (r) 2139 goto out; 2140 break; 2141 } 2142 case KVM_DEASSIGN_DEV_IRQ: { 2143 struct kvm_assigned_irq assigned_irq; 2144 2145 r = -EFAULT; 2146 if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) 2147 goto out; 2148 r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq); 2149 if (r) 2150 goto out; 2151 break; 2152 } 2153 #endif 2154 #endif 2155 #ifdef 
KVM_CAP_DEVICE_DEASSIGNMENT 2156 case KVM_DEASSIGN_PCI_DEVICE: { 2157 struct kvm_assigned_pci_dev assigned_dev; 2158 2159 r = -EFAULT; 2160 if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) 2161 goto out; 2162 r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev); 2163 if (r) 2164 goto out; 2165 break; 2166 } 2167 #endif 2168 #ifdef KVM_CAP_IRQ_ROUTING 2169 case KVM_SET_GSI_ROUTING: { 2170 struct kvm_irq_routing routing; 2171 struct kvm_irq_routing __user *urouting; 2172 struct kvm_irq_routing_entry *entries; 2173 2174 r = -EFAULT; 2175 if (copy_from_user(&routing, argp, sizeof(routing))) 2176 goto out; 2177 r = -EINVAL; 2178 if (routing.nr >= KVM_MAX_IRQ_ROUTES) 2179 goto out; 2180 if (routing.flags) 2181 goto out; 2182 r = -ENOMEM; 2183 entries = vmalloc(routing.nr * sizeof(*entries)); 2184 if (!entries) 2185 goto out; 2186 r = -EFAULT; 2187 urouting = argp; 2188 if (copy_from_user(entries, urouting->entries, 2189 routing.nr * sizeof(*entries))) 2190 goto out_free_irq_routing; 2191 r = kvm_set_irq_routing(kvm, entries, routing.nr, 2192 routing.flags); 2193 out_free_irq_routing: 2194 vfree(entries); 2195 break; 2196 } 2197 #ifdef __KVM_HAVE_MSIX 2198 case KVM_ASSIGN_SET_MSIX_NR: { 2199 struct kvm_assigned_msix_nr entry_nr; 2200 r = -EFAULT; 2201 if (copy_from_user(&entry_nr, argp, sizeof entry_nr)) 2202 goto out; 2203 r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr); 2204 if (r) 2205 goto out; 2206 break; 2207 } 2208 case KVM_ASSIGN_SET_MSIX_ENTRY: { 2209 struct kvm_assigned_msix_entry entry; 2210 r = -EFAULT; 2211 if (copy_from_user(&entry, argp, sizeof entry)) 2212 goto out; 2213 r = kvm_vm_ioctl_set_msix_entry(kvm, &entry); 2214 if (r) 2215 goto out; 2216 break; 2217 } 2218 #endif 2219 #endif /* KVM_CAP_IRQ_ROUTING */ 2220 default: 2221 r = kvm_arch_vm_ioctl(filp, ioctl, arg); 2222 } 2223 out: 2224 return r; 2225 } 2226 2227 static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2228 { 2229 struct page *page[1]; 2230 unsigned long addr; 2231 int npages; 2232 gfn_t gfn = vmf->pgoff; 2233 struct kvm *kvm = vma->vm_file->private_data; 2234 2235 addr = gfn_to_hva(kvm, gfn); 2236 if (kvm_is_error_hva(addr)) 2237 return VM_FAULT_SIGBUS; 2238 2239 npages = get_user_pages(current, current->mm, addr, 1, 1, 0, page, 2240 NULL); 2241 if (unlikely(npages != 1)) 2242 return VM_FAULT_SIGBUS; 2243 2244 vmf->page = page[0]; 2245 return 0; 2246 } 2247 2248 static struct vm_operations_struct kvm_vm_vm_ops = { 2249 .fault = kvm_vm_fault, 2250 }; 2251 2252 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma) 2253 { 2254 vma->vm_ops = &kvm_vm_vm_ops; 2255 return 0; 2256 } 2257 2258 static struct file_operations kvm_vm_fops = { 2259 .release = kvm_vm_release, 2260 .unlocked_ioctl = kvm_vm_ioctl, 2261 .compat_ioctl = kvm_vm_ioctl, 2262 .mmap = kvm_vm_mmap, 2263 }; 2264 2265 static int kvm_dev_ioctl_create_vm(void) 2266 { 2267 int fd; 2268 struct kvm *kvm; 2269 2270 kvm = kvm_create_vm(); 2271 if (IS_ERR(kvm)) 2272 return PTR_ERR(kvm); 2273 fd = anon_inode_getfd("kvm-vm", &kvm_vm_fops, kvm, 0); 2274 if (fd < 0) 2275 kvm_put_kvm(kvm); 2276 2277 return fd; 2278 } 2279 2280 static long kvm_dev_ioctl_check_extension_generic(long arg) 2281 { 2282 switch (arg) { 2283 case KVM_CAP_USER_MEMORY: 2284 case KVM_CAP_DESTROY_MEMORY_REGION_WORKS: 2285 case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS: 2286 return 1; 2287 #ifdef CONFIG_HAVE_KVM_IRQCHIP 2288 case KVM_CAP_IRQ_ROUTING: 2289 return KVM_MAX_IRQ_ROUTES; 2290 #endif 2291 default: 2292 break; 2293 } 2294 return 
static long kvm_dev_ioctl_check_extension_generic(long arg)
{
        switch (arg) {
        case KVM_CAP_USER_MEMORY:
        case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
        case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
                return 1;
#ifdef CONFIG_HAVE_KVM_IRQCHIP
        case KVM_CAP_IRQ_ROUTING:
                return KVM_MAX_IRQ_ROUTES;
#endif
        default:
                break;
        }
        return kvm_dev_ioctl_check_extension(arg);
}

static long kvm_dev_ioctl(struct file *filp,
                          unsigned int ioctl, unsigned long arg)
{
        long r = -EINVAL;

        switch (ioctl) {
        case KVM_GET_API_VERSION:
                r = -EINVAL;
                if (arg)
                        goto out;
                r = KVM_API_VERSION;
                break;
        case KVM_CREATE_VM:
                r = -EINVAL;
                if (arg)
                        goto out;
                r = kvm_dev_ioctl_create_vm();
                break;
        case KVM_CHECK_EXTENSION:
                r = kvm_dev_ioctl_check_extension_generic(arg);
                break;
        case KVM_GET_VCPU_MMAP_SIZE:
                r = -EINVAL;
                if (arg)
                        goto out;
                r = PAGE_SIZE;     /* struct kvm_run */
#ifdef CONFIG_X86
                r += PAGE_SIZE;    /* pio data page */
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
                r += PAGE_SIZE;    /* coalesced mmio ring page */
#endif
                break;
        case KVM_TRACE_ENABLE:
        case KVM_TRACE_PAUSE:
        case KVM_TRACE_DISABLE:
                r = kvm_trace_ioctl(ioctl, arg);
                break;
        default:
                return kvm_arch_dev_ioctl(filp, ioctl, arg);
        }
out:
        return r;
}

static struct file_operations kvm_chardev_ops = {
        .unlocked_ioctl = kvm_dev_ioctl,
        .compat_ioctl = kvm_dev_ioctl,
};

static struct miscdevice kvm_dev = {
        KVM_MINOR,
        "kvm",
        &kvm_chardev_ops,
};
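/*
 * Illustrative sketch (not part of this module): how userspace consumes the
 * KVM_GET_VCPU_MMAP_SIZE result computed by kvm_dev_ioctl() above, mapping
 * the shared kvm_run area of an already-created vcpu fd.  The helper name is
 * made up; the ioctl, the fds and struct kvm_run are the KVM ABI.  Kept
 * under #if 0 so it is never compiled here.
 */
#if 0
#include <stddef.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

static struct kvm_run *example_map_vcpu_run(int kvm_fd, int vcpu_fd)
{
        void *run;
        int mmap_size;

        /* Size of kvm_run plus any extra pages (pio data, coalesced mmio). */
        mmap_size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
        if (mmap_size < 0)
                return NULL;

        run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
                   vcpu_fd, 0);
        if (run == MAP_FAILED)
                return NULL;

        return run;
}
#endif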
static void hardware_enable(void *junk)
{
        int cpu = raw_smp_processor_id();

        if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
                return;
        cpumask_set_cpu(cpu, cpus_hardware_enabled);
        kvm_arch_hardware_enable(NULL);
}

static void hardware_disable(void *junk)
{
        int cpu = raw_smp_processor_id();

        if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
                return;
        cpumask_clear_cpu(cpu, cpus_hardware_enabled);
        kvm_arch_hardware_disable(NULL);
}

static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
                           void *v)
{
        int cpu = (long)v;

        val &= ~CPU_TASKS_FROZEN;
        switch (val) {
        case CPU_DYING:
                printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
                       cpu);
                hardware_disable(NULL);
                break;
        case CPU_UP_CANCELED:
                printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
                       cpu);
                smp_call_function_single(cpu, hardware_disable, NULL, 1);
                break;
        case CPU_ONLINE:
                printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
                       cpu);
                smp_call_function_single(cpu, hardware_enable, NULL, 1);
                break;
        }
        return NOTIFY_OK;
}

asmlinkage void kvm_handle_fault_on_reboot(void)
{
        if (kvm_rebooting)
                /* spin while reset goes on */
                while (true)
                        ;
        /* Fault while not rebooting.  We want the trace. */
        BUG();
}
EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot);

static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
                      void *v)
{
        /*
         * Some (well, at least mine) BIOSes hang on reboot if
         * in vmx root mode.
         *
         * And Intel TXT requires VMX to be off on all cpus when the system
         * shuts down.
         */
        printk(KERN_INFO "kvm: exiting hardware virtualization\n");
        kvm_rebooting = true;
        on_each_cpu(hardware_disable, NULL, 1);
        return NOTIFY_OK;
}

static struct notifier_block kvm_reboot_notifier = {
        .notifier_call = kvm_reboot,
        .priority = 0,
};

void kvm_io_bus_init(struct kvm_io_bus *bus)
{
        memset(bus, 0, sizeof(*bus));
}

void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
        int i;

        for (i = 0; i < bus->dev_count; i++) {
                struct kvm_io_device *pos = bus->devs[i];

                kvm_iodevice_destructor(pos);
        }
}

struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus,
                                          gpa_t addr, int len, int is_write)
{
        int i;

        for (i = 0; i < bus->dev_count; i++) {
                struct kvm_io_device *pos = bus->devs[i];

                if (pos->in_range(pos, addr, len, is_write))
                        return pos;
        }

        return NULL;
}

void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
{
        BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));

        bus->devs[bus->dev_count++] = dev;
}

static struct notifier_block kvm_cpu_notifier = {
        .notifier_call = kvm_cpu_hotplug,
        .priority = 20, /* must be > scheduler priority */
};

/*
 * Each debugfs stat file stores the offset of its counter inside struct kvm
 * (or struct kvm_vcpu) and sums that counter over all VMs (or all vcpus).
 */
static int vm_stat_get(void *_offset, u64 *val)
{
        unsigned offset = (long)_offset;
        struct kvm *kvm;

        *val = 0;
        spin_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list)
                *val += *(u32 *)((void *)kvm + offset);
        spin_unlock(&kvm_lock);
        return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");

static int vcpu_stat_get(void *_offset, u64 *val)
{
        unsigned offset = (long)_offset;
        struct kvm *kvm;
        struct kvm_vcpu *vcpu;
        int i;

        *val = 0;
        spin_lock(&kvm_lock);
        list_for_each_entry(kvm, &vm_list, vm_list)
                for (i = 0; i < KVM_MAX_VCPUS; ++i) {
                        vcpu = kvm->vcpus[i];
                        if (vcpu)
                                *val += *(u32 *)((void *)vcpu + offset);
                }
        spin_unlock(&kvm_lock);
        return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");

static struct file_operations *stat_fops[] = {
        [KVM_STAT_VCPU] = &vcpu_stat_fops,
        [KVM_STAT_VM]   = &vm_stat_fops,
};

static void kvm_init_debug(void)
{
        struct kvm_stats_debugfs_item *p;

        kvm_debugfs_dir = debugfs_create_dir("kvm", NULL);
        for (p = debugfs_entries; p->name; ++p)
                p->dentry = debugfs_create_file(p->name, 0444, kvm_debugfs_dir,
                                                (void *)(long)p->offset,
                                                stat_fops[p->kind]);
}

static void kvm_exit_debug(void)
{
        struct kvm_stats_debugfs_item *p;

        for (p = debugfs_entries; p->name; ++p)
                debugfs_remove(p->dentry);
        debugfs_remove(kvm_debugfs_dir);
}

static int kvm_suspend(struct sys_device *dev, pm_message_t state)
{
        hardware_disable(NULL);
        return 0;
}

static int kvm_resume(struct sys_device *dev)
{
        hardware_enable(NULL);
        return 0;
}

static struct sysdev_class kvm_sysdev_class = {
        .name = "kvm",
        .suspend = kvm_suspend,
        .resume = kvm_resume,
};

static struct sys_device kvm_sysdev = {
        .id = 0,
        .cls = &kvm_sysdev_class,
};

struct page *bad_page;
pfn_t bad_pfn;
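/*
 * Illustrative sketch (not part of this module): how the preempt notifier
 * hooks defined below are typically wired to a vcpu, initialised once with
 * kvm_preempt_ops when the vcpu is created and then registered for the
 * duration of a run section.  The example_* functions are made up; only
 * preempt_notifier_init/register/unregister and the fields they touch are
 * real.  Kept under #if 0 so it is never compiled here.
 */
#if 0
static void example_vcpu_created(struct kvm_vcpu *vcpu)
{
        /* Done once per vcpu, before it ever runs. */
        preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
}

static void example_vcpu_session(struct kvm_vcpu *vcpu, int cpu)
{
        /*
         * While registered, kvm_sched_in()/kvm_sched_out() fire on every
         * context switch of the current task.
         */
        preempt_notifier_register(&vcpu->preempt_notifier);
        kvm_arch_vcpu_load(vcpu, cpu);

        /* ... enter the guest ... */

        kvm_arch_vcpu_put(vcpu);
        preempt_notifier_unregister(&vcpu->preempt_notifier);
}
#endif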
static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
        return container_of(pn, struct kvm_vcpu, preempt_notifier);
}

/*
 * Preempt notifier hooks: reload the vcpu state when its task is scheduled
 * back in, and save it when the task is scheduled out.
 */
static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

        kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
                          struct task_struct *next)
{
        struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

        kvm_arch_vcpu_put(vcpu);
}

int kvm_init(void *opaque, unsigned int vcpu_size,
             struct module *module)
{
        int r;
        int cpu;

        kvm_init_debug();

        r = kvm_arch_init(opaque);
        if (r)
                goto out_fail;

        bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

        if (bad_page == NULL) {
                r = -ENOMEM;
                goto out;
        }

        bad_pfn = page_to_pfn(bad_page);

        if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
                r = -ENOMEM;
                goto out_free_0;
        }
        cpumask_clear(cpus_hardware_enabled);

        r = kvm_arch_hardware_setup();
        if (r < 0)
                goto out_free_0a;

        for_each_online_cpu(cpu) {
                smp_call_function_single(cpu,
                                kvm_arch_check_processor_compat,
                                &r, 1);
                if (r < 0)
                        goto out_free_1;
        }

        on_each_cpu(hardware_enable, NULL, 1);
        r = register_cpu_notifier(&kvm_cpu_notifier);
        if (r)
                goto out_free_2;
        register_reboot_notifier(&kvm_reboot_notifier);

        r = sysdev_class_register(&kvm_sysdev_class);
        if (r)
                goto out_free_3;

        r = sysdev_register(&kvm_sysdev);
        if (r)
                goto out_free_4;

        /* A kmem cache lets us meet the alignment requirements of fx_save. */
        kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
                                           __alignof__(struct kvm_vcpu),
                                           0, NULL);
        if (!kvm_vcpu_cache) {
                r = -ENOMEM;
                goto out_free_5;
        }

        kvm_chardev_ops.owner = module;
        kvm_vm_fops.owner = module;
        kvm_vcpu_fops.owner = module;

        r = misc_register(&kvm_dev);
        if (r) {
                printk(KERN_ERR "kvm: misc device register failed\n");
                goto out_free;
        }

        kvm_preempt_ops.sched_in = kvm_sched_in;
        kvm_preempt_ops.sched_out = kvm_sched_out;

        return 0;

out_free:
        kmem_cache_destroy(kvm_vcpu_cache);
out_free_5:
        sysdev_unregister(&kvm_sysdev);
out_free_4:
        sysdev_class_unregister(&kvm_sysdev_class);
out_free_3:
        unregister_reboot_notifier(&kvm_reboot_notifier);
        unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_2:
        on_each_cpu(hardware_disable, NULL, 1);
out_free_1:
        kvm_arch_hardware_unsetup();
out_free_0a:
        free_cpumask_var(cpus_hardware_enabled);
out_free_0:
        __free_page(bad_page);
out:
        kvm_arch_exit();
        kvm_exit_debug();
out_fail:
        return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

void kvm_exit(void)
{
        kvm_trace_cleanup();
        misc_deregister(&kvm_dev);
        kmem_cache_destroy(kvm_vcpu_cache);
        sysdev_unregister(&kvm_sysdev);
        sysdev_class_unregister(&kvm_sysdev_class);
        unregister_reboot_notifier(&kvm_reboot_notifier);
        unregister_cpu_notifier(&kvm_cpu_notifier);
        on_each_cpu(hardware_disable, NULL, 1);
        kvm_arch_hardware_unsetup();
        kvm_arch_exit();
        kvm_exit_debug();
        free_cpumask_var(cpus_hardware_enabled);
        __free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);
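/*
 * Illustrative sketch (not part of this module): how an architecture module
 * is expected to enter and leave the common code exported above.  The foo_*
 * names and struct vcpu_foo are placeholders; kvm_init()/kvm_exit() and the
 * meaning of their arguments are taken from the definitions in this file.
 * Kept under #if 0 so it is never compiled here.
 */
#if 0
static int __init foo_module_init(void)
{
        /*
         * opaque is handed through to kvm_arch_init(); vcpu_size lets the
         * common code size the "kvm_vcpu" kmem cache for the arch container.
         */
        return kvm_init(&foo_arch_opaque, sizeof(struct vcpu_foo), THIS_MODULE);
}

static void __exit foo_module_exit(void)
{
        kvm_exit();
}

module_init(foo_module_init);
module_exit(foo_module_exit);
#endif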