/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "iodev.h"

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/sysdev.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>

#include <asm/processor.h>
#include <asm/io.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

DEFINE_SPINLOCK(kvm_lock);
LIST_HEAD(vm_list);

static cpumask_t cpus_hardware_enabled;

struct kmem_cache *kvm_vcpu_cache;
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);

static __read_mostly struct preempt_ops kvm_preempt_ops;

static struct dentry *debugfs_dir;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);

static inline int valid_vcpu(int n)
{
	return likely(n >= 0 && n < KVM_MAX_VCPUS);
}

/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu;

	mutex_lock(&vcpu->mutex);
	cpu = get_cpu();
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
}

void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	preempt_enable();
	mutex_unlock(&vcpu->mutex);
}

static void ack_flush(void *_completed)
{
}

void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	int i, cpu;
	cpumask_t cpus;
	struct kvm_vcpu *vcpu;

	cpus_clear(cpus);
	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
		vcpu = kvm->vcpus[i];
		if (!vcpu)
			continue;
		if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
			continue;
		cpu = vcpu->cpu;
		if (cpu != -1 && cpu != raw_smp_processor_id())
			cpu_set(cpu, cpus);
	}
	if (cpus_empty(cpus))
		return;
	++kvm->stat.remote_tlb_flush;
	smp_call_function_mask(cpus, ack_flush, NULL, 1);
}

int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	struct page *page;
	int r;

	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
	init_waitqueue_head(&vcpu->wq);

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		r = -ENOMEM;
		goto fail;
	}
	vcpu->run = page_address(page);

	r = kvm_arch_vcpu_init(vcpu);
	if (r < 0)
		goto fail_free_run;
	return 0;

fail_free_run:
	free_page((unsigned long)vcpu->run);
fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_init);

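/*
 * Reverse of kvm_vcpu_init(): tear down the architecture-specific state and
 * release the shared kvm_run page.  The caller must ensure the vcpu is no
 * longer in use.
 */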
void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
{
	kvm_arch_vcpu_uninit(vcpu);
	free_page((unsigned long)vcpu->run);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);

static struct kvm *kvm_create_vm(void)
{
	struct kvm *kvm = kvm_arch_create_vm();

	if (IS_ERR(kvm))
		goto out;

	kvm->mm = current->mm;
	atomic_inc(&kvm->mm->mm_count);
	spin_lock_init(&kvm->mmu_lock);
	kvm_io_bus_init(&kvm->pio_bus);
	mutex_init(&kvm->lock);
	kvm_io_bus_init(&kvm->mmio_bus);
	init_rwsem(&kvm->slots_lock);
	spin_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	spin_unlock(&kvm_lock);
out:
	return kvm;
}

/*
 * Free any memory in @free but not in @dont.
 */
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
				  struct kvm_memory_slot *dont)
{
	if (!dont || free->rmap != dont->rmap)
		vfree(free->rmap);

	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
		vfree(free->dirty_bitmap);

	free->npages = 0;
	free->dirty_bitmap = NULL;
	free->rmap = NULL;
}

void kvm_free_physmem(struct kvm *kvm)
{
	int i;

	for (i = 0; i < kvm->nmemslots; ++i)
		kvm_free_physmem_slot(&kvm->memslots[i], NULL);
}

static void kvm_destroy_vm(struct kvm *kvm)
{
	struct mm_struct *mm = kvm->mm;

	spin_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	spin_unlock(&kvm_lock);
	kvm_io_bus_destroy(&kvm->pio_bus);
	kvm_io_bus_destroy(&kvm->mmio_bus);
	kvm_arch_destroy_vm(kvm);
	mmdrop(mm);
}

static int kvm_vm_release(struct inode *inode, struct file *filp)
{
	struct kvm *kvm = filp->private_data;

	kvm_destroy_vm(kvm);
	return 0;
}

/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
 *
 * Must be called holding mmap_sem for write.
 */
int __kvm_set_memory_region(struct kvm *kvm,
			    struct kvm_userspace_memory_region *mem,
			    int user_alloc)
{
	int r;
	gfn_t base_gfn;
	unsigned long npages;
	unsigned long i;
	struct kvm_memory_slot *memslot;
	struct kvm_memory_slot old, new;

	r = -EINVAL;
	/* General sanity checks */
	if (mem->memory_size & (PAGE_SIZE - 1))
		goto out;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		goto out;
	if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
		goto out;
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
		goto out;

	memslot = &kvm->memslots[mem->slot];
	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
	npages = mem->memory_size >> PAGE_SHIFT;

	if (!npages)
		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;

	new = old = *memslot;

	new.base_gfn = base_gfn;
	new.npages = npages;
	new.flags = mem->flags;

	/* Disallow changing a memory slot's size. */
	r = -EINVAL;
	if (npages && old.npages && npages != old.npages)
		goto out_free;

	/* Check for overlaps */
	r = -EEXIST;
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *s = &kvm->memslots[i];

		if (s == memslot)
			continue;
		if (!((base_gfn + npages <= s->base_gfn) ||
		      (base_gfn >= s->base_gfn + s->npages)))
			goto out_free;
	}

	/* Free page dirty bitmap if unneeded */
	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
		new.dirty_bitmap = NULL;

	r = -ENOMEM;

	/* Allocate if a slot is being created */
	if (npages && !new.rmap) {
		new.rmap = vmalloc(npages * sizeof(struct page *));

		if (!new.rmap)
			goto out_free;

		memset(new.rmap, 0, npages * sizeof(*new.rmap));

		new.user_alloc = user_alloc;
		new.userspace_addr = mem->userspace_addr;
	}

	/* Allocate page dirty bitmap if needed */
	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
		unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;

		new.dirty_bitmap = vmalloc(dirty_bytes);
		if (!new.dirty_bitmap)
			goto out_free;
		memset(new.dirty_bitmap, 0, dirty_bytes);
	}

	if (mem->slot >= kvm->nmemslots)
		kvm->nmemslots = mem->slot + 1;

	*memslot = new;

	r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
	if (r) {
		*memslot = old;
		goto out_free;
	}

	kvm_free_physmem_slot(&old, &new);
	return 0;

out_free:
	kvm_free_physmem_slot(&new, &old);
out:
	return r;

}
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);

int kvm_set_memory_region(struct kvm *kvm,
			  struct kvm_userspace_memory_region *mem,
			  int user_alloc)
{
	int r;

	down_write(&kvm->slots_lock);
	r = __kvm_set_memory_region(kvm, mem, user_alloc);
	up_write(&kvm->slots_lock);
	return r;
}
EXPORT_SYMBOL_GPL(kvm_set_memory_region);

int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
				   struct kvm_userspace_memory_region *mem,
				   int user_alloc)
{
	if (mem->slot >= KVM_MEMORY_SLOTS)
		return -EINVAL;
	return kvm_set_memory_region(kvm, mem, user_alloc);
}

int kvm_get_dirty_log(struct kvm *kvm,
		      struct kvm_dirty_log *log, int *is_dirty)
{
	struct kvm_memory_slot *memslot;
	int r, i;
	int n;
	unsigned long any = 0;

	r = -EINVAL;
	if (log->slot >= KVM_MEMORY_SLOTS)
		goto out;

	memslot = &kvm->memslots[log->slot];
	r = -ENOENT;
	if (!memslot->dirty_bitmap)
		goto out;

	n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;

	for (i = 0; !any && i < n/sizeof(long); ++i)
		any = memslot->dirty_bitmap[i];

	r = -EFAULT;
	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
		goto out;

	if (any)
		*is_dirty = 1;

	r = 0;
out:
	return r;
}

int is_error_page(struct page *page)
{
	return page == bad_page;
}
EXPORT_SYMBOL_GPL(is_error_page);

static inline unsigned long bad_hva(void)
{
	return PAGE_OFFSET;
}

int kvm_is_error_hva(unsigned long addr)
{
	return addr == bad_hva();
}
EXPORT_SYMBOL_GPL(kvm_is_error_hva);

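/*
 * Guest frame numbers (gfns) are resolved in two steps: first to the memslot
 * that contains them, then to the host virtual address backing that slot,
 * from which get_user_pages() can pin the underlying struct page.
 */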
static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	int i;

	for (i = 0; i < kvm->nmemslots; ++i) {
		struct kvm_memory_slot *memslot = &kvm->memslots[i];

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages)
			return memslot;
	}
	return NULL;
}

struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	gfn = unalias_gfn(kvm, gfn);
	return __gfn_to_memslot(kvm, gfn);
}

int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
	int i;

	gfn = unalias_gfn(kvm, gfn);
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *memslot = &kvm->memslots[i];

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages)
			return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);

static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;

	gfn = unalias_gfn(kvm, gfn);
	slot = __gfn_to_memslot(kvm, gfn);
	if (!slot)
		return bad_hva();
	return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
}

/*
 * Requires current->mm->mmap_sem to be held
 */
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
	struct page *page[1];
	unsigned long addr;
	int npages;

	might_sleep();

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr)) {
		get_page(bad_page);
		return bad_page;
	}

	npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page,
				NULL);

	if (npages != 1) {
		get_page(bad_page);
		return bad_page;
	}

	return page[0];
}

EXPORT_SYMBOL_GPL(gfn_to_page);

void kvm_release_page_clean(struct page *page)
{
	put_page(page);
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);

void kvm_release_page_dirty(struct page *page)
{
	if (!PageReserved(page))
		SetPageDirty(page);
	put_page(page);
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);

static int next_segment(unsigned long len, int offset)
{
	if (len > PAGE_SIZE - offset)
		return PAGE_SIZE - offset;
	else
		return len;
}

int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
			int len)
{
	int r;
	unsigned long addr;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	r = copy_from_user(data, (void __user *)addr + offset, len);
	if (r)
		return -EFAULT;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page);

int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		data += seg;
		++gfn;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest);

int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
			  unsigned long len)
{
	int r;
	unsigned long addr;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int offset = offset_in_page(gpa);

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
	if (r)
		return -EFAULT;
	return 0;
}
EXPORT_SYMBOL(kvm_read_guest_atomic);

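/*
 * Write-side counterpart of kvm_read_guest_page(): on success the page is
 * marked dirty so that a later KVM_GET_DIRTY_LOG can report it to userspace.
 */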
int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
			 int offset, int len)
{
	int r;
	unsigned long addr;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	r = copy_to_user((void __user *)addr + offset, data, len);
	if (r)
		return -EFAULT;
	mark_page_dirty(kvm, gfn);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_guest_page);

int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
		    unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		data += seg;
		++gfn;
	}
	return 0;
}

int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
{
	return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_clear_guest_page);

int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		++gfn;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_clear_guest);

void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *memslot;

	gfn = unalias_gfn(kvm, gfn);
	memslot = __gfn_to_memslot(kvm, gfn);
	if (memslot && memslot->dirty_bitmap) {
		unsigned long rel_gfn = gfn - memslot->base_gfn;

		/* avoid RMW */
		if (!test_bit(rel_gfn, memslot->dirty_bitmap))
			set_bit(rel_gfn, memslot->dirty_bitmap);
	}
}

/*
 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
 */
void kvm_vcpu_block(struct kvm_vcpu *vcpu)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&vcpu->wq, &wait);

	/*
	 * We will block until either an interrupt or a signal wakes us up
	 */
	while (!kvm_cpu_has_interrupt(vcpu)
	       && !signal_pending(current)
	       && !kvm_arch_vcpu_runnable(vcpu)) {
		set_current_state(TASK_INTERRUPTIBLE);
		vcpu_put(vcpu);
		schedule();
		vcpu_load(vcpu);
	}

	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&vcpu->wq, &wait);
}

void kvm_resched(struct kvm_vcpu *vcpu)
{
	if (!need_resched())
		return;
	cond_resched();
}
EXPORT_SYMBOL_GPL(kvm_resched);

static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct kvm_vcpu *vcpu = vma->vm_file->private_data;
	struct page *page;

	if (vmf->pgoff == 0)
		page = virt_to_page(vcpu->run);
	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
		page = virt_to_page(vcpu->arch.pio_data);
	else
		return VM_FAULT_SIGBUS;
	get_page(page);
	vmf->page = page;
	return 0;
}

static struct vm_operations_struct kvm_vcpu_vm_ops = {
	.fault = kvm_vcpu_fault,
};

static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vcpu_vm_ops;
	return 0;
}

static int kvm_vcpu_release(struct inode *inode, struct file *filp)
{
	struct kvm_vcpu *vcpu = filp->private_data;

	fput(vcpu->kvm->filp);
	return 0;
}

static struct file_operations kvm_vcpu_fops = {
	.release        = kvm_vcpu_release,
	.unlocked_ioctl = kvm_vcpu_ioctl,
	.compat_ioctl   = kvm_vcpu_ioctl,
	.mmap           = kvm_vcpu_mmap,
};

/*
 * Allocates an inode for the vcpu.
 */
static int create_vcpu_fd(struct kvm_vcpu *vcpu)
{
	int fd, r;
	struct inode *inode;
	struct file *file;

	r = anon_inode_getfd(&fd, &inode, &file,
			     "kvm-vcpu", &kvm_vcpu_fops, vcpu);
	if (r)
		return r;
	atomic_inc(&vcpu->kvm->filp->f_count);
	return fd;
}

/*
 * Creates some virtual cpus.  Good luck creating more than one.
 */
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
{
	int r;
	struct kvm_vcpu *vcpu;

	if (!valid_vcpu(n))
		return -EINVAL;

	vcpu = kvm_arch_vcpu_create(kvm, n);
	if (IS_ERR(vcpu))
		return PTR_ERR(vcpu);

	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);

	r = kvm_arch_vcpu_setup(vcpu);
	if (r)
		goto vcpu_destroy;

	mutex_lock(&kvm->lock);
	if (kvm->vcpus[n]) {
		r = -EEXIST;
		mutex_unlock(&kvm->lock);
		goto vcpu_destroy;
	}
	kvm->vcpus[n] = vcpu;
	mutex_unlock(&kvm->lock);

	/* Now it's all set up, let userspace reach it */
	r = create_vcpu_fd(vcpu);
	if (r < 0)
		goto unlink;
	return r;

unlink:
	mutex_lock(&kvm->lock);
	kvm->vcpus[n] = NULL;
	mutex_unlock(&kvm->lock);
vcpu_destroy:
	kvm_arch_vcpu_destroy(vcpu);
	return r;
}

static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
{
	if (sigset) {
		sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
		vcpu->sigset_active = 1;
		vcpu->sigset = *sigset;
	} else
		vcpu->sigset_active = 0;
	return 0;
}

/*
 * Generic vcpu ioctls are handled here; anything not recognized is forwarded
 * to the architecture code via kvm_arch_vcpu_ioctl().
 */
static long kvm_vcpu_ioctl(struct file *filp,
			   unsigned int ioctl, unsigned long arg)
{
	struct kvm_vcpu *vcpu = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;

	if (vcpu->kvm->mm != current->mm)
		return -EIO;
	switch (ioctl) {
	case KVM_RUN:
		r = -EINVAL;
		if (arg)
			goto out;
		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
		break;
	case KVM_GET_REGS: {
		struct kvm_regs kvm_regs;

		memset(&kvm_regs, 0, sizeof kvm_regs);
		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_REGS: {
		struct kvm_regs kvm_regs;

		r = -EFAULT;
		if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_GET_SREGS: {
		struct kvm_sregs kvm_sregs;

		memset(&kvm_sregs, 0, sizeof kvm_sregs);
		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_SREGS: {
		struct kvm_sregs kvm_sregs;

		r = -EFAULT;
		if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_TRANSLATE: {
		struct kvm_translation tr;

		r = -EFAULT;
		if (copy_from_user(&tr, argp, sizeof tr))
			goto out;
		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &tr, sizeof tr))
			goto out;
		r = 0;
		break;
	}
	case KVM_DEBUG_GUEST: {
		struct kvm_debug_guest dbg;

		r = -EFAULT;
		if (copy_from_user(&dbg, argp, sizeof dbg))
			goto out;
		r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_SIGNAL_MASK: {
		struct kvm_signal_mask __user *sigmask_arg = argp;
		struct kvm_signal_mask kvm_sigmask;
		sigset_t sigset, *p;

		p = NULL;
		if (argp) {
			r = -EFAULT;
			if (copy_from_user(&kvm_sigmask, argp,
					   sizeof kvm_sigmask))
				goto out;
			r = -EINVAL;
			if (kvm_sigmask.len != sizeof sigset)
				goto out;
			r = -EFAULT;
			if (copy_from_user(&sigset, sigmask_arg->sigset,
					   sizeof sigset))
				goto out;
			p = &sigset;
		}
		/* p is NULL when userspace is clearing the mask. */
		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
		break;
	}
	case KVM_GET_FPU: {
		struct kvm_fpu fpu;

		memset(&fpu, 0, sizeof fpu);
		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &fpu, sizeof fpu))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_FPU: {
		struct kvm_fpu fpu;

		r = -EFAULT;
		if (copy_from_user(&fpu, argp, sizeof fpu))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu);
		if (r)
			goto out;
		r = 0;
		break;
	}
	default:
		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
	}
out:
	return r;
}

static long kvm_vm_ioctl(struct file *filp,
			 unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;

	if (kvm->mm != current->mm)
		return -EIO;
	switch (ioctl) {
	case KVM_CREATE_VCPU:
		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
		if (r < 0)
			goto out;
		break;
	case KVM_SET_USER_MEMORY_REGION: {
		struct kvm_userspace_memory_region kvm_userspace_mem;

		r = -EFAULT;
		if (copy_from_user(&kvm_userspace_mem, argp,
				   sizeof kvm_userspace_mem))
			goto out;

		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
		if (r)
			goto out;
		break;
	}
	case KVM_GET_DIRTY_LOG: {
		struct kvm_dirty_log log;

		r = -EFAULT;
		if (copy_from_user(&log, argp, sizeof log))
			goto out;
		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
		if (r)
			goto out;
		break;
	}
	default:
		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
	}
out:
	return r;
}

static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct kvm *kvm = vma->vm_file->private_data;
	struct page *page;

	if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
		return VM_FAULT_SIGBUS;
	page = gfn_to_page(kvm, vmf->pgoff);
	if (is_error_page(page)) {
		kvm_release_page_clean(page);
		return VM_FAULT_SIGBUS;
	}
	vmf->page = page;
	return 0;
}

static struct vm_operations_struct kvm_vm_vm_ops = {
	.fault = kvm_vm_fault,
};

static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vm_vm_ops;
	return 0;
}

static struct file_operations kvm_vm_fops = {
	.release        = kvm_vm_release,
	.unlocked_ioctl = kvm_vm_ioctl,
	.compat_ioctl   = kvm_vm_ioctl,
	.mmap           = kvm_vm_mmap,
};

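/*
 * The VM file created here stays pinned as long as any vcpu fd exists:
 * create_vcpu_fd() takes an extra reference on kvm->filp and
 * kvm_vcpu_release() drops it, so kvm_vm_release() only tears the VM down
 * once the last user is gone.
 */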
static int kvm_dev_ioctl_create_vm(void)
{
	int fd, r;
	struct inode *inode;
	struct file *file;
	struct kvm *kvm;

	kvm = kvm_create_vm();
	if (IS_ERR(kvm))
		return PTR_ERR(kvm);
	r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
	if (r) {
		kvm_destroy_vm(kvm);
		return r;
	}

	kvm->filp = file;

	return fd;
}

static long kvm_dev_ioctl(struct file *filp,
			  unsigned int ioctl, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	long r = -EINVAL;

	switch (ioctl) {
	case KVM_GET_API_VERSION:
		r = -EINVAL;
		if (arg)
			goto out;
		r = KVM_API_VERSION;
		break;
	case KVM_CREATE_VM:
		r = -EINVAL;
		if (arg)
			goto out;
		r = kvm_dev_ioctl_create_vm();
		break;
	case KVM_CHECK_EXTENSION:
		r = kvm_dev_ioctl_check_extension((long)argp);
		break;
	case KVM_GET_VCPU_MMAP_SIZE:
		r = -EINVAL;
		if (arg)
			goto out;
		r = 2 * PAGE_SIZE;
		break;
	default:
		return kvm_arch_dev_ioctl(filp, ioctl, arg);
	}
out:
	return r;
}

static struct file_operations kvm_chardev_ops = {
	.unlocked_ioctl = kvm_dev_ioctl,
	.compat_ioctl   = kvm_dev_ioctl,
};

static struct miscdevice kvm_dev = {
	KVM_MINOR,
	"kvm",
	&kvm_chardev_ops,
};

static void hardware_enable(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (cpu_isset(cpu, cpus_hardware_enabled))
		return;
	cpu_set(cpu, cpus_hardware_enabled);
	kvm_arch_hardware_enable(NULL);
}

static void hardware_disable(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (!cpu_isset(cpu, cpus_hardware_enabled))
		return;
	cpu_clear(cpu, cpus_hardware_enabled);
	decache_vcpus_on_cpu(cpu);
	kvm_arch_hardware_disable(NULL);
}

static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
			   void *v)
{
	int cpu = (long)v;

	val &= ~CPU_TASKS_FROZEN;
	switch (val) {
	case CPU_DYING:
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
		hardware_disable(NULL);
		break;
	case CPU_UP_CANCELED:
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
		smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
		break;
	case CPU_ONLINE:
		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
		       cpu);
		smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
		break;
	}
	return NOTIFY_OK;
}

static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
		      void *v)
{
	if (val == SYS_RESTART) {
		/*
		 * Some (well, at least mine) BIOSes hang on reboot if
		 * in vmx root mode.
		 */
		printk(KERN_INFO "kvm: exiting hardware virtualization\n");
		on_each_cpu(hardware_disable, NULL, 0, 1);
	}
	return NOTIFY_OK;
}

static struct notifier_block kvm_reboot_notifier = {
	.notifier_call = kvm_reboot,
	.priority = 0,
};

void kvm_io_bus_init(struct kvm_io_bus *bus)
{
	memset(bus, 0, sizeof(*bus));
}

void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
	int i;

	for (i = 0; i < bus->dev_count; i++) {
		struct kvm_io_device *pos = bus->devs[i];

		kvm_iodevice_destructor(pos);
	}
}

struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
{
	int i;

	for (i = 0; i < bus->dev_count; i++) {
		struct kvm_io_device *pos = bus->devs[i];

		if (pos->in_range(pos, addr))
			return pos;
	}

	return NULL;
}

void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
{
	BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));

	bus->devs[bus->dev_count++] = dev;
}

static struct notifier_block kvm_cpu_notifier = {
	.notifier_call = kvm_cpu_hotplug,
	.priority = 20, /* must be > scheduler priority */
};

static int vm_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;

	*val = 0;
	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		*val += *(u32 *)((void *)kvm + offset);
	spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");

static int vcpu_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	int i;

	*val = 0;
	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
			vcpu = kvm->vcpus[i];
			if (vcpu)
				*val += *(u32 *)((void *)vcpu + offset);
		}
	spin_unlock(&kvm_lock);
	return 0;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");

static struct file_operations *stat_fops[] = {
	[KVM_STAT_VCPU] = &vcpu_stat_fops,
	[KVM_STAT_VM]   = &vm_stat_fops,
};

static void kvm_init_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	debugfs_dir = debugfs_create_dir("kvm", NULL);
	for (p = debugfs_entries; p->name; ++p)
		p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
						(void *)(long)p->offset,
						stat_fops[p->kind]);
}

static void kvm_exit_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	for (p = debugfs_entries; p->name; ++p)
		debugfs_remove(p->dentry);
	debugfs_remove(debugfs_dir);
}

static int kvm_suspend(struct sys_device *dev, pm_message_t state)
{
	hardware_disable(NULL);
	return 0;
}

static int kvm_resume(struct sys_device *dev)
{
	hardware_enable(NULL);
	return 0;
}

static struct sysdev_class kvm_sysdev_class = {
	.name = "kvm",
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

static struct sys_device kvm_sysdev = {
	.id = 0,
	.cls = &kvm_sysdev_class,
};

struct page *bad_page;

/*
 * Preempt notifiers: when a task that has a vcpu loaded is scheduled out,
 * kvm_sched_out() saves guest state via kvm_arch_vcpu_put(), and
 * kvm_sched_in() restores it when the task runs again.  vcpu_load()
 * registers the notifier and vcpu_put() removes it.
 */
static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
	return container_of(pn, struct kvm_vcpu, preempt_notifier);
}

static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
			  struct task_struct *next)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_put(vcpu);
}

int kvm_init(void *opaque, unsigned int vcpu_size,
	     struct module *module)
{
	int r;
	int cpu;

	kvm_init_debug();

	r = kvm_arch_init(opaque);
	if (r)
		goto out_fail;

	bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (bad_page == NULL) {
		r = -ENOMEM;
		goto out;
	}

	r = kvm_arch_hardware_setup();
	if (r < 0)
		goto out_free_0;

	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu,
				kvm_arch_check_processor_compat,
				&r, 0, 1);
		if (r < 0)
			goto out_free_1;
	}

	on_each_cpu(hardware_enable, NULL, 0, 1);
	r = register_cpu_notifier(&kvm_cpu_notifier);
	if (r)
		goto out_free_2;
	register_reboot_notifier(&kvm_reboot_notifier);

	r = sysdev_class_register(&kvm_sysdev_class);
	if (r)
		goto out_free_3;

	r = sysdev_register(&kvm_sysdev);
	if (r)
		goto out_free_4;

	/* A kmem cache lets us meet the alignment requirements of fx_save. */
	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
					   __alignof__(struct kvm_vcpu),
					   0, NULL);
	if (!kvm_vcpu_cache) {
		r = -ENOMEM;
		goto out_free_5;
	}

	kvm_chardev_ops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
		printk(KERN_ERR "kvm: misc device register failed\n");
		goto out_free;
	}

	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

	return 0;

out_free:
	kmem_cache_destroy(kvm_vcpu_cache);
out_free_5:
	sysdev_unregister(&kvm_sysdev);
out_free_4:
	sysdev_class_unregister(&kvm_sysdev_class);
out_free_3:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_2:
	on_each_cpu(hardware_disable, NULL, 0, 1);
out_free_1:
	kvm_arch_hardware_unsetup();
out_free_0:
	__free_page(bad_page);
out:
	kvm_arch_exit();
	kvm_exit_debug();
out_fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

void kvm_exit(void)
{
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	sysdev_unregister(&kvm_sysdev);
	sysdev_class_unregister(&kvm_sysdev_class);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
	on_each_cpu(hardware_disable, NULL, 0, 1);
	kvm_arch_hardware_unsetup();
	kvm_arch_exit();
	kvm_exit_debug();
	__free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);