/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "iodev.h"

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/sysdev.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>

#include <asm/processor.h>
#include <asm/io.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

DEFINE_SPINLOCK(kvm_lock);
LIST_HEAD(vm_list);

static cpumask_t cpus_hardware_enabled;

struct kmem_cache *kvm_vcpu_cache;
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);

static __read_mostly struct preempt_ops kvm_preempt_ops;

static struct dentry *debugfs_dir;

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);

static inline int valid_vcpu(int n)
{
	return likely(n >= 0 && n < KVM_MAX_VCPUS);
}

/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
void vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu;

	mutex_lock(&vcpu->mutex);
	cpu = get_cpu();
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
}

void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	preempt_enable();
	mutex_unlock(&vcpu->mutex);
}

static void ack_flush(void *_completed)
{
}

void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	int i, cpu;
	cpumask_t cpus;
	struct kvm_vcpu *vcpu;

	cpus_clear(cpus);
	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
		vcpu = kvm->vcpus[i];
		if (!vcpu)
			continue;
		if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
			continue;
		cpu = vcpu->cpu;
		if (cpu != -1 && cpu != raw_smp_processor_id())
			cpu_set(cpu, cpus);
	}
	if (cpus_empty(cpus))
		return;
	++kvm->stat.remote_tlb_flush;
	smp_call_function_mask(cpus, ack_flush, NULL, 1);
}

int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	struct page *page;
	int r;

	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
	init_waitqueue_head(&vcpu->wq);

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		r = -ENOMEM;
		goto fail;
	}
	vcpu->run = page_address(page);

	r = kvm_arch_vcpu_init(vcpu);
	if (r < 0)
		goto fail_free_run;
	return 0;

fail_free_run:
	free_page((unsigned long)vcpu->run);
fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_init);
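
/*
 * Reverse of kvm_vcpu_init(): let the arch code tear down its per-vcpu
 * state, then free the shared kvm_run page allocated above.
 */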
void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
{
	kvm_arch_vcpu_uninit(vcpu);
	free_page((unsigned long)vcpu->run);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);

static struct kvm *kvm_create_vm(void)
{
	struct kvm *kvm = kvm_arch_create_vm();

	if (IS_ERR(kvm))
		goto out;

	kvm->mm = current->mm;
	atomic_inc(&kvm->mm->mm_count);
	spin_lock_init(&kvm->mmu_lock);
	kvm_io_bus_init(&kvm->pio_bus);
	mutex_init(&kvm->lock);
	kvm_io_bus_init(&kvm->mmio_bus);
	spin_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	spin_unlock(&kvm_lock);
out:
	return kvm;
}

/*
 * Free any memory in @free but not in @dont.
 */
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
				  struct kvm_memory_slot *dont)
{
	if (!dont || free->rmap != dont->rmap)
		vfree(free->rmap);

	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
		vfree(free->dirty_bitmap);

	free->npages = 0;
	free->dirty_bitmap = NULL;
	free->rmap = NULL;
}

void kvm_free_physmem(struct kvm *kvm)
{
	int i;

	for (i = 0; i < kvm->nmemslots; ++i)
		kvm_free_physmem_slot(&kvm->memslots[i], NULL);
}
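
/*
 * Final VM teardown: unlink the VM from the global list, destroy its
 * I/O buses and arch state, then drop the mm reference taken in
 * kvm_create_vm().
 */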
static void kvm_destroy_vm(struct kvm *kvm)
{
	struct mm_struct *mm = kvm->mm;

	spin_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	spin_unlock(&kvm_lock);
	kvm_io_bus_destroy(&kvm->pio_bus);
	kvm_io_bus_destroy(&kvm->mmio_bus);
	kvm_arch_destroy_vm(kvm);
	mmdrop(mm);
}

static int kvm_vm_release(struct inode *inode, struct file *filp)
{
	struct kvm *kvm = filp->private_data;

	kvm_destroy_vm(kvm);
	return 0;
}

/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
 *
 * Must be called holding mmap_sem for write.
 */
int __kvm_set_memory_region(struct kvm *kvm,
			    struct kvm_userspace_memory_region *mem,
			    int user_alloc)
{
	int r;
	gfn_t base_gfn;
	unsigned long npages;
	unsigned long i;
	struct kvm_memory_slot *memslot;
	struct kvm_memory_slot old, new;

	r = -EINVAL;
	/* General sanity checks */
	if (mem->memory_size & (PAGE_SIZE - 1))
		goto out;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		goto out;
	if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
		goto out;
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
		goto out;

	memslot = &kvm->memslots[mem->slot];
	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
	npages = mem->memory_size >> PAGE_SHIFT;

	if (!npages)
		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;

	new = old = *memslot;

	new.base_gfn = base_gfn;
	new.npages = npages;
	new.flags = mem->flags;

	/* Disallow changing a memory slot's size. */
	r = -EINVAL;
	if (npages && old.npages && npages != old.npages)
		goto out_free;

	/* Check for overlaps */
	r = -EEXIST;
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *s = &kvm->memslots[i];

		if (s == memslot)
			continue;
		if (!((base_gfn + npages <= s->base_gfn) ||
		      (base_gfn >= s->base_gfn + s->npages)))
			goto out_free;
	}

	/* Free page dirty bitmap if unneeded */
	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
		new.dirty_bitmap = NULL;

	r = -ENOMEM;

	/* Allocate if a slot is being created */
	if (npages && !new.rmap) {
		new.rmap = vmalloc(npages * sizeof(struct page *));

		if (!new.rmap)
			goto out_free;

		memset(new.rmap, 0, npages * sizeof(*new.rmap));

		new.user_alloc = user_alloc;
		new.userspace_addr = mem->userspace_addr;
	}

	/* Allocate page dirty bitmap if needed */
	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
		unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;

		new.dirty_bitmap = vmalloc(dirty_bytes);
		if (!new.dirty_bitmap)
			goto out_free;
		memset(new.dirty_bitmap, 0, dirty_bytes);
	}

	if (mem->slot >= kvm->nmemslots)
		kvm->nmemslots = mem->slot + 1;

	*memslot = new;

	r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
	if (r) {
		*memslot = old;
		goto out_free;
	}

	kvm_free_physmem_slot(&old, &new);
	return 0;

out_free:
	kvm_free_physmem_slot(&new, &old);
out:
	return r;
}
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);

int kvm_set_memory_region(struct kvm *kvm,
			  struct kvm_userspace_memory_region *mem,
			  int user_alloc)
{
	int r;

	down_write(&current->mm->mmap_sem);
	r = __kvm_set_memory_region(kvm, mem, user_alloc);
	up_write(&current->mm->mmap_sem);
	return r;
}
EXPORT_SYMBOL_GPL(kvm_set_memory_region);

int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
				   struct kvm_userspace_memory_region *mem,
				   int user_alloc)
{
	if (mem->slot >= KVM_MEMORY_SLOTS)
		return -EINVAL;
	return kvm_set_memory_region(kvm, mem, user_alloc);
}

int kvm_get_dirty_log(struct kvm *kvm,
		      struct kvm_dirty_log *log, int *is_dirty)
{
	struct kvm_memory_slot *memslot;
	int r, i;
	int n;
	unsigned long any = 0;

	r = -EINVAL;
	if (log->slot >= KVM_MEMORY_SLOTS)
		goto out;

	memslot = &kvm->memslots[log->slot];
	r = -ENOENT;
	if (!memslot->dirty_bitmap)
		goto out;

	n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;

	for (i = 0; !any && i < n/sizeof(long); ++i)
		any = memslot->dirty_bitmap[i];

	r = -EFAULT;
	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
		goto out;

	if (any)
		*is_dirty = 1;

	r = 0;
out:
	return r;
}

int is_error_page(struct page *page)
{
	return page == bad_page;
}
EXPORT_SYMBOL_GPL(is_error_page);

static inline unsigned long bad_hva(void)
{
	return PAGE_OFFSET;
}

int kvm_is_error_hva(unsigned long addr)
{
	return addr == bad_hva();
}
EXPORT_SYMBOL_GPL(kvm_is_error_hva);
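
/*
 * Find the memory slot containing @gfn, or NULL if the gfn is not backed
 * by any slot.  gfn_to_memslot() below additionally resolves gfn aliases
 * via unalias_gfn() before the lookup.
 */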
static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	int i;

	for (i = 0; i < kvm->nmemslots; ++i) {
		struct kvm_memory_slot *memslot = &kvm->memslots[i];

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages)
			return memslot;
	}
	return NULL;
}

struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
{
	gfn = unalias_gfn(kvm, gfn);
	return __gfn_to_memslot(kvm, gfn);
}

int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
	int i;

	gfn = unalias_gfn(kvm, gfn);
	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
		struct kvm_memory_slot *memslot = &kvm->memslots[i];

		if (gfn >= memslot->base_gfn
		    && gfn < memslot->base_gfn + memslot->npages)
			return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);

static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *slot;

	gfn = unalias_gfn(kvm, gfn);
	slot = __gfn_to_memslot(kvm, gfn);
	if (!slot)
		return bad_hva();
	return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
}

/*
 * Requires current->mm->mmap_sem to be held
 */
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
	struct page *page[1];
	unsigned long addr;
	int npages;

	might_sleep();

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr)) {
		get_page(bad_page);
		return bad_page;
	}

	npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page,
				NULL);

	if (npages != 1) {
		get_page(bad_page);
		return bad_page;
	}

	return page[0];
}
EXPORT_SYMBOL_GPL(gfn_to_page);

void kvm_release_page_clean(struct page *page)
{
	put_page(page);
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);

void kvm_release_page_dirty(struct page *page)
{
	if (!PageReserved(page))
		SetPageDirty(page);
	put_page(page);
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);

static int next_segment(unsigned long len, int offset)
{
	if (len > PAGE_SIZE - offset)
		return PAGE_SIZE - offset;
	else
		return len;
}

int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
			int len)
{
	int r;
	unsigned long addr;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	r = copy_from_user(data, (void __user *)addr + offset, len);
	if (r)
		return -EFAULT;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page);

int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		data += seg;
		++gfn;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_read_guest);

int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
			  unsigned long len)
{
	int r;
	unsigned long addr;
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int offset = offset_in_page(gpa);

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
	if (r)
		return -EFAULT;
	return 0;
}
EXPORT_SYMBOL(kvm_read_guest_atomic);
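
/*
 * Copy @len bytes from @data into the guest page @gfn, starting at @offset
 * within the page, and mark that page dirty for dirty logging.
 */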
int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
			 int offset, int len)
{
	int r;
	unsigned long addr;

	addr = gfn_to_hva(kvm, gfn);
	if (kvm_is_error_hva(addr))
		return -EFAULT;
	r = copy_to_user((void __user *)addr + offset, data, len);
	if (r)
		return -EFAULT;
	mark_page_dirty(kvm, gfn);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_write_guest_page);

int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
		    unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		data += seg;
		++gfn;
	}
	return 0;
}

int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
{
	return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_clear_guest_page);

int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
{
	gfn_t gfn = gpa >> PAGE_SHIFT;
	int seg;
	int offset = offset_in_page(gpa);
	int ret;

	while ((seg = next_segment(len, offset)) != 0) {
		ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
		if (ret < 0)
			return ret;
		offset = 0;
		len -= seg;
		++gfn;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_clear_guest);

void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_memory_slot *memslot;

	gfn = unalias_gfn(kvm, gfn);
	memslot = __gfn_to_memslot(kvm, gfn);
	if (memslot && memslot->dirty_bitmap) {
		unsigned long rel_gfn = gfn - memslot->base_gfn;

		/* avoid RMW */
		if (!test_bit(rel_gfn, memslot->dirty_bitmap))
			set_bit(rel_gfn, memslot->dirty_bitmap);
	}
}

/*
 * The vCPU has executed a HLT instruction with in-kernel mode enabled.
 */
void kvm_vcpu_block(struct kvm_vcpu *vcpu)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(&vcpu->wq, &wait);

	/*
	 * We will block until either an interrupt or a signal wakes us up
	 */
	while (!kvm_cpu_has_interrupt(vcpu)
	       && !signal_pending(current)
	       && !kvm_arch_vcpu_runnable(vcpu)) {
		set_current_state(TASK_INTERRUPTIBLE);
		vcpu_put(vcpu);
		schedule();
		vcpu_load(vcpu);
	}

	__set_current_state(TASK_RUNNING);
	remove_wait_queue(&vcpu->wq, &wait);
}

void kvm_resched(struct kvm_vcpu *vcpu)
{
	if (!need_resched())
		return;
	cond_resched();
}
EXPORT_SYMBOL_GPL(kvm_resched);

static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct kvm_vcpu *vcpu = vma->vm_file->private_data;
	struct page *page;

	if (vmf->pgoff == 0)
		page = virt_to_page(vcpu->run);
	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
		page = virt_to_page(vcpu->arch.pio_data);
	else
		return VM_FAULT_SIGBUS;
	get_page(page);
	vmf->page = page;
	return 0;
}

static struct vm_operations_struct kvm_vcpu_vm_ops = {
	.fault = kvm_vcpu_fault,
};

static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vcpu_vm_ops;
	return 0;
}

static int kvm_vcpu_release(struct inode *inode, struct file *filp)
{
	struct kvm_vcpu *vcpu = filp->private_data;

	fput(vcpu->kvm->filp);
	return 0;
}

static struct file_operations kvm_vcpu_fops = {
	.release        = kvm_vcpu_release,
	.unlocked_ioctl = kvm_vcpu_ioctl,
	.compat_ioctl   = kvm_vcpu_ioctl,
	.mmap           = kvm_vcpu_mmap,
};
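
/*
 * A vcpu fd holds an extra reference on the VM's file (taken in
 * create_vcpu_fd() below, dropped in kvm_vcpu_release() above), so the VM
 * cannot go away while any of its vcpu fds is still open.
 */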
/*
 * Allocates an inode for the vcpu.
 */
static int create_vcpu_fd(struct kvm_vcpu *vcpu)
{
	int fd, r;
	struct inode *inode;
	struct file *file;

	r = anon_inode_getfd(&fd, &inode, &file,
			     "kvm-vcpu", &kvm_vcpu_fops, vcpu);
	if (r)
		return r;
	atomic_inc(&vcpu->kvm->filp->f_count);
	return fd;
}

/*
 * Creates some virtual cpus.  Good luck creating more than one.
 */
static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
{
	int r;
	struct kvm_vcpu *vcpu;

	if (!valid_vcpu(n))
		return -EINVAL;

	vcpu = kvm_arch_vcpu_create(kvm, n);
	if (IS_ERR(vcpu))
		return PTR_ERR(vcpu);

	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);

	r = kvm_arch_vcpu_setup(vcpu);
	if (r)
		goto vcpu_destroy;

	mutex_lock(&kvm->lock);
	if (kvm->vcpus[n]) {
		r = -EEXIST;
		mutex_unlock(&kvm->lock);
		goto vcpu_destroy;
	}
	kvm->vcpus[n] = vcpu;
	mutex_unlock(&kvm->lock);

	/* Now it's all set up, let userspace reach it */
	r = create_vcpu_fd(vcpu);
	if (r < 0)
		goto unlink;
	return r;

unlink:
	mutex_lock(&kvm->lock);
	kvm->vcpus[n] = NULL;
	mutex_unlock(&kvm->lock);
vcpu_destroy:
	kvm_arch_vcpu_destroy(vcpu);
	return r;
}

static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
{
	if (sigset) {
		sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
		vcpu->sigset_active = 1;
		vcpu->sigset = *sigset;
	} else
		vcpu->sigset_active = 0;
	return 0;
}
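
/*
 * Dispatcher for the per-vcpu ioctls.  Generic commands are handled here;
 * anything else falls through to kvm_arch_vcpu_ioctl().  The mm check
 * rejects callers that are not the process which created the VM.
 */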
static long kvm_vcpu_ioctl(struct file *filp,
			   unsigned int ioctl, unsigned long arg)
{
	struct kvm_vcpu *vcpu = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;

	if (vcpu->kvm->mm != current->mm)
		return -EIO;
	switch (ioctl) {
	case KVM_RUN:
		r = -EINVAL;
		if (arg)
			goto out;
		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
		break;
	case KVM_GET_REGS: {
		struct kvm_regs kvm_regs;

		memset(&kvm_regs, 0, sizeof kvm_regs);
		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_REGS: {
		struct kvm_regs kvm_regs;

		r = -EFAULT;
		if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_GET_SREGS: {
		struct kvm_sregs kvm_sregs;

		memset(&kvm_sregs, 0, sizeof kvm_sregs);
		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_SREGS: {
		struct kvm_sregs kvm_sregs;

		r = -EFAULT;
		if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_TRANSLATE: {
		struct kvm_translation tr;

		r = -EFAULT;
		if (copy_from_user(&tr, argp, sizeof tr))
			goto out;
		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &tr, sizeof tr))
			goto out;
		r = 0;
		break;
	}
	case KVM_DEBUG_GUEST: {
		struct kvm_debug_guest dbg;

		r = -EFAULT;
		if (copy_from_user(&dbg, argp, sizeof dbg))
			goto out;
		r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg);
		if (r)
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_SIGNAL_MASK: {
		struct kvm_signal_mask __user *sigmask_arg = argp;
		struct kvm_signal_mask kvm_sigmask;
		sigset_t sigset, *p;

		p = NULL;
		if (argp) {
			r = -EFAULT;
			if (copy_from_user(&kvm_sigmask, argp,
					   sizeof kvm_sigmask))
				goto out;
			r = -EINVAL;
			if (kvm_sigmask.len != sizeof sigset)
				goto out;
			r = -EFAULT;
			if (copy_from_user(&sigset, sigmask_arg->sigset,
					   sizeof sigset))
				goto out;
			p = &sigset;
		}
		/* Pass p, not &sigset: a NULL argp clears the mask and must
		 * not hand an uninitialized sigset down. */
		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
		break;
	}
	case KVM_GET_FPU: {
		struct kvm_fpu fpu;

		memset(&fpu, 0, sizeof fpu);
		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu);
		if (r)
			goto out;
		r = -EFAULT;
		if (copy_to_user(argp, &fpu, sizeof fpu))
			goto out;
		r = 0;
		break;
	}
	case KVM_SET_FPU: {
		struct kvm_fpu fpu;

		r = -EFAULT;
		if (copy_from_user(&fpu, argp, sizeof fpu))
			goto out;
		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu);
		if (r)
			goto out;
		r = 0;
		break;
	}
	default:
		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
	}
out:
	return r;
}

static long kvm_vm_ioctl(struct file *filp,
			 unsigned int ioctl, unsigned long arg)
{
	struct kvm *kvm = filp->private_data;
	void __user *argp = (void __user *)arg;
	int r;

	if (kvm->mm != current->mm)
		return -EIO;
	switch (ioctl) {
	case KVM_CREATE_VCPU:
		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
		if (r < 0)
			goto out;
		break;
	case KVM_SET_USER_MEMORY_REGION: {
		struct kvm_userspace_memory_region kvm_userspace_mem;

		r = -EFAULT;
		if (copy_from_user(&kvm_userspace_mem, argp,
				   sizeof kvm_userspace_mem))
			goto out;

		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
		if (r)
			goto out;
		break;
	}
	case KVM_GET_DIRTY_LOG: {
		struct kvm_dirty_log log;

		r = -EFAULT;
		if (copy_from_user(&log, argp, sizeof log))
			goto out;
		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
		if (r)
			goto out;
		break;
	}
	default:
		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
	}
out:
	return r;
}

static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct kvm *kvm = vma->vm_file->private_data;
	struct page *page;

	if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
		return VM_FAULT_SIGBUS;
	page = gfn_to_page(kvm, vmf->pgoff);
	if (is_error_page(page)) {
		kvm_release_page_clean(page);
		return VM_FAULT_SIGBUS;
	}
	vmf->page = page;
	return 0;
}

static struct vm_operations_struct kvm_vm_vm_ops = {
	.fault = kvm_vm_fault,
};

static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &kvm_vm_vm_ops;
	return 0;
}

static struct file_operations kvm_vm_fops = {
	.release        = kvm_vm_release,
	.unlocked_ioctl = kvm_vm_ioctl,
	.compat_ioctl   = kvm_vm_ioctl,
	.mmap           = kvm_vm_mmap,
};
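
/*
 * Create a VM and back it with an anonymous inode; the resulting fd is
 * handed to userspace and serves as the handle for all VM-level ioctls.
 */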
static int kvm_dev_ioctl_create_vm(void)
{
	int fd, r;
	struct inode *inode;
	struct file *file;
	struct kvm *kvm;

	kvm = kvm_create_vm();
	if (IS_ERR(kvm))
		return PTR_ERR(kvm);
	r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
	if (r) {
		kvm_destroy_vm(kvm);
		return r;
	}

	kvm->filp = file;

	return fd;
}

static long kvm_dev_ioctl(struct file *filp,
			  unsigned int ioctl, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	long r = -EINVAL;

	switch (ioctl) {
	case KVM_GET_API_VERSION:
		r = -EINVAL;
		if (arg)
			goto out;
		r = KVM_API_VERSION;
		break;
	case KVM_CREATE_VM:
		r = -EINVAL;
		if (arg)
			goto out;
		r = kvm_dev_ioctl_create_vm();
		break;
	case KVM_CHECK_EXTENSION:
		r = kvm_dev_ioctl_check_extension((long)argp);
		break;
	case KVM_GET_VCPU_MMAP_SIZE:
		r = -EINVAL;
		if (arg)
			goto out;
		r = 2 * PAGE_SIZE;
		break;
	default:
		return kvm_arch_dev_ioctl(filp, ioctl, arg);
	}
out:
	return r;
}

static struct file_operations kvm_chardev_ops = {
	.unlocked_ioctl = kvm_dev_ioctl,
	.compat_ioctl   = kvm_dev_ioctl,
};

static struct miscdevice kvm_dev = {
	KVM_MINOR,
	"kvm",
	&kvm_chardev_ops,
};
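
/*
 * hardware_enable()/hardware_disable() toggle the virtualization
 * extensions on the local cpu.  cpus_hardware_enabled keeps them
 * idempotent, so cpu hotplug, reboot, suspend/resume and module exit
 * can all call them without tracking state themselves.
 */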
static void hardware_enable(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (cpu_isset(cpu, cpus_hardware_enabled))
		return;
	cpu_set(cpu, cpus_hardware_enabled);
	kvm_arch_hardware_enable(NULL);
}

static void hardware_disable(void *junk)
{
	int cpu = raw_smp_processor_id();

	if (!cpu_isset(cpu, cpus_hardware_enabled))
		return;
	cpu_clear(cpu, cpus_hardware_enabled);
	decache_vcpus_on_cpu(cpu);
	kvm_arch_hardware_disable(NULL);
}

static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
			   void *v)
{
	int cpu = (long)v;

	val &= ~CPU_TASKS_FROZEN;
	switch (val) {
	case CPU_DYING:
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
		hardware_disable(NULL);
		break;
	case CPU_UP_CANCELED:
		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
		       cpu);
		smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
		break;
	case CPU_ONLINE:
		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
		       cpu);
		smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
		break;
	}
	return NOTIFY_OK;
}

static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
		      void *v)
{
	if (val == SYS_RESTART) {
		/*
		 * Some (well, at least mine) BIOSes hang on reboot if
		 * in vmx root mode.
		 */
		printk(KERN_INFO "kvm: exiting hardware virtualization\n");
		on_each_cpu(hardware_disable, NULL, 0, 1);
	}
	return NOTIFY_OK;
}

static struct notifier_block kvm_reboot_notifier = {
	.notifier_call = kvm_reboot,
	.priority = 0,
};

void kvm_io_bus_init(struct kvm_io_bus *bus)
{
	memset(bus, 0, sizeof(*bus));
}

void kvm_io_bus_destroy(struct kvm_io_bus *bus)
{
	int i;

	for (i = 0; i < bus->dev_count; i++) {
		struct kvm_io_device *pos = bus->devs[i];

		kvm_iodevice_destructor(pos);
	}
}

struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
{
	int i;

	for (i = 0; i < bus->dev_count; i++) {
		struct kvm_io_device *pos = bus->devs[i];

		if (pos->in_range(pos, addr))
			return pos;
	}

	return NULL;
}

void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
{
	BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));

	bus->devs[bus->dev_count++] = dev;
}

static struct notifier_block kvm_cpu_notifier = {
	.notifier_call = kvm_cpu_hotplug,
	.priority = 20, /* must be > scheduler priority */
};

static u64 vm_stat_get(void *_offset)
{
	unsigned offset = (long)_offset;
	u64 total = 0;
	struct kvm *kvm;

	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		total += *(u32 *)((void *)kvm + offset);
	spin_unlock(&kvm_lock);
	return total;
}

DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");

static u64 vcpu_stat_get(void *_offset)
{
	unsigned offset = (long)_offset;
	u64 total = 0;
	struct kvm *kvm;
	struct kvm_vcpu *vcpu;
	int i;

	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
			vcpu = kvm->vcpus[i];
			if (vcpu)
				total += *(u32 *)((void *)vcpu + offset);
		}
	spin_unlock(&kvm_lock);
	return total;
}

DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");

static struct file_operations *stat_fops[] = {
	[KVM_STAT_VCPU] = &vcpu_stat_fops,
	[KVM_STAT_VM]   = &vm_stat_fops,
};

static void kvm_init_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	debugfs_dir = debugfs_create_dir("kvm", NULL);
	for (p = debugfs_entries; p->name; ++p)
		p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
						(void *)(long)p->offset,
						stat_fops[p->kind]);
}

static void kvm_exit_debug(void)
{
	struct kvm_stats_debugfs_item *p;

	for (p = debugfs_entries; p->name; ++p)
		debugfs_remove(p->dentry);
	debugfs_remove(debugfs_dir);
}

static int kvm_suspend(struct sys_device *dev, pm_message_t state)
{
	hardware_disable(NULL);
	return 0;
}

static int kvm_resume(struct sys_device *dev)
{
	hardware_enable(NULL);
	return 0;
}

static struct sysdev_class kvm_sysdev_class = {
	.name = "kvm",
	.suspend = kvm_suspend,
	.resume = kvm_resume,
};

static struct sys_device kvm_sysdev = {
	.id = 0,
	.cls = &kvm_sysdev_class,
};

struct page *bad_page;
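
/*
 * Preempt notifier hooks: while a task runs between vcpu_load() and
 * vcpu_put(), these save and restore guest-related cpu state whenever the
 * scheduler takes the task off a cpu or puts it back on one.
 */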
static inline
struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
{
	return container_of(pn, struct kvm_vcpu, preempt_notifier);
}

static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_load(vcpu, cpu);
}

static void kvm_sched_out(struct preempt_notifier *pn,
			  struct task_struct *next)
{
	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);

	kvm_arch_vcpu_put(vcpu);
}

int kvm_init(void *opaque, unsigned int vcpu_size,
	     struct module *module)
{
	int r;
	int cpu;

	kvm_init_debug();

	r = kvm_arch_init(opaque);
	if (r)
		goto out_fail;

	bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);

	if (bad_page == NULL) {
		r = -ENOMEM;
		goto out;
	}

	r = kvm_arch_hardware_setup();
	if (r < 0)
		goto out_free_0;

	for_each_online_cpu(cpu) {
		smp_call_function_single(cpu,
				kvm_arch_check_processor_compat,
				&r, 0, 1);
		if (r < 0)
			goto out_free_1;
	}

	on_each_cpu(hardware_enable, NULL, 0, 1);
	r = register_cpu_notifier(&kvm_cpu_notifier);
	if (r)
		goto out_free_2;
	register_reboot_notifier(&kvm_reboot_notifier);

	r = sysdev_class_register(&kvm_sysdev_class);
	if (r)
		goto out_free_3;

	r = sysdev_register(&kvm_sysdev);
	if (r)
		goto out_free_4;

	/* A kmem cache lets us meet the alignment requirements of fx_save. */
	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
					   __alignof__(struct kvm_vcpu),
					   0, NULL);
	if (!kvm_vcpu_cache) {
		r = -ENOMEM;
		goto out_free_5;
	}

	kvm_chardev_ops.owner = module;

	r = misc_register(&kvm_dev);
	if (r) {
		printk(KERN_ERR "kvm: misc device register failed\n");
		goto out_free;
	}

	kvm_preempt_ops.sched_in = kvm_sched_in;
	kvm_preempt_ops.sched_out = kvm_sched_out;

	return 0;

out_free:
	kmem_cache_destroy(kvm_vcpu_cache);
out_free_5:
	sysdev_unregister(&kvm_sysdev);
out_free_4:
	sysdev_class_unregister(&kvm_sysdev_class);
out_free_3:
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
out_free_2:
	on_each_cpu(hardware_disable, NULL, 0, 1);
out_free_1:
	kvm_arch_hardware_unsetup();
out_free_0:
	__free_page(bad_page);
out:
	kvm_arch_exit();
	kvm_exit_debug();
out_fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_init);

void kvm_exit(void)
{
	misc_deregister(&kvm_dev);
	kmem_cache_destroy(kvm_vcpu_cache);
	sysdev_unregister(&kvm_sysdev);
	sysdev_class_unregister(&kvm_sysdev_class);
	unregister_reboot_notifier(&kvm_reboot_notifier);
	unregister_cpu_notifier(&kvm_cpu_notifier);
	on_each_cpu(hardware_disable, NULL, 0, 1);
	kvm_arch_hardware_unsetup();
	kvm_arch_exit();
	kvm_exit_debug();
	__free_page(bad_page);
}
EXPORT_SYMBOL_GPL(kvm_exit);