xref: /openbmc/linux/virt/kvm/kvm_main.c (revision a1e58bbd)
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  * This work is licensed under the terms of the GNU GPL, version 2.  See
14  * the COPYING file in the top-level directory.
15  *
16  */
17 
18 #include "iodev.h"
19 
20 #include <linux/kvm_host.h>
21 #include <linux/kvm.h>
22 #include <linux/module.h>
23 #include <linux/errno.h>
24 #include <linux/percpu.h>
25 #include <linux/gfp.h>
26 #include <linux/mm.h>
27 #include <linux/miscdevice.h>
28 #include <linux/vmalloc.h>
29 #include <linux/reboot.h>
30 #include <linux/debugfs.h>
31 #include <linux/highmem.h>
32 #include <linux/file.h>
33 #include <linux/sysdev.h>
34 #include <linux/cpu.h>
35 #include <linux/sched.h>
36 #include <linux/cpumask.h>
37 #include <linux/smp.h>
38 #include <linux/anon_inodes.h>
39 #include <linux/profile.h>
40 #include <linux/kvm_para.h>
41 #include <linux/pagemap.h>
42 #include <linux/mman.h>
43 
44 #include <asm/processor.h>
45 #include <asm/io.h>
46 #include <asm/uaccess.h>
47 #include <asm/pgtable.h>
48 
49 MODULE_AUTHOR("Qumranet");
50 MODULE_LICENSE("GPL");
51 
52 DEFINE_SPINLOCK(kvm_lock);
53 LIST_HEAD(vm_list);
54 
55 static cpumask_t cpus_hardware_enabled;
56 
57 struct kmem_cache *kvm_vcpu_cache;
58 EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
59 
60 static __read_mostly struct preempt_ops kvm_preempt_ops;
61 
62 static struct dentry *debugfs_dir;
63 
64 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
65 			   unsigned long arg);
66 
67 static inline int valid_vcpu(int n)
68 {
69 	return likely(n >= 0 && n < KVM_MAX_VCPUS);
70 }
71 
72 /*
73  * Switches to the specified vcpu, until a matching vcpu_put()
74  */
75 void vcpu_load(struct kvm_vcpu *vcpu)
76 {
77 	int cpu;
78 
79 	mutex_lock(&vcpu->mutex);
80 	cpu = get_cpu();
81 	preempt_notifier_register(&vcpu->preempt_notifier);
82 	kvm_arch_vcpu_load(vcpu, cpu);
83 	put_cpu();
84 }
85 
86 void vcpu_put(struct kvm_vcpu *vcpu)
87 {
88 	preempt_disable();
89 	kvm_arch_vcpu_put(vcpu);
90 	preempt_notifier_unregister(&vcpu->preempt_notifier);
91 	preempt_enable();
92 	mutex_unlock(&vcpu->mutex);
93 }
94 
95 static void ack_flush(void *_completed)
96 {
97 }
98 
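/*
 * Ask every vcpu of @kvm to flush its TLB: set KVM_REQ_TLB_FLUSH on each
 * vcpu and IPI the physical cpus currently running one.  ack_flush() does
 * nothing itself; the IPI only forces those cpus out of guest mode so the
 * request is serviced before the next guest entry.
 */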
99 void kvm_flush_remote_tlbs(struct kvm *kvm)
100 {
101 	int i, cpu;
102 	cpumask_t cpus;
103 	struct kvm_vcpu *vcpu;
104 
105 	cpus_clear(cpus);
106 	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
107 		vcpu = kvm->vcpus[i];
108 		if (!vcpu)
109 			continue;
110 		if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
111 			continue;
112 		cpu = vcpu->cpu;
113 		if (cpu != -1 && cpu != raw_smp_processor_id())
114 			cpu_set(cpu, cpus);
115 	}
116 	if (cpus_empty(cpus))
117 		return;
118 	++kvm->stat.remote_tlb_flush;
119 	smp_call_function_mask(cpus, ack_flush, NULL, 1);
120 }
121 
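/*
 * Architecture-independent vcpu setup: initialize the mutex, wait queue
 * and back-pointer to @kvm, allocate the kvm_run page shared with
 * userspace, then let kvm_arch_vcpu_init() do the rest.
 */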
122 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
123 {
124 	struct page *page;
125 	int r;
126 
127 	mutex_init(&vcpu->mutex);
128 	vcpu->cpu = -1;
129 	vcpu->kvm = kvm;
130 	vcpu->vcpu_id = id;
131 	init_waitqueue_head(&vcpu->wq);
132 
133 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
134 	if (!page) {
135 		r = -ENOMEM;
136 		goto fail;
137 	}
138 	vcpu->run = page_address(page);
139 
140 	r = kvm_arch_vcpu_init(vcpu);
141 	if (r < 0)
142 		goto fail_free_run;
143 	return 0;
144 
145 fail_free_run:
146 	free_page((unsigned long)vcpu->run);
147 fail:
148 	return r;
149 }
150 EXPORT_SYMBOL_GPL(kvm_vcpu_init);
151 
152 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
153 {
154 	kvm_arch_vcpu_uninit(vcpu);
155 	free_page((unsigned long)vcpu->run);
156 }
157 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
158 
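/*
 * Allocate and initialize a VM: pin the creator's mm, set up the PIO and
 * MMIO buses and the locks, and add the VM to the global vm_list.
 */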
159 static struct kvm *kvm_create_vm(void)
160 {
161 	struct kvm *kvm = kvm_arch_create_vm();
162 
163 	if (IS_ERR(kvm))
164 		goto out;
165 
166 	kvm->mm = current->mm;
167 	atomic_inc(&kvm->mm->mm_count);
168 	spin_lock_init(&kvm->mmu_lock);
169 	kvm_io_bus_init(&kvm->pio_bus);
170 	mutex_init(&kvm->lock);
171 	kvm_io_bus_init(&kvm->mmio_bus);
172 	init_rwsem(&kvm->slots_lock);
173 	spin_lock(&kvm_lock);
174 	list_add(&kvm->vm_list, &vm_list);
175 	spin_unlock(&kvm_lock);
176 out:
177 	return kvm;
178 }
179 
180 /*
181  * Free any memory in @free but not in @dont.
182  */
183 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
184 				  struct kvm_memory_slot *dont)
185 {
186 	if (!dont || free->rmap != dont->rmap)
187 		vfree(free->rmap);
188 
189 	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
190 		vfree(free->dirty_bitmap);
191 
192 	free->npages = 0;
193 	free->dirty_bitmap = NULL;
194 	free->rmap = NULL;
195 }
196 
197 void kvm_free_physmem(struct kvm *kvm)
198 {
199 	int i;
200 
201 	for (i = 0; i < kvm->nmemslots; ++i)
202 		kvm_free_physmem_slot(&kvm->memslots[i], NULL);
203 }
204 
205 static void kvm_destroy_vm(struct kvm *kvm)
206 {
207 	struct mm_struct *mm = kvm->mm;
208 
209 	spin_lock(&kvm_lock);
210 	list_del(&kvm->vm_list);
211 	spin_unlock(&kvm_lock);
212 	kvm_io_bus_destroy(&kvm->pio_bus);
213 	kvm_io_bus_destroy(&kvm->mmio_bus);
214 	kvm_arch_destroy_vm(kvm);
215 	mmdrop(mm);
216 }
217 
218 static int kvm_vm_release(struct inode *inode, struct file *filp)
219 {
220 	struct kvm *kvm = filp->private_data;
221 
222 	kvm_destroy_vm(kvm);
223 	return 0;
224 }
225 
226 /*
227  * Allocate some memory and give it an address in the guest physical address
228  * space.
229  *
230  * Discontiguous memory is allowed, mostly for framebuffers.
231  *
232  * Must be called holding kvm->slots_lock for write.
233  */
234 int __kvm_set_memory_region(struct kvm *kvm,
235 			    struct kvm_userspace_memory_region *mem,
236 			    int user_alloc)
237 {
238 	int r;
239 	gfn_t base_gfn;
240 	unsigned long npages;
241 	unsigned long i;
242 	struct kvm_memory_slot *memslot;
243 	struct kvm_memory_slot old, new;
244 
245 	r = -EINVAL;
246 	/* General sanity checks */
247 	if (mem->memory_size & (PAGE_SIZE - 1))
248 		goto out;
249 	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
250 		goto out;
251 	if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
252 		goto out;
253 	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
254 		goto out;
255 
256 	memslot = &kvm->memslots[mem->slot];
257 	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
258 	npages = mem->memory_size >> PAGE_SHIFT;
259 
260 	if (!npages)
261 		mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
262 
263 	new = old = *memslot;
264 
265 	new.base_gfn = base_gfn;
266 	new.npages = npages;
267 	new.flags = mem->flags;
268 
269 	/* Disallow changing a memory slot's size. */
270 	r = -EINVAL;
271 	if (npages && old.npages && npages != old.npages)
272 		goto out_free;
273 
274 	/* Check for overlaps */
275 	r = -EEXIST;
276 	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
277 		struct kvm_memory_slot *s = &kvm->memslots[i];
278 
279 		if (s == memslot)
280 			continue;
281 		if (!((base_gfn + npages <= s->base_gfn) ||
282 		      (base_gfn >= s->base_gfn + s->npages)))
283 			goto out_free;
284 	}
285 
286 	/* Free page dirty bitmap if unneeded */
287 	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
288 		new.dirty_bitmap = NULL;
289 
290 	r = -ENOMEM;
291 
292 	/* Allocate if a slot is being created */
293 	if (npages && !new.rmap) {
294 		new.rmap = vmalloc(npages * sizeof(struct page *));
295 
296 		if (!new.rmap)
297 			goto out_free;
298 
299 		memset(new.rmap, 0, npages * sizeof(*new.rmap));
300 
301 		new.user_alloc = user_alloc;
302 		new.userspace_addr = mem->userspace_addr;
303 	}
304 
305 	/* Allocate page dirty bitmap if needed */
306 	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
307 		unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
308 
309 		new.dirty_bitmap = vmalloc(dirty_bytes);
310 		if (!new.dirty_bitmap)
311 			goto out_free;
312 		memset(new.dirty_bitmap, 0, dirty_bytes);
313 	}
314 
315 	if (mem->slot >= kvm->nmemslots)
316 		kvm->nmemslots = mem->slot + 1;
317 
318 	*memslot = new;
319 
320 	r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc);
321 	if (r) {
322 		*memslot = old;
323 		goto out_free;
324 	}
325 
326 	kvm_free_physmem_slot(&old, &new);
327 	return 0;
328 
329 out_free:
330 	kvm_free_physmem_slot(&new, &old);
331 out:
332 	return r;
333 
334 }
335 EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
336 
337 int kvm_set_memory_region(struct kvm *kvm,
338 			  struct kvm_userspace_memory_region *mem,
339 			  int user_alloc)
340 {
341 	int r;
342 
343 	down_write(&kvm->slots_lock);
344 	r = __kvm_set_memory_region(kvm, mem, user_alloc);
345 	up_write(&kvm->slots_lock);
346 	return r;
347 }
348 EXPORT_SYMBOL_GPL(kvm_set_memory_region);
349 
350 int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
351 				   struct
352 				   kvm_userspace_memory_region *mem,
353 				   int user_alloc)
354 {
355 	if (mem->slot >= KVM_MEMORY_SLOTS)
356 		return -EINVAL;
357 	return kvm_set_memory_region(kvm, mem, user_alloc);
358 }
359 
360 int kvm_get_dirty_log(struct kvm *kvm,
361 			struct kvm_dirty_log *log, int *is_dirty)
362 {
363 	struct kvm_memory_slot *memslot;
364 	int r, i;
365 	int n;
366 	unsigned long any = 0;
367 
368 	r = -EINVAL;
369 	if (log->slot >= KVM_MEMORY_SLOTS)
370 		goto out;
371 
372 	memslot = &kvm->memslots[log->slot];
373 	r = -ENOENT;
374 	if (!memslot->dirty_bitmap)
375 		goto out;
376 
377 	n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
378 
379 	for (i = 0; !any && i < n/sizeof(long); ++i)
380 		any = memslot->dirty_bitmap[i];
381 
382 	r = -EFAULT;
383 	if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
384 		goto out;
385 
386 	if (any)
387 		*is_dirty = 1;
388 
389 	r = 0;
390 out:
391 	return r;
392 }
393 
394 int is_error_page(struct page *page)
395 {
396 	return page == bad_page;
397 }
398 EXPORT_SYMBOL_GPL(is_error_page);
399 
400 static inline unsigned long bad_hva(void)
401 {
402 	return PAGE_OFFSET;
403 }
404 
405 int kvm_is_error_hva(unsigned long addr)
406 {
407 	return addr == bad_hva();
408 }
409 EXPORT_SYMBOL_GPL(kvm_is_error_hva);
410 
411 static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
412 {
413 	int i;
414 
415 	for (i = 0; i < kvm->nmemslots; ++i) {
416 		struct kvm_memory_slot *memslot = &kvm->memslots[i];
417 
418 		if (gfn >= memslot->base_gfn
419 		    && gfn < memslot->base_gfn + memslot->npages)
420 			return memslot;
421 	}
422 	return NULL;
423 }
424 
425 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
426 {
427 	gfn = unalias_gfn(kvm, gfn);
428 	return __gfn_to_memslot(kvm, gfn);
429 }
430 
431 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
432 {
433 	int i;
434 
435 	gfn = unalias_gfn(kvm, gfn);
436 	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
437 		struct kvm_memory_slot *memslot = &kvm->memslots[i];
438 
439 		if (gfn >= memslot->base_gfn
440 		    && gfn < memslot->base_gfn + memslot->npages)
441 			return 1;
442 	}
443 	return 0;
444 }
445 EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
446 
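/*
 * Translate a guest frame number into the host userspace virtual address
 * backing it, or bad_hva() if the gfn is outside every memory slot.
 */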
447 static unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
448 {
449 	struct kvm_memory_slot *slot;
450 
451 	gfn = unalias_gfn(kvm, gfn);
452 	slot = __gfn_to_memslot(kvm, gfn);
453 	if (!slot)
454 		return bad_hva();
455 	return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
456 }
457 
458 /*
459  * Requires current->mm->mmap_sem to be held
460  */
461 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
462 {
463 	struct page *page[1];
464 	unsigned long addr;
465 	int npages;
466 
467 	might_sleep();
468 
469 	addr = gfn_to_hva(kvm, gfn);
470 	if (kvm_is_error_hva(addr)) {
471 		get_page(bad_page);
472 		return bad_page;
473 	}
474 
475 	npages = get_user_pages(current, current->mm, addr, 1, 1, 1, page,
476 				NULL);
477 
478 	if (npages != 1) {
479 		get_page(bad_page);
480 		return bad_page;
481 	}
482 
483 	return page[0];
484 }
485 
486 EXPORT_SYMBOL_GPL(gfn_to_page);
487 
488 void kvm_release_page_clean(struct page *page)
489 {
490 	put_page(page);
491 }
492 EXPORT_SYMBOL_GPL(kvm_release_page_clean);
493 
494 void kvm_release_page_dirty(struct page *page)
495 {
496 	if (!PageReserved(page))
497 		SetPageDirty(page);
498 	put_page(page);
499 }
500 EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
501 
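/* How many of the remaining @len bytes fit in the page starting at @offset? */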
502 static int next_segment(unsigned long len, int offset)
503 {
504 	if (len > PAGE_SIZE - offset)
505 		return PAGE_SIZE - offset;
506 	else
507 		return len;
508 }
509 
510 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
511 			int len)
512 {
513 	int r;
514 	unsigned long addr;
515 
516 	addr = gfn_to_hva(kvm, gfn);
517 	if (kvm_is_error_hva(addr))
518 		return -EFAULT;
519 	r = copy_from_user(data, (void __user *)addr + offset, len);
520 	if (r)
521 		return -EFAULT;
522 	return 0;
523 }
524 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
525 
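/*
 * Copy @len bytes from guest physical address @gpa into @data, splitting
 * the copy at page boundaries via kvm_read_guest_page().
 */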
526 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
527 {
528 	gfn_t gfn = gpa >> PAGE_SHIFT;
529 	int seg;
530 	int offset = offset_in_page(gpa);
531 	int ret;
532 
533 	while ((seg = next_segment(len, offset)) != 0) {
534 		ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
535 		if (ret < 0)
536 			return ret;
537 		offset = 0;
538 		len -= seg;
539 		data += seg;
540 		++gfn;
541 	}
542 	return 0;
543 }
544 EXPORT_SYMBOL_GPL(kvm_read_guest);
545 
546 int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data,
547 			  unsigned long len)
548 {
549 	int r;
550 	unsigned long addr;
551 	gfn_t gfn = gpa >> PAGE_SHIFT;
552 	int offset = offset_in_page(gpa);
553 
554 	addr = gfn_to_hva(kvm, gfn);
555 	if (kvm_is_error_hva(addr))
556 		return -EFAULT;
557 	r = __copy_from_user_inatomic(data, (void __user *)addr + offset, len);
558 	if (r)
559 		return -EFAULT;
560 	return 0;
561 }
562 EXPORT_SYMBOL(kvm_read_guest_atomic);
563 
564 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
565 			 int offset, int len)
566 {
567 	int r;
568 	unsigned long addr;
569 
570 	addr = gfn_to_hva(kvm, gfn);
571 	if (kvm_is_error_hva(addr))
572 		return -EFAULT;
573 	r = copy_to_user((void __user *)addr + offset, data, len);
574 	if (r)
575 		return -EFAULT;
576 	mark_page_dirty(kvm, gfn);
577 	return 0;
578 }
579 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
580 
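/*
 * Copy @len bytes from @data into guest physical memory at @gpa, one page
 * at a time; each page touched is marked dirty by kvm_write_guest_page().
 */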
581 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
582 		    unsigned long len)
583 {
584 	gfn_t gfn = gpa >> PAGE_SHIFT;
585 	int seg;
586 	int offset = offset_in_page(gpa);
587 	int ret;
588 
589 	while ((seg = next_segment(len, offset)) != 0) {
590 		ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
591 		if (ret < 0)
592 			return ret;
593 		offset = 0;
594 		len -= seg;
595 		data += seg;
596 		++gfn;
597 	}
598 	return 0;
599 }
600 
601 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
602 {
603 	return kvm_write_guest_page(kvm, gfn, empty_zero_page, offset, len);
604 }
605 EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
606 
607 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
608 {
609 	gfn_t gfn = gpa >> PAGE_SHIFT;
610 	int seg;
611 	int offset = offset_in_page(gpa);
612 	int ret;
613 
614 	while ((seg = next_segment(len, offset)) != 0) {
615 		ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
616 		if (ret < 0)
617 			return ret;
618 		offset = 0;
619 		len -= seg;
620 		++gfn;
621 	}
622 	return 0;
623 }
624 EXPORT_SYMBOL_GPL(kvm_clear_guest);
625 
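/*
 * Record that @gfn was written: if its slot has dirty logging enabled,
 * set the corresponding bit in the slot's dirty bitmap.
 */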
626 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
627 {
628 	struct kvm_memory_slot *memslot;
629 
630 	gfn = unalias_gfn(kvm, gfn);
631 	memslot = __gfn_to_memslot(kvm, gfn);
632 	if (memslot && memslot->dirty_bitmap) {
633 		unsigned long rel_gfn = gfn - memslot->base_gfn;
634 
635 		/* avoid RMW */
636 		if (!test_bit(rel_gfn, memslot->dirty_bitmap))
637 			set_bit(rel_gfn, memslot->dirty_bitmap);
638 	}
639 }
640 
641 /*
642  * The vCPU has executed a HLT instruction with in-kernel mode enabled.
643  */
644 void kvm_vcpu_block(struct kvm_vcpu *vcpu)
645 {
646 	DECLARE_WAITQUEUE(wait, current);
647 
648 	add_wait_queue(&vcpu->wq, &wait);
649 
650 	/*
651 	 * Block until an interrupt or signal arrives or the vcpu is runnable
652 	 */
653 	while (!kvm_cpu_has_interrupt(vcpu)
654 	       && !signal_pending(current)
655 	       && !kvm_arch_vcpu_runnable(vcpu)) {
656 		set_current_state(TASK_INTERRUPTIBLE);
657 		vcpu_put(vcpu);
658 		schedule();
659 		vcpu_load(vcpu);
660 	}
661 
662 	__set_current_state(TASK_RUNNING);
663 	remove_wait_queue(&vcpu->wq, &wait);
664 }
665 
666 void kvm_resched(struct kvm_vcpu *vcpu)
667 {
668 	if (!need_resched())
669 		return;
670 	cond_resched();
671 }
672 EXPORT_SYMBOL_GPL(kvm_resched);
673 
674 static int kvm_vcpu_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
675 {
676 	struct kvm_vcpu *vcpu = vma->vm_file->private_data;
677 	struct page *page;
678 
679 	if (vmf->pgoff == 0)
680 		page = virt_to_page(vcpu->run);
681 	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
682 		page = virt_to_page(vcpu->arch.pio_data);
683 	else
684 		return VM_FAULT_SIGBUS;
685 	get_page(page);
686 	vmf->page = page;
687 	return 0;
688 }
689 
690 static struct vm_operations_struct kvm_vcpu_vm_ops = {
691 	.fault = kvm_vcpu_fault,
692 };
693 
694 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
695 {
696 	vma->vm_ops = &kvm_vcpu_vm_ops;
697 	return 0;
698 }
699 
700 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
701 {
702 	struct kvm_vcpu *vcpu = filp->private_data;
703 
704 	fput(vcpu->kvm->filp);
705 	return 0;
706 }
707 
708 static struct file_operations kvm_vcpu_fops = {
709 	.release        = kvm_vcpu_release,
710 	.unlocked_ioctl = kvm_vcpu_ioctl,
711 	.compat_ioctl   = kvm_vcpu_ioctl,
712 	.mmap           = kvm_vcpu_mmap,
713 };
714 
715 /*
716  * Allocates an inode for the vcpu.
717  */
718 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
719 {
720 	int fd, r;
721 	struct inode *inode;
722 	struct file *file;
723 
724 	r = anon_inode_getfd(&fd, &inode, &file,
725 			     "kvm-vcpu", &kvm_vcpu_fops, vcpu);
726 	if (r)
727 		return r;
728 	atomic_inc(&vcpu->kvm->filp->f_count);
729 	return fd;
730 }
731 
732 /*
733  * Creates some virtual cpus.  Good luck creating more than one.
734  */
735 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
736 {
737 	int r;
738 	struct kvm_vcpu *vcpu;
739 
740 	if (!valid_vcpu(n))
741 		return -EINVAL;
742 
743 	vcpu = kvm_arch_vcpu_create(kvm, n);
744 	if (IS_ERR(vcpu))
745 		return PTR_ERR(vcpu);
746 
747 	preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
748 
749 	r = kvm_arch_vcpu_setup(vcpu);
750 	if (r)
751 		goto vcpu_destroy;
752 
753 	mutex_lock(&kvm->lock);
754 	if (kvm->vcpus[n]) {
755 		r = -EEXIST;
756 		mutex_unlock(&kvm->lock);
757 		goto vcpu_destroy;
758 	}
759 	kvm->vcpus[n] = vcpu;
760 	mutex_unlock(&kvm->lock);
761 
762 	/* Now it's all set up, let userspace reach it */
763 	r = create_vcpu_fd(vcpu);
764 	if (r < 0)
765 		goto unlink;
766 	return r;
767 
768 unlink:
769 	mutex_lock(&kvm->lock);
770 	kvm->vcpus[n] = NULL;
771 	mutex_unlock(&kvm->lock);
772 vcpu_destroy:
773 	kvm_arch_vcpu_destroy(vcpu);
774 	return r;
775 }
776 
777 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
778 {
779 	if (sigset) {
780 		sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
781 		vcpu->sigset_active = 1;
782 		vcpu->sigset = *sigset;
783 	} else
784 		vcpu->sigset_active = 0;
785 	return 0;
786 }
787 
788 static long kvm_vcpu_ioctl(struct file *filp,
789 			   unsigned int ioctl, unsigned long arg)
790 {
791 	struct kvm_vcpu *vcpu = filp->private_data;
792 	void __user *argp = (void __user *)arg;
793 	int r;
794 
795 	if (vcpu->kvm->mm != current->mm)
796 		return -EIO;
797 	switch (ioctl) {
798 	case KVM_RUN:
799 		r = -EINVAL;
800 		if (arg)
801 			goto out;
802 		r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
803 		break;
804 	case KVM_GET_REGS: {
805 		struct kvm_regs kvm_regs;
806 
807 		memset(&kvm_regs, 0, sizeof kvm_regs);
808 		r = kvm_arch_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
809 		if (r)
810 			goto out;
811 		r = -EFAULT;
812 		if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
813 			goto out;
814 		r = 0;
815 		break;
816 	}
817 	case KVM_SET_REGS: {
818 		struct kvm_regs kvm_regs;
819 
820 		r = -EFAULT;
821 		if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
822 			goto out;
823 		r = kvm_arch_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
824 		if (r)
825 			goto out;
826 		r = 0;
827 		break;
828 	}
829 	case KVM_GET_SREGS: {
830 		struct kvm_sregs kvm_sregs;
831 
832 		memset(&kvm_sregs, 0, sizeof kvm_sregs);
833 		r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
834 		if (r)
835 			goto out;
836 		r = -EFAULT;
837 		if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
838 			goto out;
839 		r = 0;
840 		break;
841 	}
842 	case KVM_SET_SREGS: {
843 		struct kvm_sregs kvm_sregs;
844 
845 		r = -EFAULT;
846 		if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
847 			goto out;
848 		r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
849 		if (r)
850 			goto out;
851 		r = 0;
852 		break;
853 	}
854 	case KVM_TRANSLATE: {
855 		struct kvm_translation tr;
856 
857 		r = -EFAULT;
858 		if (copy_from_user(&tr, argp, sizeof tr))
859 			goto out;
860 		r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
861 		if (r)
862 			goto out;
863 		r = -EFAULT;
864 		if (copy_to_user(argp, &tr, sizeof tr))
865 			goto out;
866 		r = 0;
867 		break;
868 	}
869 	case KVM_DEBUG_GUEST: {
870 		struct kvm_debug_guest dbg;
871 
872 		r = -EFAULT;
873 		if (copy_from_user(&dbg, argp, sizeof dbg))
874 			goto out;
875 		r = kvm_arch_vcpu_ioctl_debug_guest(vcpu, &dbg);
876 		if (r)
877 			goto out;
878 		r = 0;
879 		break;
880 	}
881 	case KVM_SET_SIGNAL_MASK: {
882 		struct kvm_signal_mask __user *sigmask_arg = argp;
883 		struct kvm_signal_mask kvm_sigmask;
884 		sigset_t sigset, *p;
885 
886 		p = NULL;
887 		if (argp) {
888 			r = -EFAULT;
889 			if (copy_from_user(&kvm_sigmask, argp,
890 					   sizeof kvm_sigmask))
891 				goto out;
892 			r = -EINVAL;
893 			if (kvm_sigmask.len != sizeof sigset)
894 				goto out;
895 			r = -EFAULT;
896 			if (copy_from_user(&sigset, sigmask_arg->sigset,
897 					   sizeof sigset))
898 				goto out;
899 			p = &sigset;
900 		}
901 		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
902 		break;
903 	}
904 	case KVM_GET_FPU: {
905 		struct kvm_fpu fpu;
906 
907 		memset(&fpu, 0, sizeof fpu);
908 		r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, &fpu);
909 		if (r)
910 			goto out;
911 		r = -EFAULT;
912 		if (copy_to_user(argp, &fpu, sizeof fpu))
913 			goto out;
914 		r = 0;
915 		break;
916 	}
917 	case KVM_SET_FPU: {
918 		struct kvm_fpu fpu;
919 
920 		r = -EFAULT;
921 		if (copy_from_user(&fpu, argp, sizeof fpu))
922 			goto out;
923 		r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, &fpu);
924 		if (r)
925 			goto out;
926 		r = 0;
927 		break;
928 	}
929 	default:
930 		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
931 	}
932 out:
933 	return r;
934 }
935 
936 static long kvm_vm_ioctl(struct file *filp,
937 			   unsigned int ioctl, unsigned long arg)
938 {
939 	struct kvm *kvm = filp->private_data;
940 	void __user *argp = (void __user *)arg;
941 	int r;
942 
943 	if (kvm->mm != current->mm)
944 		return -EIO;
945 	switch (ioctl) {
946 	case KVM_CREATE_VCPU:
947 		r = kvm_vm_ioctl_create_vcpu(kvm, arg);
948 		if (r < 0)
949 			goto out;
950 		break;
951 	case KVM_SET_USER_MEMORY_REGION: {
952 		struct kvm_userspace_memory_region kvm_userspace_mem;
953 
954 		r = -EFAULT;
955 		if (copy_from_user(&kvm_userspace_mem, argp,
956 						sizeof kvm_userspace_mem))
957 			goto out;
958 
959 		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
960 		if (r)
961 			goto out;
962 		break;
963 	}
964 	case KVM_GET_DIRTY_LOG: {
965 		struct kvm_dirty_log log;
966 
967 		r = -EFAULT;
968 		if (copy_from_user(&log, argp, sizeof log))
969 			goto out;
970 		r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
971 		if (r)
972 			goto out;
973 		break;
974 	}
975 	default:
976 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
977 	}
978 out:
979 	return r;
980 }
981 
982 static int kvm_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
983 {
984 	struct kvm *kvm = vma->vm_file->private_data;
985 	struct page *page;
986 
987 	if (!kvm_is_visible_gfn(kvm, vmf->pgoff))
988 		return VM_FAULT_SIGBUS;
989 	page = gfn_to_page(kvm, vmf->pgoff);
990 	if (is_error_page(page)) {
991 		kvm_release_page_clean(page);
992 		return VM_FAULT_SIGBUS;
993 	}
994 	vmf->page = page;
995 	return 0;
996 }
997 
998 static struct vm_operations_struct kvm_vm_vm_ops = {
999 	.fault = kvm_vm_fault,
1000 };
1001 
1002 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
1003 {
1004 	vma->vm_ops = &kvm_vm_vm_ops;
1005 	return 0;
1006 }
1007 
1008 static struct file_operations kvm_vm_fops = {
1009 	.release        = kvm_vm_release,
1010 	.unlocked_ioctl = kvm_vm_ioctl,
1011 	.compat_ioctl   = kvm_vm_ioctl,
1012 	.mmap           = kvm_vm_mmap,
1013 };
1014 
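/*
 * Create a VM and back it with an anonymous inode; the returned file
 * descriptor is what userspace uses to issue VM ioctls.
 */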
1015 static int kvm_dev_ioctl_create_vm(void)
1016 {
1017 	int fd, r;
1018 	struct inode *inode;
1019 	struct file *file;
1020 	struct kvm *kvm;
1021 
1022 	kvm = kvm_create_vm();
1023 	if (IS_ERR(kvm))
1024 		return PTR_ERR(kvm);
1025 	r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
1026 	if (r) {
1027 		kvm_destroy_vm(kvm);
1028 		return r;
1029 	}
1030 
1031 	kvm->filp = file;
1032 
1033 	return fd;
1034 }
1035 
1036 static long kvm_dev_ioctl(struct file *filp,
1037 			  unsigned int ioctl, unsigned long arg)
1038 {
1039 	void __user *argp = (void __user *)arg;
1040 	long r = -EINVAL;
1041 
1042 	switch (ioctl) {
1043 	case KVM_GET_API_VERSION:
1044 		r = -EINVAL;
1045 		if (arg)
1046 			goto out;
1047 		r = KVM_API_VERSION;
1048 		break;
1049 	case KVM_CREATE_VM:
1050 		r = -EINVAL;
1051 		if (arg)
1052 			goto out;
1053 		r = kvm_dev_ioctl_create_vm();
1054 		break;
1055 	case KVM_CHECK_EXTENSION:
1056 		r = kvm_dev_ioctl_check_extension((long)argp);
1057 		break;
1058 	case KVM_GET_VCPU_MMAP_SIZE:
1059 		r = -EINVAL;
1060 		if (arg)
1061 			goto out;
1062 		r = 2 * PAGE_SIZE;
1063 		break;
1064 	default:
1065 		return kvm_arch_dev_ioctl(filp, ioctl, arg);
1066 	}
1067 out:
1068 	return r;
1069 }
1070 
1071 static struct file_operations kvm_chardev_ops = {
1072 	.unlocked_ioctl = kvm_dev_ioctl,
1073 	.compat_ioctl   = kvm_dev_ioctl,
1074 };
1075 
1076 static struct miscdevice kvm_dev = {
1077 	KVM_MINOR,
1078 	"kvm",
1079 	&kvm_chardev_ops,
1080 };
1081 
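/*
 * Enable virtualization on the current cpu, using cpus_hardware_enabled
 * to make sure a cpu is never enabled or disabled twice.
 */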
1082 static void hardware_enable(void *junk)
1083 {
1084 	int cpu = raw_smp_processor_id();
1085 
1086 	if (cpu_isset(cpu, cpus_hardware_enabled))
1087 		return;
1088 	cpu_set(cpu, cpus_hardware_enabled);
1089 	kvm_arch_hardware_enable(NULL);
1090 }
1091 
1092 static void hardware_disable(void *junk)
1093 {
1094 	int cpu = raw_smp_processor_id();
1095 
1096 	if (!cpu_isset(cpu, cpus_hardware_enabled))
1097 		return;
1098 	cpu_clear(cpu, cpus_hardware_enabled);
1099 	decache_vcpus_on_cpu(cpu);
1100 	kvm_arch_hardware_disable(NULL);
1101 }
1102 
1103 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
1104 			   void *v)
1105 {
1106 	int cpu = (long)v;
1107 
1108 	val &= ~CPU_TASKS_FROZEN;
1109 	switch (val) {
1110 	case CPU_DYING:
1111 		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
1112 		       cpu);
1113 		hardware_disable(NULL);
1114 		break;
1115 	case CPU_UP_CANCELED:
1116 		printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
1117 		       cpu);
1118 		smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
1119 		break;
1120 	case CPU_ONLINE:
1121 		printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
1122 		       cpu);
1123 		smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
1124 		break;
1125 	}
1126 	return NOTIFY_OK;
1127 }
1128 
1129 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
1130 		      void *v)
1131 {
1132 	if (val == SYS_RESTART) {
1133 		/*
1134 		 * Some (well, at least mine) BIOSes hang on reboot if
1135 		 * in vmx root mode.
1136 		 */
1137 		printk(KERN_INFO "kvm: exiting hardware virtualization\n");
1138 		on_each_cpu(hardware_disable, NULL, 0, 1);
1139 	}
1140 	return NOTIFY_OK;
1141 }
1142 
1143 static struct notifier_block kvm_reboot_notifier = {
1144 	.notifier_call = kvm_reboot,
1145 	.priority = 0,
1146 };
1147 
1148 void kvm_io_bus_init(struct kvm_io_bus *bus)
1149 {
1150 	memset(bus, 0, sizeof(*bus));
1151 }
1152 
1153 void kvm_io_bus_destroy(struct kvm_io_bus *bus)
1154 {
1155 	int i;
1156 
1157 	for (i = 0; i < bus->dev_count; i++) {
1158 		struct kvm_io_device *pos = bus->devs[i];
1159 
1160 		kvm_iodevice_destructor(pos);
1161 	}
1162 }
1163 
1164 struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
1165 {
1166 	int i;
1167 
1168 	for (i = 0; i < bus->dev_count; i++) {
1169 		struct kvm_io_device *pos = bus->devs[i];
1170 
1171 		if (pos->in_range(pos, addr))
1172 			return pos;
1173 	}
1174 
1175 	return NULL;
1176 }
1177 
1178 void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
1179 {
1180 	BUG_ON(bus->dev_count > (NR_IOBUS_DEVS-1));
1181 
1182 	bus->devs[bus->dev_count++] = dev;
1183 }
1184 
1185 static struct notifier_block kvm_cpu_notifier = {
1186 	.notifier_call = kvm_cpu_hotplug,
1187 	.priority = 20, /* must be > scheduler priority */
1188 };
1189 
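/*
 * debugfs helper: sum one per-VM statistic, selected by its offset into
 * struct kvm, over every VM on vm_list.
 */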
1190 static int vm_stat_get(void *_offset, u64 *val)
1191 {
1192 	unsigned offset = (long)_offset;
1193 	struct kvm *kvm;
1194 
1195 	*val = 0;
1196 	spin_lock(&kvm_lock);
1197 	list_for_each_entry(kvm, &vm_list, vm_list)
1198 		*val += *(u32 *)((void *)kvm + offset);
1199 	spin_unlock(&kvm_lock);
1200 	return 0;
1201 }
1202 
1203 DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, NULL, "%llu\n");
1204 
1205 static int vcpu_stat_get(void *_offset, u64 *val)
1206 {
1207 	unsigned offset = (long)_offset;
1208 	struct kvm *kvm;
1209 	struct kvm_vcpu *vcpu;
1210 	int i;
1211 
1212 	*val = 0;
1213 	spin_lock(&kvm_lock);
1214 	list_for_each_entry(kvm, &vm_list, vm_list)
1215 		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
1216 			vcpu = kvm->vcpus[i];
1217 			if (vcpu)
1218 				*val += *(u32 *)((void *)vcpu + offset);
1219 		}
1220 	spin_unlock(&kvm_lock);
1221 	return 0;
1222 }
1223 
1224 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
1225 
1226 static struct file_operations *stat_fops[] = {
1227 	[KVM_STAT_VCPU] = &vcpu_stat_fops,
1228 	[KVM_STAT_VM]   = &vm_stat_fops,
1229 };
1230 
1231 static void kvm_init_debug(void)
1232 {
1233 	struct kvm_stats_debugfs_item *p;
1234 
1235 	debugfs_dir = debugfs_create_dir("kvm", NULL);
1236 	for (p = debugfs_entries; p->name; ++p)
1237 		p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
1238 						(void *)(long)p->offset,
1239 						stat_fops[p->kind]);
1240 }
1241 
1242 static void kvm_exit_debug(void)
1243 {
1244 	struct kvm_stats_debugfs_item *p;
1245 
1246 	for (p = debugfs_entries; p->name; ++p)
1247 		debugfs_remove(p->dentry);
1248 	debugfs_remove(debugfs_dir);
1249 }
1250 
1251 static int kvm_suspend(struct sys_device *dev, pm_message_t state)
1252 {
1253 	hardware_disable(NULL);
1254 	return 0;
1255 }
1256 
1257 static int kvm_resume(struct sys_device *dev)
1258 {
1259 	hardware_enable(NULL);
1260 	return 0;
1261 }
1262 
1263 static struct sysdev_class kvm_sysdev_class = {
1264 	.name = "kvm",
1265 	.suspend = kvm_suspend,
1266 	.resume = kvm_resume,
1267 };
1268 
1269 static struct sys_device kvm_sysdev = {
1270 	.id = 0,
1271 	.cls = &kvm_sysdev_class,
1272 };
1273 
1274 struct page *bad_page;
1275 
1276 static inline
1277 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
1278 {
1279 	return container_of(pn, struct kvm_vcpu, preempt_notifier);
1280 }
1281 
1282 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
1283 {
1284 	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
1285 
1286 	kvm_arch_vcpu_load(vcpu, cpu);
1287 }
1288 
1289 static void kvm_sched_out(struct preempt_notifier *pn,
1290 			  struct task_struct *next)
1291 {
1292 	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
1293 
1294 	kvm_arch_vcpu_put(vcpu);
1295 }
1296 
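/*
 * Common initialization, called from the arch-specific KVM module: set up
 * debugfs and arch state, enable virtualization on all online cpus,
 * register the cpu-hotplug, reboot and sysdev hooks, and finally register
 * the /dev/kvm misc device.
 */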
1297 int kvm_init(void *opaque, unsigned int vcpu_size,
1298 		  struct module *module)
1299 {
1300 	int r;
1301 	int cpu;
1302 
1303 	kvm_init_debug();
1304 
1305 	r = kvm_arch_init(opaque);
1306 	if (r)
1307 		goto out_fail;
1308 
1309 	bad_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1310 
1311 	if (bad_page == NULL) {
1312 		r = -ENOMEM;
1313 		goto out;
1314 	}
1315 
1316 	r = kvm_arch_hardware_setup();
1317 	if (r < 0)
1318 		goto out_free_0;
1319 
1320 	for_each_online_cpu(cpu) {
1321 		smp_call_function_single(cpu,
1322 				kvm_arch_check_processor_compat,
1323 				&r, 0, 1);
1324 		if (r < 0)
1325 			goto out_free_1;
1326 	}
1327 
1328 	on_each_cpu(hardware_enable, NULL, 0, 1);
1329 	r = register_cpu_notifier(&kvm_cpu_notifier);
1330 	if (r)
1331 		goto out_free_2;
1332 	register_reboot_notifier(&kvm_reboot_notifier);
1333 
1334 	r = sysdev_class_register(&kvm_sysdev_class);
1335 	if (r)
1336 		goto out_free_3;
1337 
1338 	r = sysdev_register(&kvm_sysdev);
1339 	if (r)
1340 		goto out_free_4;
1341 
1342 	/* A kmem cache lets us meet the alignment requirements of fx_save. */
1343 	kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
1344 					   __alignof__(struct kvm_vcpu),
1345 					   0, NULL);
1346 	if (!kvm_vcpu_cache) {
1347 		r = -ENOMEM;
1348 		goto out_free_5;
1349 	}
1350 
1351 	kvm_chardev_ops.owner = module;
1352 
1353 	r = misc_register(&kvm_dev);
1354 	if (r) {
1355 		printk(KERN_ERR "kvm: misc device register failed\n");
1356 		goto out_free;
1357 	}
1358 
1359 	kvm_preempt_ops.sched_in = kvm_sched_in;
1360 	kvm_preempt_ops.sched_out = kvm_sched_out;
1361 
1362 	return 0;
1363 
1364 out_free:
1365 	kmem_cache_destroy(kvm_vcpu_cache);
1366 out_free_5:
1367 	sysdev_unregister(&kvm_sysdev);
1368 out_free_4:
1369 	sysdev_class_unregister(&kvm_sysdev_class);
1370 out_free_3:
1371 	unregister_reboot_notifier(&kvm_reboot_notifier);
1372 	unregister_cpu_notifier(&kvm_cpu_notifier);
1373 out_free_2:
1374 	on_each_cpu(hardware_disable, NULL, 0, 1);
1375 out_free_1:
1376 	kvm_arch_hardware_unsetup();
1377 out_free_0:
1378 	__free_page(bad_page);
1379 out:
1380 	kvm_arch_exit();
1381 out_fail:
1382 	kvm_exit_debug();
1383 	return r;
1384 }
1385 EXPORT_SYMBOL_GPL(kvm_init);
1386 
1387 void kvm_exit(void)
1388 {
1389 	misc_deregister(&kvm_dev);
1390 	kmem_cache_destroy(kvm_vcpu_cache);
1391 	sysdev_unregister(&kvm_sysdev);
1392 	sysdev_class_unregister(&kvm_sysdev_class);
1393 	unregister_reboot_notifier(&kvm_reboot_notifier);
1394 	unregister_cpu_notifier(&kvm_cpu_notifier);
1395 	on_each_cpu(hardware_disable, NULL, 0, 1);
1396 	kvm_arch_hardware_unsetup();
1397 	kvm_arch_exit();
1398 	kvm_exit_debug();
1399 	__free_page(bad_page);
1400 }
1401 EXPORT_SYMBOL_GPL(kvm_exit);
1402