xref: /openbmc/linux/arch/x86/kvm/x86.c (revision 4b2a108c)
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * derived from drivers/kvm/kvm_main.c
5  *
6  * Copyright (C) 2006 Qumranet, Inc.
7  * Copyright (C) 2008 Qumranet, Inc.
8  * Copyright IBM Corporation, 2008
9  *
10  * Authors:
11  *   Avi Kivity   <avi@qumranet.com>
12  *   Yaniv Kamay  <yaniv@qumranet.com>
13  *   Amit Shah    <amit.shah@qumranet.com>
14  *   Ben-Ami Yassour <benami@il.ibm.com>
15  *
16  * This work is licensed under the terms of the GNU GPL, version 2.  See
17  * the COPYING file in the top-level directory.
18  *
19  */
20 
21 #include <linux/kvm_host.h>
22 #include "irq.h"
23 #include "mmu.h"
24 #include "i8254.h"
25 #include "tss.h"
26 #include "kvm_cache_regs.h"
27 #include "x86.h"
28 
29 #include <linux/clocksource.h>
30 #include <linux/interrupt.h>
31 #include <linux/kvm.h>
32 #include <linux/fs.h>
33 #include <linux/vmalloc.h>
34 #include <linux/module.h>
35 #include <linux/mman.h>
36 #include <linux/highmem.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/cpufreq.h>
40 
41 #include <asm/uaccess.h>
42 #include <asm/msr.h>
43 #include <asm/desc.h>
44 #include <asm/mtrr.h>
45 
46 #define MAX_IO_MSRS 256
47 #define CR0_RESERVED_BITS						\
48 	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
49 			  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
50 			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
51 #define CR4_RESERVED_BITS						\
52 	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
53 			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\
54 			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\
55 			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
56 
57 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
58 /* EFER defaults:
59  * - enable syscall per default because it is emulated by KVM
60  * - enable LME and LMA per default on 64-bit KVM
61  */
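/*
 * Note: the masks below only leave SCE (plus LME/LMA on 64-bit) writable by
 * default; additional bits such as EFER.NX are expected to be opened up at
 * init time through kvm_enable_efer_bits() below, depending on host support.
 */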
62 #ifdef CONFIG_X86_64
63 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
64 #else
65 static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
66 #endif
67 
68 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
69 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
70 
71 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
72 				    struct kvm_cpuid_entry2 __user *entries);
73 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
74 					      u32 function, u32 index);
75 
76 struct kvm_x86_ops *kvm_x86_ops;
77 EXPORT_SYMBOL_GPL(kvm_x86_ops);
78 
79 struct kvm_stats_debugfs_item debugfs_entries[] = {
80 	{ "pf_fixed", VCPU_STAT(pf_fixed) },
81 	{ "pf_guest", VCPU_STAT(pf_guest) },
82 	{ "tlb_flush", VCPU_STAT(tlb_flush) },
83 	{ "invlpg", VCPU_STAT(invlpg) },
84 	{ "exits", VCPU_STAT(exits) },
85 	{ "io_exits", VCPU_STAT(io_exits) },
86 	{ "mmio_exits", VCPU_STAT(mmio_exits) },
87 	{ "signal_exits", VCPU_STAT(signal_exits) },
88 	{ "irq_window", VCPU_STAT(irq_window_exits) },
89 	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
90 	{ "halt_exits", VCPU_STAT(halt_exits) },
91 	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
92 	{ "hypercalls", VCPU_STAT(hypercalls) },
93 	{ "request_irq", VCPU_STAT(request_irq_exits) },
94 	{ "irq_exits", VCPU_STAT(irq_exits) },
95 	{ "host_state_reload", VCPU_STAT(host_state_reload) },
96 	{ "efer_reload", VCPU_STAT(efer_reload) },
97 	{ "fpu_reload", VCPU_STAT(fpu_reload) },
98 	{ "insn_emulation", VCPU_STAT(insn_emulation) },
99 	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
100 	{ "irq_injections", VCPU_STAT(irq_injections) },
101 	{ "nmi_injections", VCPU_STAT(nmi_injections) },
102 	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
103 	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
104 	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
105 	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
106 	{ "mmu_flooded", VM_STAT(mmu_flooded) },
107 	{ "mmu_recycled", VM_STAT(mmu_recycled) },
108 	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
109 	{ "mmu_unsync", VM_STAT(mmu_unsync) },
110 	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
111 	{ "largepages", VM_STAT(lpages) },
112 	{ NULL }
113 };
114 
115 unsigned long segment_base(u16 selector)
116 {
117 	struct descriptor_table gdt;
118 	struct desc_struct *d;
119 	unsigned long table_base;
120 	unsigned long v;
121 
122 	if (selector == 0)
123 		return 0;
124 
125 	asm("sgdt %0" : "=m"(gdt));
126 	table_base = gdt.base;
127 
128 	if (selector & 4) {           /* from ldt */
129 		u16 ldt_selector;
130 
131 		asm("sldt %0" : "=g"(ldt_selector));
132 		table_base = segment_base(ldt_selector);
133 	}
134 	d = (struct desc_struct *)(table_base + (selector & ~7));
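	/*
	 * A descriptor's 32-bit base is scattered across base0 (bits 15:0),
	 * base1 (bits 23:16) and base2 (bits 31:24); 64-bit system
	 * descriptors (LDT/TSS, types 2, 9 and 11) carry an extra base3
	 * dword, handled below.
	 */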
135 	v = d->base0 | ((unsigned long)d->base1 << 16) |
136 		((unsigned long)d->base2 << 24);
137 #ifdef CONFIG_X86_64
138 	if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
139 		v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
140 #endif
141 	return v;
142 }
143 EXPORT_SYMBOL_GPL(segment_base);
144 
145 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
146 {
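	/*
	 * The APIC base is cached in vcpu->arch whether the local APIC is
	 * emulated in the kernel or in userspace, so both branches below
	 * return the same field.
	 */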
147 	if (irqchip_in_kernel(vcpu->kvm))
148 		return vcpu->arch.apic_base;
149 	else
150 		return vcpu->arch.apic_base;
151 }
152 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
153 
154 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
155 {
156 	/* TODO: reserve bits check */
157 	if (irqchip_in_kernel(vcpu->kvm))
158 		kvm_lapic_set_base(vcpu, data);
159 	else
160 		vcpu->arch.apic_base = data;
161 }
162 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
163 
164 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
165 {
166 	WARN_ON(vcpu->arch.exception.pending);
167 	vcpu->arch.exception.pending = true;
168 	vcpu->arch.exception.has_error_code = false;
169 	vcpu->arch.exception.nr = nr;
170 }
171 EXPORT_SYMBOL_GPL(kvm_queue_exception);
172 
173 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
174 			   u32 error_code)
175 {
176 	++vcpu->stat.pf_guest;
177 
178 	if (vcpu->arch.exception.pending) {
179 		if (vcpu->arch.exception.nr == PF_VECTOR) {
180 			printk(KERN_DEBUG "kvm: inject_page_fault:"
181 					" double fault 0x%lx\n", addr);
182 			vcpu->arch.exception.nr = DF_VECTOR;
183 			vcpu->arch.exception.error_code = 0;
184 		} else if (vcpu->arch.exception.nr == DF_VECTOR) {
185 			/* triple fault -> shutdown */
186 			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
187 		}
188 		return;
189 	}
190 	vcpu->arch.cr2 = addr;
191 	kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
192 }
193 
194 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
195 {
196 	vcpu->arch.nmi_pending = 1;
197 }
198 EXPORT_SYMBOL_GPL(kvm_inject_nmi);
199 
200 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
201 {
202 	WARN_ON(vcpu->arch.exception.pending);
203 	vcpu->arch.exception.pending = true;
204 	vcpu->arch.exception.has_error_code = true;
205 	vcpu->arch.exception.nr = nr;
206 	vcpu->arch.exception.error_code = error_code;
207 }
208 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
209 
210 static void __queue_exception(struct kvm_vcpu *vcpu)
211 {
212 	kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
213 				     vcpu->arch.exception.has_error_code,
214 				     vcpu->arch.exception.error_code);
215 }
216 
217 /*
218  * Load the pae pdptrs.  Return true if they are all valid.
219  */
220 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
221 {
222 	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
223 	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
224 	int i;
225 	int ret;
226 	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
227 
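	/*
	 * In PAE mode CR3 bits 31:5 hold a 32-byte-aligned PDPT, so the four
	 * entries start at (cr3 & 0xfe0) within the page.  For example (an
	 * illustrative value), cr3 == 0x12345678 reads 32 bytes from offset
	 * 0x660 of gfn 0x12345.
	 */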
228 	ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
229 				  offset * sizeof(u64), sizeof(pdpte));
230 	if (ret < 0) {
231 		ret = 0;
232 		goto out;
233 	}
234 	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
235 		if (is_present_pte(pdpte[i]) &&
236 		    (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
237 			ret = 0;
238 			goto out;
239 		}
240 	}
241 	ret = 1;
242 
243 	memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
244 out:
245 
246 	return ret;
247 }
248 EXPORT_SYMBOL_GPL(load_pdptrs);
249 
250 static bool pdptrs_changed(struct kvm_vcpu *vcpu)
251 {
252 	u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
253 	bool changed = true;
254 	int r;
255 
256 	if (is_long_mode(vcpu) || !is_pae(vcpu))
257 		return false;
258 
259 	r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
260 	if (r < 0)
261 		goto out;
262 	changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
263 out:
264 
265 	return changed;
266 }
267 
268 void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
269 {
270 	if (cr0 & CR0_RESERVED_BITS) {
271 		printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
272 		       cr0, vcpu->arch.cr0);
273 		kvm_inject_gp(vcpu, 0);
274 		return;
275 	}
276 
277 	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
278 		printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
279 		kvm_inject_gp(vcpu, 0);
280 		return;
281 	}
282 
283 	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
284 		printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
285 		       "and a clear PE flag\n");
286 		kvm_inject_gp(vcpu, 0);
287 		return;
288 	}
289 
290 	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
291 #ifdef CONFIG_X86_64
292 		if ((vcpu->arch.shadow_efer & EFER_LME)) {
293 			int cs_db, cs_l;
294 
295 			if (!is_pae(vcpu)) {
296 				printk(KERN_DEBUG "set_cr0: #GP, start paging "
297 				       "in long mode while PAE is disabled\n");
298 				kvm_inject_gp(vcpu, 0);
299 				return;
300 			}
301 			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
302 			if (cs_l) {
303 				printk(KERN_DEBUG "set_cr0: #GP, start paging "
304 				       "in long mode while CS.L == 1\n");
305 				kvm_inject_gp(vcpu, 0);
306 				return;
307 
308 			}
309 		} else
310 #endif
311 		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
312 			printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
313 			       "reserved bits\n");
314 			kvm_inject_gp(vcpu, 0);
315 			return;
316 		}
317 
318 	}
319 
320 	kvm_x86_ops->set_cr0(vcpu, cr0);
321 	vcpu->arch.cr0 = cr0;
322 
323 	kvm_mmu_reset_context(vcpu);
324 	return;
325 }
326 EXPORT_SYMBOL_GPL(kvm_set_cr0);
327 
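/*
 * LMSW can only touch CR0 bits 3:0 (PE, MP, EM and TS), so the new value is
 * folded into the existing CR0 before going through kvm_set_cr0().
 */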
328 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
329 {
330 	kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
331 	KVMTRACE_1D(LMSW, vcpu,
332 		    (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
333 		    handler);
334 }
335 EXPORT_SYMBOL_GPL(kvm_lmsw);
336 
337 void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
338 {
339 	unsigned long old_cr4 = vcpu->arch.cr4;
340 	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
341 
342 	if (cr4 & CR4_RESERVED_BITS) {
343 		printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
344 		kvm_inject_gp(vcpu, 0);
345 		return;
346 	}
347 
348 	if (is_long_mode(vcpu)) {
349 		if (!(cr4 & X86_CR4_PAE)) {
350 			printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
351 			       "in long mode\n");
352 			kvm_inject_gp(vcpu, 0);
353 			return;
354 		}
355 	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
356 		   && ((cr4 ^ old_cr4) & pdptr_bits)
357 		   && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
358 		printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
359 		kvm_inject_gp(vcpu, 0);
360 		return;
361 	}
362 
363 	if (cr4 & X86_CR4_VMXE) {
364 		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
365 		kvm_inject_gp(vcpu, 0);
366 		return;
367 	}
368 	kvm_x86_ops->set_cr4(vcpu, cr4);
369 	vcpu->arch.cr4 = cr4;
370 	vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
371 	kvm_mmu_reset_context(vcpu);
372 }
373 EXPORT_SYMBOL_GPL(kvm_set_cr4);
374 
375 void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
376 {
377 	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
378 		kvm_mmu_sync_roots(vcpu);
379 		kvm_mmu_flush_tlb(vcpu);
380 		return;
381 	}
382 
383 	if (is_long_mode(vcpu)) {
384 		if (cr3 & CR3_L_MODE_RESERVED_BITS) {
385 			printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
386 			kvm_inject_gp(vcpu, 0);
387 			return;
388 		}
389 	} else {
390 		if (is_pae(vcpu)) {
391 			if (cr3 & CR3_PAE_RESERVED_BITS) {
392 				printk(KERN_DEBUG
393 				       "set_cr3: #GP, reserved bits\n");
394 				kvm_inject_gp(vcpu, 0);
395 				return;
396 			}
397 			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
398 				printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
399 				       "reserved bits\n");
400 				kvm_inject_gp(vcpu, 0);
401 				return;
402 			}
403 		}
404 		/*
405 		 * We don't check reserved bits in nonpae mode, because
406 		 * this isn't enforced, and VMware depends on this.
407 		 */
408 	}
409 
410 	/*
411 	 * Does the new cr3 value map to physical memory? (Note, we
412 	 * catch an invalid cr3 even in real-mode, because it would
413 	 * cause trouble later on when we turn on paging anyway.)
414 	 *
415 	 * A real CPU would silently accept an invalid cr3 and would
416 	 * attempt to use it - with largely undefined (and often hard
417 	 * to debug) behavior on the guest side.
418 	 */
419 	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
420 		kvm_inject_gp(vcpu, 0);
421 	else {
422 		vcpu->arch.cr3 = cr3;
423 		vcpu->arch.mmu.new_cr3(vcpu);
424 	}
425 }
426 EXPORT_SYMBOL_GPL(kvm_set_cr3);
427 
428 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
429 {
430 	if (cr8 & CR8_RESERVED_BITS) {
431 		printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
432 		kvm_inject_gp(vcpu, 0);
433 		return;
434 	}
435 	if (irqchip_in_kernel(vcpu->kvm))
436 		kvm_lapic_set_tpr(vcpu, cr8);
437 	else
438 		vcpu->arch.cr8 = cr8;
439 }
440 EXPORT_SYMBOL_GPL(kvm_set_cr8);
441 
442 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
443 {
444 	if (irqchip_in_kernel(vcpu->kvm))
445 		return kvm_lapic_get_cr8(vcpu);
446 	else
447 		return vcpu->arch.cr8;
448 }
449 EXPORT_SYMBOL_GPL(kvm_get_cr8);
450 
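/*
 * X86_FEATURE_* constants encode "cpuid word * 32 + bit", so masking with 31
 * recovers the bit position within the relevant CPUID register.  E.g.
 * X86_FEATURE_SVM (word 6, bit 2) yields 1 << 2, the SVM bit in
 * CPUID 0x80000001 ECX.
 */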
451 static inline u32 bit(int bitno)
452 {
453 	return 1 << (bitno & 31);
454 }
455 
456 /*
457  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
458  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
459  *
460  * This list is modified at module load time to reflect the
461  * capabilities of the host cpu.
462  */
463 static u32 msrs_to_save[] = {
464 	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
465 	MSR_K6_STAR,
466 #ifdef CONFIG_X86_64
467 	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
468 #endif
469 	MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
470 	MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
471 };
472 
473 static unsigned num_msrs_to_save;
474 
475 static u32 emulated_msrs[] = {
476 	MSR_IA32_MISC_ENABLE,
477 };
478 
479 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
480 {
481 	if (efer & efer_reserved_bits) {
482 		printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
483 		       efer);
484 		kvm_inject_gp(vcpu, 0);
485 		return;
486 	}
487 
488 	if (is_paging(vcpu)
489 	    && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) {
490 		printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
491 		kvm_inject_gp(vcpu, 0);
492 		return;
493 	}
494 
495 	if (efer & EFER_FFXSR) {
496 		struct kvm_cpuid_entry2 *feat;
497 
498 		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
499 		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
500 			printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
501 			kvm_inject_gp(vcpu, 0);
502 			return;
503 		}
504 	}
505 
506 	if (efer & EFER_SVME) {
507 		struct kvm_cpuid_entry2 *feat;
508 
509 		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
510 		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
511 			printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
512 			kvm_inject_gp(vcpu, 0);
513 			return;
514 		}
515 	}
516 
517 	kvm_x86_ops->set_efer(vcpu, efer);
518 
519 	efer &= ~EFER_LMA;
520 	efer |= vcpu->arch.shadow_efer & EFER_LMA;
521 
522 	vcpu->arch.shadow_efer = efer;
523 
524 	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
525 	kvm_mmu_reset_context(vcpu);
526 }
527 
528 void kvm_enable_efer_bits(u64 mask)
529 {
530 	efer_reserved_bits &= ~mask;
531 }
532 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
533 
534 
535 /*
536  * Writes msr value into the appropriate "register".
537  * Returns 0 on success, non-0 otherwise.
538  * Assumes vcpu_load() was already called.
539  */
540 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
541 {
542 	return kvm_x86_ops->set_msr(vcpu, msr_index, data);
543 }
544 
545 /*
546  * Adapt set_msr() to msr_io()'s calling convention
547  */
548 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
549 {
550 	return kvm_set_msr(vcpu, index, *data);
551 }
552 
553 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
554 {
555 	static int version;
556 	struct pvclock_wall_clock wc;
557 	struct timespec now, sys, boot;
558 
559 	if (!wall_clock)
560 		return;
561 
562 	version++;
563 
564 	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
565 
566 	/*
567 	 * The guest calculates current wall clock time by adding
568 	 * system time (updated by kvm_write_guest_time below) to the
569 	 * wall clock specified here.  guest system time equals host
570 	 * system time for us, thus we must fill in host boot time here.
571 	 */
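	/*
	 * Worked example (illustrative numbers): with a wall time of
	 * 1000000.500000000s and a monotonic uptime of 250.250000000s, boot
	 * comes out as 999750.250000000s.  The guest later adds its kvmclock
	 * system time (nanoseconds since host boot) to wc to recover the
	 * current wall-clock time.
	 */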
572 	now = current_kernel_time();
573 	ktime_get_ts(&sys);
574 	boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
575 
576 	wc.sec = boot.tv_sec;
577 	wc.nsec = boot.tv_nsec;
578 	wc.version = version;
579 
580 	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
581 
582 	version++;
583 	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
584 }
585 
586 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
587 {
588 	uint32_t quotient, remainder;
589 
590 	/* Don't try to replace with do_div(), this one calculates
591 	 * "(dividend << 32) / divisor" */
592 	__asm__ ( "divl %4"
593 		  : "=a" (quotient), "=d" (remainder)
594 		  : "0" (0), "1" (dividend), "r" (divisor) );
595 	return quotient;
596 }
597 
598 static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock)
599 {
600 	uint64_t nsecs = 1000000000LL;
601 	int32_t  shift = 0;
602 	uint64_t tps64;
603 	uint32_t tps32;
604 
605 	tps64 = tsc_khz * 1000LL;
606 	while (tps64 > nsecs*2) {
607 		tps64 >>= 1;
608 		shift--;
609 	}
610 
611 	tps32 = (uint32_t)tps64;
612 	while (tps32 <= (uint32_t)nsecs) {
613 		tps32 <<= 1;
614 		shift++;
615 	}
616 
617 	hv_clock->tsc_shift = shift;
618 	hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
619 
620 	pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
621 		 __func__, tsc_khz, hv_clock->tsc_shift,
622 		 hv_clock->tsc_to_system_mul);
623 }
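/*
 * Worked example for kvm_set_time_scale() above (illustrative numbers): for
 * tsc_khz = 3000000 (a 3 GHz TSC), tps64 starts at 3e9 > 2e9, so it is
 * halved once and tsc_shift becomes -1; tps32 = 1.5e9 already exceeds 1e9,
 * so no left shift is needed.  tsc_to_system_mul = (1e9 << 32) / 1.5e9 is
 * roughly 0xaaaaaaaa, and the guest then computes
 * ns = ((tsc >> 1) * 0xaaaaaaaa) >> 32, i.e. about tsc / 3, matching three
 * TSC cycles per nanosecond.
 */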
624 
625 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
626 
627 static void kvm_write_guest_time(struct kvm_vcpu *v)
628 {
629 	struct timespec ts;
630 	unsigned long flags;
631 	struct kvm_vcpu_arch *vcpu = &v->arch;
632 	void *shared_kaddr;
633 	unsigned long this_tsc_khz;
634 
635 	if ((!vcpu->time_page))
636 		return;
637 
638 	this_tsc_khz = get_cpu_var(cpu_tsc_khz);
639 	if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
640 		kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
641 		vcpu->hv_clock_tsc_khz = this_tsc_khz;
642 	}
643 	put_cpu_var(cpu_tsc_khz);
644 
645 	/* Keep irq disabled to prevent changes to the clock */
646 	local_irq_save(flags);
647 	kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
648 			  &vcpu->hv_clock.tsc_timestamp);
649 	ktime_get_ts(&ts);
650 	local_irq_restore(flags);
651 
652 	/* With all the info we got, fill in the values */
653 
654 	vcpu->hv_clock.system_time = ts.tv_nsec +
655 				     (NSEC_PER_SEC * (u64)ts.tv_sec);
656 	/*
657 	 * The interface expects us to write an even number signaling that the
658 	 * update is finished. Since the guest won't see the intermediate
659 	 * state, we just increase by 2 at the end.
660 	 */
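	/*
	 * The guest reader mirrors this with a seqcount-style loop, roughly:
	 * do { v = version; barrier; copy fields; barrier; } while
	 * ((v & 1) || v != version), so an odd or changed version forces a
	 * retry.
	 */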
661 	vcpu->hv_clock.version += 2;
662 
663 	shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
664 
665 	memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
666 	       sizeof(vcpu->hv_clock));
667 
668 	kunmap_atomic(shared_kaddr, KM_USER0);
669 
670 	mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
671 }
672 
673 static int kvm_request_guest_time_update(struct kvm_vcpu *v)
674 {
675 	struct kvm_vcpu_arch *vcpu = &v->arch;
676 
677 	if (!vcpu->time_page)
678 		return 0;
679 	set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
680 	return 1;
681 }
682 
683 static bool msr_mtrr_valid(unsigned msr)
684 {
685 	switch (msr) {
686 	case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
687 	case MSR_MTRRfix64K_00000:
688 	case MSR_MTRRfix16K_80000:
689 	case MSR_MTRRfix16K_A0000:
690 	case MSR_MTRRfix4K_C0000:
691 	case MSR_MTRRfix4K_C8000:
692 	case MSR_MTRRfix4K_D0000:
693 	case MSR_MTRRfix4K_D8000:
694 	case MSR_MTRRfix4K_E0000:
695 	case MSR_MTRRfix4K_E8000:
696 	case MSR_MTRRfix4K_F0000:
697 	case MSR_MTRRfix4K_F8000:
698 	case MSR_MTRRdefType:
699 	case MSR_IA32_CR_PAT:
700 		return true;
701 	case 0x2f8:
702 		return true;
703 	}
704 	return false;
705 }
706 
707 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
708 {
709 	u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
710 
711 	if (!msr_mtrr_valid(msr))
712 		return 1;
713 
714 	if (msr == MSR_MTRRdefType) {
715 		vcpu->arch.mtrr_state.def_type = data;
716 		vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
717 	} else if (msr == MSR_MTRRfix64K_00000)
718 		p[0] = data;
719 	else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
720 		p[1 + msr - MSR_MTRRfix16K_80000] = data;
721 	else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
722 		p[3 + msr - MSR_MTRRfix4K_C0000] = data;
723 	else if (msr == MSR_IA32_CR_PAT)
724 		vcpu->arch.pat = data;
725 	else {	/* Variable MTRRs */
726 		int idx, is_mtrr_mask;
727 		u64 *pt;
728 
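		/*
		 * Variable-range MTRRs live in MSR pairs starting at 0x200:
		 * even MSRs are MTRRphysBasen, odd MSRs are MTRRphysMaskn.
		 * E.g. MSR 0x203 gives idx 1 with is_mtrr_mask set, i.e. the
		 * mask half of variable range 1.
		 */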
729 		idx = (msr - 0x200) / 2;
730 		is_mtrr_mask = msr - 0x200 - 2 * idx;
731 		if (!is_mtrr_mask)
732 			pt =
733 			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
734 		else
735 			pt =
736 			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
737 		*pt = data;
738 	}
739 
740 	kvm_mmu_reset_context(vcpu);
741 	return 0;
742 }
743 
744 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
745 {
746 	switch (msr) {
747 	case MSR_EFER:
748 		set_efer(vcpu, data);
749 		break;
750 	case MSR_IA32_MC0_STATUS:
751 		pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
752 		       __func__, data);
753 		break;
754 	case MSR_IA32_MCG_STATUS:
755 		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
756 			__func__, data);
757 		break;
758 	case MSR_IA32_MCG_CTL:
759 		pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
760 			__func__, data);
761 		break;
762 	case MSR_IA32_DEBUGCTLMSR:
763 		if (!data) {
764 			/* We support the non-activated case already */
765 			break;
766 		} else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
767 			/* Values other than LBR and BTF are vendor-specific,
768 			   thus reserved and should throw a #GP */
769 			return 1;
770 		}
771 		pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
772 			__func__, data);
773 		break;
774 	case MSR_IA32_UCODE_REV:
775 	case MSR_IA32_UCODE_WRITE:
776 	case MSR_VM_HSAVE_PA:
777 		break;
778 	case 0x200 ... 0x2ff:
779 		return set_msr_mtrr(vcpu, msr, data);
780 	case MSR_IA32_APICBASE:
781 		kvm_set_apic_base(vcpu, data);
782 		break;
783 	case MSR_IA32_MISC_ENABLE:
784 		vcpu->arch.ia32_misc_enable_msr = data;
785 		break;
786 	case MSR_KVM_WALL_CLOCK:
787 		vcpu->kvm->arch.wall_clock = data;
788 		kvm_write_wall_clock(vcpu->kvm, data);
789 		break;
790 	case MSR_KVM_SYSTEM_TIME: {
791 		if (vcpu->arch.time_page) {
792 			kvm_release_page_dirty(vcpu->arch.time_page);
793 			vcpu->arch.time_page = NULL;
794 		}
795 
796 		vcpu->arch.time = data;
797 
798 		/* we verify if the enable bit is set... */
799 		if (!(data & 1))
800 			break;
801 
802 		/* ...but clear the enable bit out of the address before doing the actual write */
803 		vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
804 
805 		vcpu->arch.time_page =
806 				gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
807 
808 		if (is_error_page(vcpu->arch.time_page)) {
809 			kvm_release_page_clean(vcpu->arch.time_page);
810 			vcpu->arch.time_page = NULL;
811 		}
812 
813 		kvm_request_guest_time_update(vcpu);
814 		break;
815 	}
816 	default:
817 		pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
818 		return 1;
819 	}
820 	return 0;
821 }
822 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
823 
824 
825 /*
826  * Reads an msr value (of 'msr_index') into 'pdata'.
827  * Returns 0 on success, non-0 otherwise.
828  * Assumes vcpu_load() was already called.
829  */
830 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
831 {
832 	return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
833 }
834 
835 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
836 {
837 	u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
838 
839 	if (!msr_mtrr_valid(msr))
840 		return 1;
841 
842 	if (msr == MSR_MTRRdefType)
843 		*pdata = vcpu->arch.mtrr_state.def_type +
844 			 (vcpu->arch.mtrr_state.enabled << 10);
845 	else if (msr == MSR_MTRRfix64K_00000)
846 		*pdata = p[0];
847 	else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
848 		*pdata = p[1 + msr - MSR_MTRRfix16K_80000];
849 	else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
850 		*pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
851 	else if (msr == MSR_IA32_CR_PAT)
852 		*pdata = vcpu->arch.pat;
853 	else {	/* Variable MTRRs */
854 		int idx, is_mtrr_mask;
855 		u64 *pt;
856 
857 		idx = (msr - 0x200) / 2;
858 		is_mtrr_mask = msr - 0x200 - 2 * idx;
859 		if (!is_mtrr_mask)
860 			pt =
861 			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
862 		else
863 			pt =
864 			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
865 		*pdata = *pt;
866 	}
867 
868 	return 0;
869 }
870 
871 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
872 {
873 	u64 data;
874 
875 	switch (msr) {
876 	case 0xc0010010: /* SYSCFG */
877 	case 0xc0010015: /* HWCR */
878 	case MSR_IA32_PLATFORM_ID:
879 	case MSR_IA32_P5_MC_ADDR:
880 	case MSR_IA32_P5_MC_TYPE:
881 	case MSR_IA32_MC0_CTL:
882 	case MSR_IA32_MCG_STATUS:
883 	case MSR_IA32_MCG_CAP:
884 	case MSR_IA32_MCG_CTL:
885 	case MSR_IA32_MC0_MISC:
886 	case MSR_IA32_MC0_MISC+4:
887 	case MSR_IA32_MC0_MISC+8:
888 	case MSR_IA32_MC0_MISC+12:
889 	case MSR_IA32_MC0_MISC+16:
890 	case MSR_IA32_MC0_MISC+20:
891 	case MSR_IA32_UCODE_REV:
892 	case MSR_IA32_EBL_CR_POWERON:
893 	case MSR_IA32_DEBUGCTLMSR:
894 	case MSR_IA32_LASTBRANCHFROMIP:
895 	case MSR_IA32_LASTBRANCHTOIP:
896 	case MSR_IA32_LASTINTFROMIP:
897 	case MSR_IA32_LASTINTTOIP:
898 	case MSR_VM_HSAVE_PA:
899 	case MSR_P6_EVNTSEL0:
900 	case MSR_P6_EVNTSEL1:
901 	case MSR_K7_EVNTSEL0:
902 		data = 0;
903 		break;
904 	case MSR_MTRRcap:
905 		data = 0x500 | KVM_NR_VAR_MTRR;
906 		break;
907 	case 0x200 ... 0x2ff:
908 		return get_msr_mtrr(vcpu, msr, pdata);
909 	case 0xcd: /* fsb frequency */
910 		data = 3;
911 		break;
912 	case MSR_IA32_APICBASE:
913 		data = kvm_get_apic_base(vcpu);
914 		break;
915 	case MSR_IA32_MISC_ENABLE:
916 		data = vcpu->arch.ia32_misc_enable_msr;
917 		break;
918 	case MSR_IA32_PERF_STATUS:
919 		/* TSC increment by tick */
920 		data = 1000ULL;
921 		/* CPU multiplier */
922 		data |= (((uint64_t)4ULL) << 40);
923 		break;
924 	case MSR_EFER:
925 		data = vcpu->arch.shadow_efer;
926 		break;
927 	case MSR_KVM_WALL_CLOCK:
928 		data = vcpu->kvm->arch.wall_clock;
929 		break;
930 	case MSR_KVM_SYSTEM_TIME:
931 		data = vcpu->arch.time;
932 		break;
933 	default:
934 		pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
935 		return 1;
936 	}
937 	*pdata = data;
938 	return 0;
939 }
940 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
941 
942 /*
943  * Read or write a bunch of msrs. All parameters are kernel addresses.
944  *
945  * @return number of msrs set successfully.
946  */
947 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
948 		    struct kvm_msr_entry *entries,
949 		    int (*do_msr)(struct kvm_vcpu *vcpu,
950 				  unsigned index, u64 *data))
951 {
952 	int i;
953 
954 	vcpu_load(vcpu);
955 
956 	down_read(&vcpu->kvm->slots_lock);
957 	for (i = 0; i < msrs->nmsrs; ++i)
958 		if (do_msr(vcpu, entries[i].index, &entries[i].data))
959 			break;
960 	up_read(&vcpu->kvm->slots_lock);
961 
962 	vcpu_put(vcpu);
963 
964 	return i;
965 }
966 
967 /*
968  * Read or write a bunch of msrs. Parameters are user addresses.
969  *
970  * @return number of msrs set successfully.
971  */
972 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
973 		  int (*do_msr)(struct kvm_vcpu *vcpu,
974 				unsigned index, u64 *data),
975 		  int writeback)
976 {
977 	struct kvm_msrs msrs;
978 	struct kvm_msr_entry *entries;
979 	int r, n;
980 	unsigned size;
981 
982 	r = -EFAULT;
983 	if (copy_from_user(&msrs, user_msrs, sizeof msrs))
984 		goto out;
985 
986 	r = -E2BIG;
987 	if (msrs.nmsrs >= MAX_IO_MSRS)
988 		goto out;
989 
990 	r = -ENOMEM;
991 	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
992 	entries = vmalloc(size);
993 	if (!entries)
994 		goto out;
995 
996 	r = -EFAULT;
997 	if (copy_from_user(entries, user_msrs->entries, size))
998 		goto out_free;
999 
1000 	r = n = __msr_io(vcpu, &msrs, entries, do_msr);
1001 	if (r < 0)
1002 		goto out_free;
1003 
1004 	r = -EFAULT;
1005 	if (writeback && copy_to_user(user_msrs->entries, entries, size))
1006 		goto out_free;
1007 
1008 	r = n;
1009 
1010 out_free:
1011 	vfree(entries);
1012 out:
1013 	return r;
1014 }
1015 
1016 int kvm_dev_ioctl_check_extension(long ext)
1017 {
1018 	int r;
1019 
1020 	switch (ext) {
1021 	case KVM_CAP_IRQCHIP:
1022 	case KVM_CAP_HLT:
1023 	case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
1024 	case KVM_CAP_SET_TSS_ADDR:
1025 	case KVM_CAP_EXT_CPUID:
1026 	case KVM_CAP_CLOCKSOURCE:
1027 	case KVM_CAP_PIT:
1028 	case KVM_CAP_NOP_IO_DELAY:
1029 	case KVM_CAP_MP_STATE:
1030 	case KVM_CAP_SYNC_MMU:
1031 	case KVM_CAP_REINJECT_CONTROL:
1032 	case KVM_CAP_IRQ_INJECT_STATUS:
1033 	case KVM_CAP_ASSIGN_DEV_IRQ:
1034 		r = 1;
1035 		break;
1036 	case KVM_CAP_COALESCED_MMIO:
1037 		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
1038 		break;
1039 	case KVM_CAP_VAPIC:
1040 		r = !kvm_x86_ops->cpu_has_accelerated_tpr();
1041 		break;
1042 	case KVM_CAP_NR_VCPUS:
1043 		r = KVM_MAX_VCPUS;
1044 		break;
1045 	case KVM_CAP_NR_MEMSLOTS:
1046 		r = KVM_MEMORY_SLOTS;
1047 		break;
1048 	case KVM_CAP_PV_MMU:
1049 		r = !tdp_enabled;
1050 		break;
1051 	case KVM_CAP_IOMMU:
1052 		r = iommu_found();
1053 		break;
1054 	default:
1055 		r = 0;
1056 		break;
1057 	}
1058 	return r;
1059 
1060 }
1061 
1062 long kvm_arch_dev_ioctl(struct file *filp,
1063 			unsigned int ioctl, unsigned long arg)
1064 {
1065 	void __user *argp = (void __user *)arg;
1066 	long r;
1067 
1068 	switch (ioctl) {
1069 	case KVM_GET_MSR_INDEX_LIST: {
1070 		struct kvm_msr_list __user *user_msr_list = argp;
1071 		struct kvm_msr_list msr_list;
1072 		unsigned n;
1073 
1074 		r = -EFAULT;
1075 		if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
1076 			goto out;
1077 		n = msr_list.nmsrs;
1078 		msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
1079 		if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
1080 			goto out;
1081 		r = -E2BIG;
1082 		if (n < num_msrs_to_save)
1083 			goto out;
1084 		r = -EFAULT;
1085 		if (copy_to_user(user_msr_list->indices, &msrs_to_save,
1086 				 num_msrs_to_save * sizeof(u32)))
1087 			goto out;
1088 		if (copy_to_user(user_msr_list->indices
1089 				 + num_msrs_to_save * sizeof(u32),
1090 				 &emulated_msrs,
1091 				 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
1092 			goto out;
1093 		r = 0;
1094 		break;
1095 	}
1096 	case KVM_GET_SUPPORTED_CPUID: {
1097 		struct kvm_cpuid2 __user *cpuid_arg = argp;
1098 		struct kvm_cpuid2 cpuid;
1099 
1100 		r = -EFAULT;
1101 		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1102 			goto out;
1103 		r = kvm_dev_ioctl_get_supported_cpuid(&cpuid,
1104 						      cpuid_arg->entries);
1105 		if (r)
1106 			goto out;
1107 
1108 		r = -EFAULT;
1109 		if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1110 			goto out;
1111 		r = 0;
1112 		break;
1113 	}
1114 	default:
1115 		r = -EINVAL;
1116 	}
1117 out:
1118 	return r;
1119 }
1120 
1121 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1122 {
1123 	kvm_x86_ops->vcpu_load(vcpu, cpu);
1124 	kvm_request_guest_time_update(vcpu);
1125 }
1126 
1127 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1128 {
1129 	kvm_x86_ops->vcpu_put(vcpu);
1130 	kvm_put_guest_fpu(vcpu);
1131 }
1132 
1133 static int is_efer_nx(void)
1134 {
1135 	unsigned long long efer = 0;
1136 
1137 	rdmsrl_safe(MSR_EFER, &efer);
1138 	return efer & EFER_NX;
1139 }
1140 
1141 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
1142 {
1143 	int i;
1144 	struct kvm_cpuid_entry2 *e, *entry;
1145 
1146 	entry = NULL;
1147 	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
1148 		e = &vcpu->arch.cpuid_entries[i];
1149 		if (e->function == 0x80000001) {
1150 			entry = e;
1151 			break;
1152 		}
1153 	}
1154 	if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
1155 		entry->edx &= ~(1 << 20);
1156 		printk(KERN_INFO "kvm: guest NX capability removed\n");
1157 	}
1158 }
1159 
1160 /* legacy entry point: an old userspace fills struct kvm_cpuid_entry entries for a newer kernel module */
1161 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1162 				    struct kvm_cpuid *cpuid,
1163 				    struct kvm_cpuid_entry __user *entries)
1164 {
1165 	int r, i;
1166 	struct kvm_cpuid_entry *cpuid_entries;
1167 
1168 	r = -E2BIG;
1169 	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1170 		goto out;
1171 	r = -ENOMEM;
1172 	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
1173 	if (!cpuid_entries)
1174 		goto out;
1175 	r = -EFAULT;
1176 	if (copy_from_user(cpuid_entries, entries,
1177 			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
1178 		goto out_free;
1179 	for (i = 0; i < cpuid->nent; i++) {
1180 		vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
1181 		vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
1182 		vcpu->arch.cpuid_entries[i].ebx = cpuid_entries[i].ebx;
1183 		vcpu->arch.cpuid_entries[i].ecx = cpuid_entries[i].ecx;
1184 		vcpu->arch.cpuid_entries[i].edx = cpuid_entries[i].edx;
1185 		vcpu->arch.cpuid_entries[i].index = 0;
1186 		vcpu->arch.cpuid_entries[i].flags = 0;
1187 		vcpu->arch.cpuid_entries[i].padding[0] = 0;
1188 		vcpu->arch.cpuid_entries[i].padding[1] = 0;
1189 		vcpu->arch.cpuid_entries[i].padding[2] = 0;
1190 	}
1191 	vcpu->arch.cpuid_nent = cpuid->nent;
1192 	cpuid_fix_nx_cap(vcpu);
1193 	r = 0;
1194 
1195 out_free:
1196 	vfree(cpuid_entries);
1197 out:
1198 	return r;
1199 }
1200 
1201 static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1202 				     struct kvm_cpuid2 *cpuid,
1203 				     struct kvm_cpuid_entry2 __user *entries)
1204 {
1205 	int r;
1206 
1207 	r = -E2BIG;
1208 	if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
1209 		goto out;
1210 	r = -EFAULT;
1211 	if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
1212 			   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
1213 		goto out;
1214 	vcpu->arch.cpuid_nent = cpuid->nent;
1215 	return 0;
1216 
1217 out:
1218 	return r;
1219 }
1220 
1221 static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
1222 				     struct kvm_cpuid2 *cpuid,
1223 				     struct kvm_cpuid_entry2 __user *entries)
1224 {
1225 	int r;
1226 
1227 	r = -E2BIG;
1228 	if (cpuid->nent < vcpu->arch.cpuid_nent)
1229 		goto out;
1230 	r = -EFAULT;
1231 	if (copy_to_user(entries, &vcpu->arch.cpuid_entries,
1232 			 vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
1233 		goto out;
1234 	return 0;
1235 
1236 out:
1237 	cpuid->nent = vcpu->arch.cpuid_nent;
1238 	return r;
1239 }
1240 
1241 static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1242 			   u32 index)
1243 {
1244 	entry->function = function;
1245 	entry->index = index;
1246 	cpuid_count(entry->function, entry->index,
1247 		    &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
1248 	entry->flags = 0;
1249 }
1250 
1251 #define F(x) bit(X86_FEATURE_##x)
1252 
1253 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1254 			 u32 index, int *nent, int maxnent)
1255 {
1256 	unsigned f_nx = is_efer_nx() ? F(NX) : 0;
1257 #ifdef CONFIG_X86_64
1258 	unsigned f_lm = F(LM);
1259 #else
1260 	unsigned f_lm = 0;
1261 #endif
1262 
1263 	/* cpuid 1.edx */
1264 	const u32 kvm_supported_word0_x86_features =
1265 		F(FPU) | F(VME) | F(DE) | F(PSE) |
1266 		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1267 		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
1268 		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1269 		F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
1270 		0 /* Reserved, DS, ACPI */ | F(MMX) |
1271 		F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
1272 		0 /* HTT, TM, Reserved, PBE */;
1273 	/* cpuid 0x80000001.edx */
1274 	const u32 kvm_supported_word1_x86_features =
1275 		F(FPU) | F(VME) | F(DE) | F(PSE) |
1276 		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
1277 		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
1278 		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1279 		F(PAT) | F(PSE36) | 0 /* Reserved */ |
1280 		f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
1281 		F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ |
1282 		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
1283 	/* cpuid 1.ecx */
1284 	const u32 kvm_supported_word4_x86_features =
1285 		F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
1286 		0 /* DS-CPL, VMX, SMX, EST */ |
1287 		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
1288 		0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
1289 		0 /* Reserved, DCA */ | F(XMM4_1) |
1290 		F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) |
1291 		0 /* Reserved, XSAVE, OSXSAVE */;
1292 	/* cpuid 0x80000001.ecx */
1293 	const u32 kvm_supported_word6_x86_features =
1294 		F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
1295 		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
1296 		F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
1297 		0 /* SKINIT */ | 0 /* WDT */;
1298 
1299 	/* all calls to cpuid_count() should be made on the same cpu */
1300 	get_cpu();
1301 	do_cpuid_1_ent(entry, function, index);
1302 	++*nent;
1303 
1304 	switch (function) {
1305 	case 0:
1306 		entry->eax = min(entry->eax, (u32)0xb);
1307 		break;
1308 	case 1:
1309 		entry->edx &= kvm_supported_word0_x86_features;
1310 		entry->ecx &= kvm_supported_word4_x86_features;
1311 		break;
1312 	/* function 2 entries are STATEFUL. That is, repeated cpuid commands
1313 	 * may return different values. This forces us to get_cpu() before
1314 	 * issuing the first command, and also to emulate this annoying behavior
1315 	 * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
1316 	case 2: {
1317 		int t, times = entry->eax & 0xff;
1318 
1319 		entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1320 		entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
1321 		for (t = 1; t < times && *nent < maxnent; ++t) {
1322 			do_cpuid_1_ent(&entry[t], function, 0);
1323 			entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
1324 			++*nent;
1325 		}
1326 		break;
1327 	}
1328 	/* function 4 and 0xb have additional index. */
1329 	case 4: {
1330 		int i, cache_type;
1331 
1332 		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1333 		/* read more entries until cache_type is zero */
1334 		for (i = 1; *nent < maxnent; ++i) {
1335 			cache_type = entry[i - 1].eax & 0x1f;
1336 			if (!cache_type)
1337 				break;
1338 			do_cpuid_1_ent(&entry[i], function, i);
1339 			entry[i].flags |=
1340 			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1341 			++*nent;
1342 		}
1343 		break;
1344 	}
1345 	case 0xb: {
1346 		int i, level_type;
1347 
1348 		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1349 		/* read more entries until level_type is zero */
1350 		for (i = 1; *nent < maxnent; ++i) {
1351 			level_type = entry[i - 1].ecx & 0xff00;
1352 			if (!level_type)
1353 				break;
1354 			do_cpuid_1_ent(&entry[i], function, i);
1355 			entry[i].flags |=
1356 			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
1357 			++*nent;
1358 		}
1359 		break;
1360 	}
1361 	case 0x80000000:
1362 		entry->eax = min(entry->eax, 0x8000001a);
1363 		break;
1364 	case 0x80000001:
1365 		entry->edx &= kvm_supported_word1_x86_features;
1366 		entry->ecx &= kvm_supported_word6_x86_features;
1367 		break;
1368 	}
1369 	put_cpu();
1370 }
1371 
1372 #undef F
1373 
1374 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
1375 				     struct kvm_cpuid_entry2 __user *entries)
1376 {
1377 	struct kvm_cpuid_entry2 *cpuid_entries;
1378 	int limit, nent = 0, r = -E2BIG;
1379 	u32 func;
1380 
1381 	if (cpuid->nent < 1)
1382 		goto out;
1383 	r = -ENOMEM;
1384 	cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
1385 	if (!cpuid_entries)
1386 		goto out;
1387 
1388 	do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
1389 	limit = cpuid_entries[0].eax;
1390 	for (func = 1; func <= limit && nent < cpuid->nent; ++func)
1391 		do_cpuid_ent(&cpuid_entries[nent], func, 0,
1392 			     &nent, cpuid->nent);
1393 	r = -E2BIG;
1394 	if (nent >= cpuid->nent)
1395 		goto out_free;
1396 
1397 	do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
1398 	limit = cpuid_entries[nent - 1].eax;
1399 	for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
1400 		do_cpuid_ent(&cpuid_entries[nent], func, 0,
1401 			     &nent, cpuid->nent);
1402 	r = -EFAULT;
1403 	if (copy_to_user(entries, cpuid_entries,
1404 			 nent * sizeof(struct kvm_cpuid_entry2)))
1405 		goto out_free;
1406 	cpuid->nent = nent;
1407 	r = 0;
1408 
1409 out_free:
1410 	vfree(cpuid_entries);
1411 out:
1412 	return r;
1413 }
1414 
1415 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
1416 				    struct kvm_lapic_state *s)
1417 {
1418 	vcpu_load(vcpu);
1419 	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
1420 	vcpu_put(vcpu);
1421 
1422 	return 0;
1423 }
1424 
1425 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
1426 				    struct kvm_lapic_state *s)
1427 {
1428 	vcpu_load(vcpu);
1429 	memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
1430 	kvm_apic_post_state_restore(vcpu);
1431 	vcpu_put(vcpu);
1432 
1433 	return 0;
1434 }
1435 
1436 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
1437 				    struct kvm_interrupt *irq)
1438 {
1439 	if (irq->irq < 0 || irq->irq >= 256)
1440 		return -EINVAL;
1441 	if (irqchip_in_kernel(vcpu->kvm))
1442 		return -ENXIO;
1443 	vcpu_load(vcpu);
1444 
1445 	kvm_queue_interrupt(vcpu, irq->irq, false);
1446 
1447 	vcpu_put(vcpu);
1448 
1449 	return 0;
1450 }
1451 
1452 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
1453 {
1454 	vcpu_load(vcpu);
1455 	kvm_inject_nmi(vcpu);
1456 	vcpu_put(vcpu);
1457 
1458 	return 0;
1459 }
1460 
1461 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
1462 					   struct kvm_tpr_access_ctl *tac)
1463 {
1464 	if (tac->flags)
1465 		return -EINVAL;
1466 	vcpu->arch.tpr_access_reporting = !!tac->enabled;
1467 	return 0;
1468 }
1469 
1470 long kvm_arch_vcpu_ioctl(struct file *filp,
1471 			 unsigned int ioctl, unsigned long arg)
1472 {
1473 	struct kvm_vcpu *vcpu = filp->private_data;
1474 	void __user *argp = (void __user *)arg;
1475 	int r;
1476 	struct kvm_lapic_state *lapic = NULL;
1477 
1478 	switch (ioctl) {
1479 	case KVM_GET_LAPIC: {
1480 		lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1481 
1482 		r = -ENOMEM;
1483 		if (!lapic)
1484 			goto out;
1485 		r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
1486 		if (r)
1487 			goto out;
1488 		r = -EFAULT;
1489 		if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
1490 			goto out;
1491 		r = 0;
1492 		break;
1493 	}
1494 	case KVM_SET_LAPIC: {
1495 		lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1496 		r = -ENOMEM;
1497 		if (!lapic)
1498 			goto out;
1499 		r = -EFAULT;
1500 		if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
1501 			goto out;
1502 		r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
1503 		if (r)
1504 			goto out;
1505 		r = 0;
1506 		break;
1507 	}
1508 	case KVM_INTERRUPT: {
1509 		struct kvm_interrupt irq;
1510 
1511 		r = -EFAULT;
1512 		if (copy_from_user(&irq, argp, sizeof irq))
1513 			goto out;
1514 		r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
1515 		if (r)
1516 			goto out;
1517 		r = 0;
1518 		break;
1519 	}
1520 	case KVM_NMI: {
1521 		r = kvm_vcpu_ioctl_nmi(vcpu);
1522 		if (r)
1523 			goto out;
1524 		r = 0;
1525 		break;
1526 	}
1527 	case KVM_SET_CPUID: {
1528 		struct kvm_cpuid __user *cpuid_arg = argp;
1529 		struct kvm_cpuid cpuid;
1530 
1531 		r = -EFAULT;
1532 		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1533 			goto out;
1534 		r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
1535 		if (r)
1536 			goto out;
1537 		break;
1538 	}
1539 	case KVM_SET_CPUID2: {
1540 		struct kvm_cpuid2 __user *cpuid_arg = argp;
1541 		struct kvm_cpuid2 cpuid;
1542 
1543 		r = -EFAULT;
1544 		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1545 			goto out;
1546 		r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
1547 					      cpuid_arg->entries);
1548 		if (r)
1549 			goto out;
1550 		break;
1551 	}
1552 	case KVM_GET_CPUID2: {
1553 		struct kvm_cpuid2 __user *cpuid_arg = argp;
1554 		struct kvm_cpuid2 cpuid;
1555 
1556 		r = -EFAULT;
1557 		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
1558 			goto out;
1559 		r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
1560 					      cpuid_arg->entries);
1561 		if (r)
1562 			goto out;
1563 		r = -EFAULT;
1564 		if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
1565 			goto out;
1566 		r = 0;
1567 		break;
1568 	}
1569 	case KVM_GET_MSRS:
1570 		r = msr_io(vcpu, argp, kvm_get_msr, 1);
1571 		break;
1572 	case KVM_SET_MSRS:
1573 		r = msr_io(vcpu, argp, do_set_msr, 0);
1574 		break;
1575 	case KVM_TPR_ACCESS_REPORTING: {
1576 		struct kvm_tpr_access_ctl tac;
1577 
1578 		r = -EFAULT;
1579 		if (copy_from_user(&tac, argp, sizeof tac))
1580 			goto out;
1581 		r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
1582 		if (r)
1583 			goto out;
1584 		r = -EFAULT;
1585 		if (copy_to_user(argp, &tac, sizeof tac))
1586 			goto out;
1587 		r = 0;
1588 		break;
1589 	}
1590 	case KVM_SET_VAPIC_ADDR: {
1591 		struct kvm_vapic_addr va;
1592 
1593 		r = -EINVAL;
1594 		if (!irqchip_in_kernel(vcpu->kvm))
1595 			goto out;
1596 		r = -EFAULT;
1597 		if (copy_from_user(&va, argp, sizeof va))
1598 			goto out;
1599 		r = 0;
1600 		kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
1601 		break;
1602 	}
1603 	default:
1604 		r = -EINVAL;
1605 	}
1606 out:
1607 	kfree(lapic);
1608 	return r;
1609 }
1610 
1611 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
1612 {
1613 	int ret;
1614 
1615 	if (addr > (unsigned int)(-3 * PAGE_SIZE))
1616 		return -1;
1617 	ret = kvm_x86_ops->set_tss_addr(kvm, addr);
1618 	return ret;
1619 }
1620 
1621 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1622 					  u32 kvm_nr_mmu_pages)
1623 {
1624 	if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
1625 		return -EINVAL;
1626 
1627 	down_write(&kvm->slots_lock);
1628 	spin_lock(&kvm->mmu_lock);
1629 
1630 	kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1631 	kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1632 
1633 	spin_unlock(&kvm->mmu_lock);
1634 	up_write(&kvm->slots_lock);
1635 	return 0;
1636 }
1637 
1638 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
1639 {
1640 	return kvm->arch.n_alloc_mmu_pages;
1641 }
1642 
1643 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1644 {
1645 	int i;
1646 	struct kvm_mem_alias *alias;
1647 
1648 	for (i = 0; i < kvm->arch.naliases; ++i) {
1649 		alias = &kvm->arch.aliases[i];
1650 		if (gfn >= alias->base_gfn
1651 		    && gfn < alias->base_gfn + alias->npages)
1652 			return alias->target_gfn + gfn - alias->base_gfn;
1653 	}
1654 	return gfn;
1655 }
1656 
1657 /*
1658  * Set a new alias region.  Aliases map a portion of physical memory into
1659  * another portion.  This is useful for memory windows, for example the PC
1660  * VGA region.
1661  */
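/*
 * For example (illustrative numbers), an alias with guest_phys_addr 0xa0000,
 * memory_size 0x10000 and target_phys_addr 0xc0000000 makes guest frames
 * 0xa0-0xaf resolve, via unalias_gfn() above, to frames 0xc0000-0xc000f.
 */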
1662 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1663 					 struct kvm_memory_alias *alias)
1664 {
1665 	int r, n;
1666 	struct kvm_mem_alias *p;
1667 
1668 	r = -EINVAL;
1669 	/* General sanity checks */
1670 	if (alias->memory_size & (PAGE_SIZE - 1))
1671 		goto out;
1672 	if (alias->guest_phys_addr & (PAGE_SIZE - 1))
1673 		goto out;
1674 	if (alias->slot >= KVM_ALIAS_SLOTS)
1675 		goto out;
1676 	if (alias->guest_phys_addr + alias->memory_size
1677 	    < alias->guest_phys_addr)
1678 		goto out;
1679 	if (alias->target_phys_addr + alias->memory_size
1680 	    < alias->target_phys_addr)
1681 		goto out;
1682 
1683 	down_write(&kvm->slots_lock);
1684 	spin_lock(&kvm->mmu_lock);
1685 
1686 	p = &kvm->arch.aliases[alias->slot];
1687 	p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
1688 	p->npages = alias->memory_size >> PAGE_SHIFT;
1689 	p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
1690 
1691 	for (n = KVM_ALIAS_SLOTS; n > 0; --n)
1692 		if (kvm->arch.aliases[n - 1].npages)
1693 			break;
1694 	kvm->arch.naliases = n;
1695 
1696 	spin_unlock(&kvm->mmu_lock);
1697 	kvm_mmu_zap_all(kvm);
1698 
1699 	up_write(&kvm->slots_lock);
1700 
1701 	return 0;
1702 
1703 out:
1704 	return r;
1705 }
1706 
1707 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1708 {
1709 	int r;
1710 
1711 	r = 0;
1712 	switch (chip->chip_id) {
1713 	case KVM_IRQCHIP_PIC_MASTER:
1714 		memcpy(&chip->chip.pic,
1715 			&pic_irqchip(kvm)->pics[0],
1716 			sizeof(struct kvm_pic_state));
1717 		break;
1718 	case KVM_IRQCHIP_PIC_SLAVE:
1719 		memcpy(&chip->chip.pic,
1720 			&pic_irqchip(kvm)->pics[1],
1721 			sizeof(struct kvm_pic_state));
1722 		break;
1723 	case KVM_IRQCHIP_IOAPIC:
1724 		memcpy(&chip->chip.ioapic,
1725 			ioapic_irqchip(kvm),
1726 			sizeof(struct kvm_ioapic_state));
1727 		break;
1728 	default:
1729 		r = -EINVAL;
1730 		break;
1731 	}
1732 	return r;
1733 }
1734 
1735 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
1736 {
1737 	int r;
1738 
1739 	r = 0;
1740 	switch (chip->chip_id) {
1741 	case KVM_IRQCHIP_PIC_MASTER:
1742 		memcpy(&pic_irqchip(kvm)->pics[0],
1743 			&chip->chip.pic,
1744 			sizeof(struct kvm_pic_state));
1745 		break;
1746 	case KVM_IRQCHIP_PIC_SLAVE:
1747 		memcpy(&pic_irqchip(kvm)->pics[1],
1748 			&chip->chip.pic,
1749 			sizeof(struct kvm_pic_state));
1750 		break;
1751 	case KVM_IRQCHIP_IOAPIC:
1752 		memcpy(ioapic_irqchip(kvm),
1753 			&chip->chip.ioapic,
1754 			sizeof(struct kvm_ioapic_state));
1755 		break;
1756 	default:
1757 		r = -EINVAL;
1758 		break;
1759 	}
1760 	kvm_pic_update_irq(pic_irqchip(kvm));
1761 	return r;
1762 }
1763 
1764 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1765 {
1766 	int r = 0;
1767 
1768 	memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
1769 	return r;
1770 }
1771 
1772 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
1773 {
1774 	int r = 0;
1775 
1776 	memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
1777 	kvm_pit_load_count(kvm, 0, ps->channels[0].count);
1778 	return r;
1779 }
1780 
1781 static int kvm_vm_ioctl_reinject(struct kvm *kvm,
1782 				 struct kvm_reinject_control *control)
1783 {
1784 	if (!kvm->arch.vpit)
1785 		return -ENXIO;
1786 	kvm->arch.vpit->pit_state.pit_timer.reinject = control->pit_reinject;
1787 	return 0;
1788 }
1789 
1790 /*
1791  * Get (and clear) the dirty memory log for a memory slot.
1792  */
1793 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
1794 				      struct kvm_dirty_log *log)
1795 {
1796 	int r;
1797 	int n;
1798 	struct kvm_memory_slot *memslot;
1799 	int is_dirty = 0;
1800 
1801 	down_write(&kvm->slots_lock);
1802 
1803 	r = kvm_get_dirty_log(kvm, log, &is_dirty);
1804 	if (r)
1805 		goto out;
1806 
1807 	/* If nothing is dirty, don't bother messing with page tables. */
1808 	if (is_dirty) {
1809 		spin_lock(&kvm->mmu_lock);
1810 		kvm_mmu_slot_remove_write_access(kvm, log->slot);
1811 		spin_unlock(&kvm->mmu_lock);
1812 		kvm_flush_remote_tlbs(kvm);
1813 		memslot = &kvm->memslots[log->slot];
1814 		n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
1815 		memset(memslot->dirty_bitmap, 0, n);
1816 	}
1817 	r = 0;
1818 out:
1819 	up_write(&kvm->slots_lock);
1820 	return r;
1821 }
1822 
1823 long kvm_arch_vm_ioctl(struct file *filp,
1824 		       unsigned int ioctl, unsigned long arg)
1825 {
1826 	struct kvm *kvm = filp->private_data;
1827 	void __user *argp = (void __user *)arg;
1828 	int r = -EINVAL;
1829 	/*
1830 	 * This union makes it completely explicit to gcc-3.x
1831 	 * that these two variables' stack usage should be
1832 	 * combined, not added together.
1833 	 */
1834 	union {
1835 		struct kvm_pit_state ps;
1836 		struct kvm_memory_alias alias;
1837 	} u;
1838 
1839 	switch (ioctl) {
1840 	case KVM_SET_TSS_ADDR:
1841 		r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
1842 		if (r < 0)
1843 			goto out;
1844 		break;
1845 	case KVM_SET_MEMORY_REGION: {
1846 		struct kvm_memory_region kvm_mem;
1847 		struct kvm_userspace_memory_region kvm_userspace_mem;
1848 
1849 		r = -EFAULT;
1850 		if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
1851 			goto out;
1852 		kvm_userspace_mem.slot = kvm_mem.slot;
1853 		kvm_userspace_mem.flags = kvm_mem.flags;
1854 		kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
1855 		kvm_userspace_mem.memory_size = kvm_mem.memory_size;
1856 		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
1857 		if (r)
1858 			goto out;
1859 		break;
1860 	}
1861 	case KVM_SET_NR_MMU_PAGES:
1862 		r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
1863 		if (r)
1864 			goto out;
1865 		break;
1866 	case KVM_GET_NR_MMU_PAGES:
1867 		r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1868 		break;
1869 	case KVM_SET_MEMORY_ALIAS:
1870 		r = -EFAULT;
1871 		if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
1872 			goto out;
1873 		r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
1874 		if (r)
1875 			goto out;
1876 		break;
1877 	case KVM_CREATE_IRQCHIP:
1878 		r = -ENOMEM;
1879 		kvm->arch.vpic = kvm_create_pic(kvm);
1880 		if (kvm->arch.vpic) {
1881 			r = kvm_ioapic_init(kvm);
1882 			if (r) {
1883 				kfree(kvm->arch.vpic);
1884 				kvm->arch.vpic = NULL;
1885 				goto out;
1886 			}
1887 		} else
1888 			goto out;
1889 		r = kvm_setup_default_irq_routing(kvm);
1890 		if (r) {
1891 			kfree(kvm->arch.vpic);
1892 			kfree(kvm->arch.vioapic);
1893 			goto out;
1894 		}
1895 		break;
1896 	case KVM_CREATE_PIT:
1897 		mutex_lock(&kvm->lock);
1898 		r = -EEXIST;
1899 		if (kvm->arch.vpit)
1900 			goto create_pit_unlock;
1901 		r = -ENOMEM;
1902 		kvm->arch.vpit = kvm_create_pit(kvm);
1903 		if (kvm->arch.vpit)
1904 			r = 0;
1905 	create_pit_unlock:
1906 		mutex_unlock(&kvm->lock);
1907 		break;
1908 	case KVM_IRQ_LINE_STATUS:
1909 	case KVM_IRQ_LINE: {
1910 		struct kvm_irq_level irq_event;
1911 
1912 		r = -EFAULT;
1913 		if (copy_from_user(&irq_event, argp, sizeof irq_event))
1914 			goto out;
1915 		if (irqchip_in_kernel(kvm)) {
1916 			__s32 status;
1917 			mutex_lock(&kvm->lock);
1918 			status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
1919 					irq_event.irq, irq_event.level);
1920 			mutex_unlock(&kvm->lock);
1921 			if (ioctl == KVM_IRQ_LINE_STATUS) {
1922 				irq_event.status = status;
1923 				if (copy_to_user(argp, &irq_event,
1924 							sizeof irq_event))
1925 					goto out;
1926 			}
1927 			r = 0;
1928 		}
1929 		break;
1930 	}
1931 	case KVM_GET_IRQCHIP: {
1932 		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1933 		struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1934 
1935 		r = -ENOMEM;
1936 		if (!chip)
1937 			goto out;
1938 		r = -EFAULT;
1939 		if (copy_from_user(chip, argp, sizeof *chip))
1940 			goto get_irqchip_out;
1941 		r = -ENXIO;
1942 		if (!irqchip_in_kernel(kvm))
1943 			goto get_irqchip_out;
1944 		r = kvm_vm_ioctl_get_irqchip(kvm, chip);
1945 		if (r)
1946 			goto get_irqchip_out;
1947 		r = -EFAULT;
1948 		if (copy_to_user(argp, chip, sizeof *chip))
1949 			goto get_irqchip_out;
1950 		r = 0;
1951 	get_irqchip_out:
1952 		kfree(chip);
1953 		if (r)
1954 			goto out;
1955 		break;
1956 	}
1957 	case KVM_SET_IRQCHIP: {
1958 		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1959 		struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1960 
1961 		r = -ENOMEM;
1962 		if (!chip)
1963 			goto out;
1964 		r = -EFAULT;
1965 		if (copy_from_user(chip, argp, sizeof *chip))
1966 			goto set_irqchip_out;
1967 		r = -ENXIO;
1968 		if (!irqchip_in_kernel(kvm))
1969 			goto set_irqchip_out;
1970 		r = kvm_vm_ioctl_set_irqchip(kvm, chip);
1971 		if (r)
1972 			goto set_irqchip_out;
1973 		r = 0;
1974 	set_irqchip_out:
1975 		kfree(chip);
1976 		if (r)
1977 			goto out;
1978 		break;
1979 	}
1980 	case KVM_GET_PIT: {
1981 		r = -EFAULT;
1982 		if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
1983 			goto out;
1984 		r = -ENXIO;
1985 		if (!kvm->arch.vpit)
1986 			goto out;
1987 		r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
1988 		if (r)
1989 			goto out;
1990 		r = -EFAULT;
1991 		if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
1992 			goto out;
1993 		r = 0;
1994 		break;
1995 	}
1996 	case KVM_SET_PIT: {
1997 		r = -EFAULT;
1998 		if (copy_from_user(&u.ps, argp, sizeof u.ps))
1999 			goto out;
2000 		r = -ENXIO;
2001 		if (!kvm->arch.vpit)
2002 			goto out;
2003 		r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
2004 		if (r)
2005 			goto out;
2006 		r = 0;
2007 		break;
2008 	}
2009 	case KVM_REINJECT_CONTROL: {
2010 		struct kvm_reinject_control control;
2011 		r = -EFAULT;
2012 		if (copy_from_user(&control, argp, sizeof(control)))
2013 			goto out;
2014 		r = kvm_vm_ioctl_reinject(kvm, &control);
2015 		if (r)
2016 			goto out;
2017 		r = 0;
2018 		break;
2019 	}
2020 	default:
2021 		;
2022 	}
2023 out:
2024 	return r;
2025 }
2026 
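/*
 * Probe each MSR in msrs_to_save with rdmsr_safe() and compact the
 * array in place so that only MSRs which exist on this host are kept;
 * num_msrs_to_save is updated to the new count.
 */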
2027 static void kvm_init_msr_list(void)
2028 {
2029 	u32 dummy[2];
2030 	unsigned i, j;
2031 
2032 	for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
2033 		if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2034 			continue;
2035 		if (j < i)
2036 			msrs_to_save[j] = msrs_to_save[i];
2037 		j++;
2038 	}
2039 	num_msrs_to_save = j;
2040 }
2041 
2042 /*
2043  * Only the local APIC needs an MMIO device hook, so take a shortcut here.
2044  */
2045 static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
2046 						gpa_t addr, int len,
2047 						int is_write)
2048 {
2049 	struct kvm_io_device *dev;
2050 
2051 	if (vcpu->arch.apic) {
2052 		dev = &vcpu->arch.apic->dev;
2053 		if (dev->in_range(dev, addr, len, is_write))
2054 			return dev;
2055 	}
2056 	return NULL;
2057 }
2058 
2059 
2060 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
2061 						gpa_t addr, int len,
2062 						int is_write)
2063 {
2064 	struct kvm_io_device *dev;
2065 
2066 	dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write);
2067 	if (dev == NULL)
2068 		dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
2069 					  is_write);
2070 	return dev;
2071 }
2072 
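/*
 * Copy data from a guest virtual address: walk the range page by page,
 * translating each piece with mmu.gva_to_gpa() and reading it with
 * kvm_read_guest().  Returns X86EMUL_PROPAGATE_FAULT on an unmapped GVA
 * and X86EMUL_UNHANDLEABLE if the backing memory cannot be read.
 * kvm_write_guest_virt() below is the mirror image for stores.
 */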
2073 static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
2074 			       struct kvm_vcpu *vcpu)
2075 {
2076 	void *data = val;
2077 	int r = X86EMUL_CONTINUE;
2078 
2079 	while (bytes) {
2080 		gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2081 		unsigned offset = addr & (PAGE_SIZE-1);
2082 		unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
2083 		int ret;
2084 
2085 		if (gpa == UNMAPPED_GVA) {
2086 			r = X86EMUL_PROPAGATE_FAULT;
2087 			goto out;
2088 		}
2089 		ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
2090 		if (ret < 0) {
2091 			r = X86EMUL_UNHANDLEABLE;
2092 			goto out;
2093 		}
2094 
2095 		bytes -= toread;
2096 		data += toread;
2097 		addr += toread;
2098 	}
2099 out:
2100 	return r;
2101 }
2102 
2103 static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
2104 				struct kvm_vcpu *vcpu)
2105 {
2106 	void *data = val;
2107 	int r = X86EMUL_CONTINUE;
2108 
2109 	while (bytes) {
2110 		gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2111 		unsigned offset = addr & (PAGE_SIZE-1);
2112 		unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
2113 		int ret;
2114 
2115 		if (gpa == UNMAPPED_GVA) {
2116 			r = X86EMUL_PROPAGATE_FAULT;
2117 			goto out;
2118 		}
2119 		ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
2120 		if (ret < 0) {
2121 			r = X86EMUL_UNHANDLEABLE;
2122 			goto out;
2123 		}
2124 
2125 		bytes -= towrite;
2126 		data += towrite;
2127 		addr += towrite;
2128 	}
2129 out:
2130 	return r;
2131 }
2132 
2133 
2134 static int emulator_read_emulated(unsigned long addr,
2135 				  void *val,
2136 				  unsigned int bytes,
2137 				  struct kvm_vcpu *vcpu)
2138 {
2139 	struct kvm_io_device *mmio_dev;
2140 	gpa_t                 gpa;
2141 
2142 	if (vcpu->mmio_read_completed) {
2143 		memcpy(val, vcpu->mmio_data, bytes);
2144 		vcpu->mmio_read_completed = 0;
2145 		return X86EMUL_CONTINUE;
2146 	}
2147 
2148 	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2149 
2150 	/* For APIC access vmexit */
2151 	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2152 		goto mmio;
2153 
2154 	if (kvm_read_guest_virt(addr, val, bytes, vcpu)
2155 				== X86EMUL_CONTINUE)
2156 		return X86EMUL_CONTINUE;
2157 	if (gpa == UNMAPPED_GVA)
2158 		return X86EMUL_PROPAGATE_FAULT;
2159 
2160 mmio:
2161 	/*
2162 	 * Is this MMIO handled locally?
2163 	 */
2164 	mutex_lock(&vcpu->kvm->lock);
2165 	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
2166 	if (mmio_dev) {
2167 		kvm_iodevice_read(mmio_dev, gpa, bytes, val);
2168 		mutex_unlock(&vcpu->kvm->lock);
2169 		return X86EMUL_CONTINUE;
2170 	}
2171 	mutex_unlock(&vcpu->kvm->lock);
2172 
2173 	vcpu->mmio_needed = 1;
2174 	vcpu->mmio_phys_addr = gpa;
2175 	vcpu->mmio_size = bytes;
2176 	vcpu->mmio_is_write = 0;
2177 
2178 	return X86EMUL_UNHANDLEABLE;
2179 }
2180 
2181 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
2182 			  const void *val, int bytes)
2183 {
2184 	int ret;
2185 
2186 	ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
2187 	if (ret < 0)
2188 		return 0;
2189 	kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
2190 	return 1;
2191 }
2192 
2193 static int emulator_write_emulated_onepage(unsigned long addr,
2194 					   const void *val,
2195 					   unsigned int bytes,
2196 					   struct kvm_vcpu *vcpu)
2197 {
2198 	struct kvm_io_device *mmio_dev;
2199 	gpa_t                 gpa;
2200 
2201 	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2202 
2203 	if (gpa == UNMAPPED_GVA) {
2204 		kvm_inject_page_fault(vcpu, addr, 2);
2205 		return X86EMUL_PROPAGATE_FAULT;
2206 	}
2207 
2208 	/* For APIC access vmexit */
2209 	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2210 		goto mmio;
2211 
2212 	if (emulator_write_phys(vcpu, gpa, val, bytes))
2213 		return X86EMUL_CONTINUE;
2214 
2215 mmio:
2216 	/*
2217 	 * Is this MMIO handled locally?
2218 	 */
2219 	mutex_lock(&vcpu->kvm->lock);
2220 	mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
2221 	if (mmio_dev) {
2222 		kvm_iodevice_write(mmio_dev, gpa, bytes, val);
2223 		mutex_unlock(&vcpu->kvm->lock);
2224 		return X86EMUL_CONTINUE;
2225 	}
2226 	mutex_unlock(&vcpu->kvm->lock);
2227 
2228 	vcpu->mmio_needed = 1;
2229 	vcpu->mmio_phys_addr = gpa;
2230 	vcpu->mmio_size = bytes;
2231 	vcpu->mmio_is_write = 1;
2232 	memcpy(vcpu->mmio_data, val, bytes);
2233 
2234 	return X86EMUL_CONTINUE;
2235 }
2236 
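/*
 * Emulated write that may cross a page boundary: split it so that each
 * chunk is handled by emulator_write_emulated_onepage() within a single
 * page.
 */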
2237 int emulator_write_emulated(unsigned long addr,
2238 				   const void *val,
2239 				   unsigned int bytes,
2240 				   struct kvm_vcpu *vcpu)
2241 {
2242 	/* Crossing a page boundary? */
2243 	if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
2244 		int rc, now;
2245 
2246 		now = -addr & ~PAGE_MASK;
2247 		rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
2248 		if (rc != X86EMUL_CONTINUE)
2249 			return rc;
2250 		addr += now;
2251 		val += now;
2252 		bytes -= now;
2253 	}
2254 	return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
2255 }
2256 EXPORT_SYMBOL_GPL(emulator_write_emulated);
2257 
2258 static int emulator_cmpxchg_emulated(unsigned long addr,
2259 				     const void *old,
2260 				     const void *new,
2261 				     unsigned int bytes,
2262 				     struct kvm_vcpu *vcpu)
2263 {
2264 	static int reported;
2265 
2266 	if (!reported) {
2267 		reported = 1;
2268 		printk(KERN_WARNING "kvm: emulating exchange as write\n");
2269 	}
2270 #ifndef CONFIG_X86_64
2271 	/* a guest's cmpxchg8b has to be emulated atomically */
2272 	if (bytes == 8) {
2273 		gpa_t gpa;
2274 		struct page *page;
2275 		char *kaddr;
2276 		u64 val;
2277 
2278 		gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
2279 
2280 		if (gpa == UNMAPPED_GVA ||
2281 		   (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2282 			goto emul_write;
2283 
2284 		if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
2285 			goto emul_write;
2286 
2287 		val = *(u64 *)new;
2288 
2289 		page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2290 
2291 		kaddr = kmap_atomic(page, KM_USER0);
2292 		set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
2293 		kunmap_atomic(kaddr, KM_USER0);
2294 		kvm_release_page_dirty(page);
2295 	}
2296 emul_write:
2297 #endif
2298 
2299 	return emulator_write_emulated(addr, new, bytes, vcpu);
2300 }
2301 
2302 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
2303 {
2304 	return kvm_x86_ops->get_segment_base(vcpu, seg);
2305 }
2306 
2307 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2308 {
2309 	kvm_mmu_invlpg(vcpu, address);
2310 	return X86EMUL_CONTINUE;
2311 }
2312 
2313 int emulate_clts(struct kvm_vcpu *vcpu)
2314 {
2315 	KVMTRACE_0D(CLTS, vcpu, handler);
2316 	kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
2317 	return X86EMUL_CONTINUE;
2318 }
2319 
2320 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
2321 {
2322 	struct kvm_vcpu *vcpu = ctxt->vcpu;
2323 
2324 	switch (dr) {
2325 	case 0 ... 3:
2326 		*dest = kvm_x86_ops->get_dr(vcpu, dr);
2327 		return X86EMUL_CONTINUE;
2328 	default:
2329 		pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
2330 		return X86EMUL_UNHANDLEABLE;
2331 	}
2332 }
2333 
2334 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2335 {
2336 	unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
2337 	int exception;
2338 
2339 	kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
2340 	if (exception) {
2341 		/* FIXME: better handling */
2342 		return X86EMUL_UNHANDLEABLE;
2343 	}
2344 	return X86EMUL_CONTINUE;
2345 }
2346 
2347 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2348 {
2349 	u8 opcodes[4];
2350 	unsigned long rip = kvm_rip_read(vcpu);
2351 	unsigned long rip_linear;
2352 
2353 	if (!printk_ratelimit())
2354 		return;
2355 
2356 	rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
2357 
2358 	kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu);
2359 
2360 	printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
2361 	       context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2362 }
2363 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
2364 
2365 static struct x86_emulate_ops emulate_ops = {
2366 	.read_std            = kvm_read_guest_virt,
2367 	.read_emulated       = emulator_read_emulated,
2368 	.write_emulated      = emulator_write_emulated,
2369 	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
2370 };
2371 
2372 static void cache_all_regs(struct kvm_vcpu *vcpu)
2373 {
2374 	kvm_register_read(vcpu, VCPU_REGS_RAX);
2375 	kvm_register_read(vcpu, VCPU_REGS_RSP);
2376 	kvm_register_read(vcpu, VCPU_REGS_RIP);
2377 	vcpu->arch.regs_dirty = ~0;
2378 }
2379 
2380 int emulate_instruction(struct kvm_vcpu *vcpu,
2381 			struct kvm_run *run,
2382 			unsigned long cr2,
2383 			u16 error_code,
2384 			int emulation_type)
2385 {
2386 	int r, shadow_mask;
2387 	struct decode_cache *c;
2388 
2389 	kvm_clear_exception_queue(vcpu);
2390 	vcpu->arch.mmio_fault_cr2 = cr2;
2391 	/*
2392 	 * TODO: fix x86_emulate.c to use guest_read/write_register
2393 	 * instead of direct ->regs accesses; this can save hundreds of cycles
2394 	 * on Intel for instructions that don't read/change RSP, for
2395 	 * example.
2396 	 */
2397 	cache_all_regs(vcpu);
2398 
2399 	vcpu->mmio_is_write = 0;
2400 	vcpu->arch.pio.string = 0;
2401 
2402 	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
2403 		int cs_db, cs_l;
2404 		kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
2405 
2406 		vcpu->arch.emulate_ctxt.vcpu = vcpu;
2407 		vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
2408 		vcpu->arch.emulate_ctxt.mode =
2409 			(vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
2410 			? X86EMUL_MODE_REAL : cs_l
2411 			? X86EMUL_MODE_PROT64 :	cs_db
2412 			? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2413 
2414 		r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2415 
2416 		/* Reject instructions other than VMCALL/VMMCALL when
2417 		 * trying to emulate an invalid opcode */
2418 		c = &vcpu->arch.emulate_ctxt.decode;
2419 		if ((emulation_type & EMULTYPE_TRAP_UD) &&
2420 		    (!(c->twobyte && c->b == 0x01 &&
2421 		      (c->modrm_reg == 0 || c->modrm_reg == 3) &&
2422 		       c->modrm_mod == 3 && c->modrm_rm == 1)))
2423 			return EMULATE_FAIL;
2424 
2425 		++vcpu->stat.insn_emulation;
2426 		if (r)  {
2427 			++vcpu->stat.insn_emulation_fail;
2428 			if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2429 				return EMULATE_DONE;
2430 			return EMULATE_FAIL;
2431 		}
2432 	}
2433 
2434 	if (emulation_type & EMULTYPE_SKIP) {
2435 		kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
2436 		return EMULATE_DONE;
2437 	}
2438 
2439 	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2440 	shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
2441 
2442 	if (r == 0)
2443 		kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
2444 
2445 	if (vcpu->arch.pio.string)
2446 		return EMULATE_DO_MMIO;
2447 
2448 	if ((r || vcpu->mmio_is_write) && run) {
2449 		run->exit_reason = KVM_EXIT_MMIO;
2450 		run->mmio.phys_addr = vcpu->mmio_phys_addr;
2451 		memcpy(run->mmio.data, vcpu->mmio_data, 8);
2452 		run->mmio.len = vcpu->mmio_size;
2453 		run->mmio.is_write = vcpu->mmio_is_write;
2454 	}
2455 
2456 	if (r) {
2457 		if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
2458 			return EMULATE_DONE;
2459 		if (!vcpu->mmio_needed) {
2460 			kvm_report_emulation_failure(vcpu, "mmio");
2461 			return EMULATE_FAIL;
2462 		}
2463 		return EMULATE_DO_MMIO;
2464 	}
2465 
2466 	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2467 
2468 	if (vcpu->mmio_is_write) {
2469 		vcpu->mmio_needed = 0;
2470 		return EMULATE_DO_MMIO;
2471 	}
2472 
2473 	return EMULATE_DONE;
2474 }
2475 EXPORT_SYMBOL_GPL(emulate_instruction);
2476 
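/*
 * Copy the string PIO payload between the per-vcpu pio_data page and the
 * guest buffer: data is written to guest memory for IN operations and
 * read from guest memory for OUT operations.
 */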
2477 static int pio_copy_data(struct kvm_vcpu *vcpu)
2478 {
2479 	void *p = vcpu->arch.pio_data;
2480 	gva_t q = vcpu->arch.pio.guest_gva;
2481 	unsigned bytes;
2482 	int ret;
2483 
2484 	bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
2485 	if (vcpu->arch.pio.in)
2486 		ret = kvm_write_guest_virt(q, p, bytes, vcpu);
2487 	else
2488 		ret = kvm_read_guest_virt(q, p, bytes, vcpu);
2489 	return ret;
2490 }
2491 
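/*
 * Finish a PIO operation once the data is available: for a single IN,
 * merge the result into RAX; for string I/O, copy IN data back to the
 * guest, decrement RCX by the element count for REP, and advance RSI or
 * RDI by the number of bytes transferred (backwards if the direction
 * flag is set).
 */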
2492 int complete_pio(struct kvm_vcpu *vcpu)
2493 {
2494 	struct kvm_pio_request *io = &vcpu->arch.pio;
2495 	long delta;
2496 	int r;
2497 	unsigned long val;
2498 
2499 	if (!io->string) {
2500 		if (io->in) {
2501 			val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2502 			memcpy(&val, vcpu->arch.pio_data, io->size);
2503 			kvm_register_write(vcpu, VCPU_REGS_RAX, val);
2504 		}
2505 	} else {
2506 		if (io->in) {
2507 			r = pio_copy_data(vcpu);
2508 			if (r)
2509 				return r;
2510 		}
2511 
2512 		delta = 1;
2513 		if (io->rep) {
2514 			delta *= io->cur_count;
2515 			/*
2516 			 * The size of the register should really depend on the
2517 			 * current address size.
2518 			 */
2519 			val = kvm_register_read(vcpu, VCPU_REGS_RCX);
2520 			val -= delta;
2521 			kvm_register_write(vcpu, VCPU_REGS_RCX, val);
2522 		}
2523 		if (io->down)
2524 			delta = -delta;
2525 		delta *= io->size;
2526 		if (io->in) {
2527 			val = kvm_register_read(vcpu, VCPU_REGS_RDI);
2528 			val += delta;
2529 			kvm_register_write(vcpu, VCPU_REGS_RDI, val);
2530 		} else {
2531 			val = kvm_register_read(vcpu, VCPU_REGS_RSI);
2532 			val += delta;
2533 			kvm_register_write(vcpu, VCPU_REGS_RSI, val);
2534 		}
2535 	}
2536 
2537 	io->count -= io->cur_count;
2538 	io->cur_count = 0;
2539 
2540 	return 0;
2541 }
2542 
2543 static void kernel_pio(struct kvm_io_device *pio_dev,
2544 		       struct kvm_vcpu *vcpu,
2545 		       void *pd)
2546 {
2547 	/* TODO: String I/O for in-kernel devices */
2548 
2549 	mutex_lock(&vcpu->kvm->lock);
2550 	if (vcpu->arch.pio.in)
2551 		kvm_iodevice_read(pio_dev, vcpu->arch.pio.port,
2552 				  vcpu->arch.pio.size,
2553 				  pd);
2554 	else
2555 		kvm_iodevice_write(pio_dev, vcpu->arch.pio.port,
2556 				   vcpu->arch.pio.size,
2557 				   pd);
2558 	mutex_unlock(&vcpu->kvm->lock);
2559 }
2560 
2561 static void pio_string_write(struct kvm_io_device *pio_dev,
2562 			     struct kvm_vcpu *vcpu)
2563 {
2564 	struct kvm_pio_request *io = &vcpu->arch.pio;
2565 	void *pd = vcpu->arch.pio_data;
2566 	int i;
2567 
2568 	mutex_lock(&vcpu->kvm->lock);
2569 	for (i = 0; i < io->cur_count; i++) {
2570 		kvm_iodevice_write(pio_dev, io->port,
2571 				   io->size,
2572 				   pd);
2573 		pd += io->size;
2574 	}
2575 	mutex_unlock(&vcpu->kvm->lock);
2576 }
2577 
2578 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
2579 					       gpa_t addr, int len,
2580 					       int is_write)
2581 {
2582 	return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
2583 }
2584 
2585 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2586 		  int size, unsigned port)
2587 {
2588 	struct kvm_io_device *pio_dev;
2589 	unsigned long val;
2590 
2591 	vcpu->run->exit_reason = KVM_EXIT_IO;
2592 	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2593 	vcpu->run->io.size = vcpu->arch.pio.size = size;
2594 	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2595 	vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = 1;
2596 	vcpu->run->io.port = vcpu->arch.pio.port = port;
2597 	vcpu->arch.pio.in = in;
2598 	vcpu->arch.pio.string = 0;
2599 	vcpu->arch.pio.down = 0;
2600 	vcpu->arch.pio.rep = 0;
2601 
2602 	if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
2603 		KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
2604 			    handler);
2605 	else
2606 		KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2607 			    handler);
2608 
2609 	val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2610 	memcpy(vcpu->arch.pio_data, &val, 4);
2611 
2612 	pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
2613 	if (pio_dev) {
2614 		kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
2615 		complete_pio(vcpu);
2616 		return 1;
2617 	}
2618 	return 0;
2619 }
2620 EXPORT_SYMBOL_GPL(kvm_emulate_pio);
2621 
2622 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2623 		  int size, unsigned long count, int down,
2624 		  gva_t address, int rep, unsigned port)
2625 {
2626 	unsigned now, in_page;
2627 	int ret = 0;
2628 	struct kvm_io_device *pio_dev;
2629 
2630 	vcpu->run->exit_reason = KVM_EXIT_IO;
2631 	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2632 	vcpu->run->io.size = vcpu->arch.pio.size = size;
2633 	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2634 	vcpu->run->io.count = vcpu->arch.pio.count = vcpu->arch.pio.cur_count = count;
2635 	vcpu->run->io.port = vcpu->arch.pio.port = port;
2636 	vcpu->arch.pio.in = in;
2637 	vcpu->arch.pio.string = 1;
2638 	vcpu->arch.pio.down = down;
2639 	vcpu->arch.pio.rep = rep;
2640 
2641 	if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
2642 		KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
2643 			    handler);
2644 	else
2645 		KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2646 			    handler);
2647 
2648 	if (!count) {
2649 		kvm_x86_ops->skip_emulated_instruction(vcpu);
2650 		return 1;
2651 	}
2652 
2653 	if (!down)
2654 		in_page = PAGE_SIZE - offset_in_page(address);
2655 	else
2656 		in_page = offset_in_page(address) + size;
2657 	now = min(count, (unsigned long)in_page / size);
2658 	if (!now)
2659 		now = 1;
2660 	if (down) {
2661 		/*
2662 		 * String I/O in reverse.  Yuck.  Kill the guest, fix later.
2663 		 */
2664 		pr_unimpl(vcpu, "guest string pio down\n");
2665 		kvm_inject_gp(vcpu, 0);
2666 		return 1;
2667 	}
2668 	vcpu->run->io.count = now;
2669 	vcpu->arch.pio.cur_count = now;
2670 
2671 	if (vcpu->arch.pio.cur_count == vcpu->arch.pio.count)
2672 		kvm_x86_ops->skip_emulated_instruction(vcpu);
2673 
2674 	vcpu->arch.pio.guest_gva = address;
2675 
2676 	pio_dev = vcpu_find_pio_dev(vcpu, port,
2677 				    vcpu->arch.pio.cur_count,
2678 				    !vcpu->arch.pio.in);
2679 	if (!vcpu->arch.pio.in) {
2680 		/* string PIO write */
2681 		ret = pio_copy_data(vcpu);
2682 		if (ret == X86EMUL_PROPAGATE_FAULT) {
2683 			kvm_inject_gp(vcpu, 0);
2684 			return 1;
2685 		}
2686 		if (ret == 0 && pio_dev) {
2687 			pio_string_write(pio_dev, vcpu);
2688 			complete_pio(vcpu);
2689 			if (vcpu->arch.pio.count == 0)
2690 				ret = 1;
2691 		}
2692 	} else if (pio_dev)
2693 		pr_unimpl(vcpu, "no string pio read support yet, "
2694 		       "port %x size %d count %ld\n",
2695 			port, size, count);
2696 
2697 	return ret;
2698 }
2699 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
2700 
2701 static void bounce_off(void *info)
2702 {
2703 	/* nothing */
2704 }
2705 
2706 static unsigned int  ref_freq;
2707 static unsigned long tsc_khz_ref;
2708 
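/*
 * cpufreq transition notifier: rescale the per-cpu TSC frequency and ask
 * every vcpu running on the affected cpu to refresh its kvmclock
 * parameters.  When the frequency goes up, remote cpus are kicked out of
 * guest mode so the guest never sees the clock run backwards.
 */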
2709 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
2710 				     void *data)
2711 {
2712 	struct cpufreq_freqs *freq = data;
2713 	struct kvm *kvm;
2714 	struct kvm_vcpu *vcpu;
2715 	int i, send_ipi = 0;
2716 
2717 	if (!ref_freq)
2718 		ref_freq = freq->old;
2719 
2720 	if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
2721 		return 0;
2722 	if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
2723 		return 0;
2724 	per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new);
2725 
2726 	spin_lock(&kvm_lock);
2727 	list_for_each_entry(kvm, &vm_list, vm_list) {
2728 		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
2729 			vcpu = kvm->vcpus[i];
2730 			if (!vcpu)
2731 				continue;
2732 			if (vcpu->cpu != freq->cpu)
2733 				continue;
2734 			if (!kvm_request_guest_time_update(vcpu))
2735 				continue;
2736 			if (vcpu->cpu != smp_processor_id())
2737 				send_ipi++;
2738 		}
2739 	}
2740 	spin_unlock(&kvm_lock);
2741 
2742 	if (freq->old < freq->new && send_ipi) {
2743 		/*
2744 		 * We are scaling the frequency up.  We must make sure the
2745 		 * guest doesn't see old kvmclock values while running with
2746 		 * the new frequency; otherwise we risk that the guest sees
2747 		 * time go backwards.
2748 		 *
2749 		 * In case we update the frequency for another cpu
2750 		 * (which might be in guest context) send an interrupt
2751 		 * to kick the cpu out of guest context.  Next time
2752 		 * guest context is entered kvmclock will be updated,
2753 		 * so the guest will not see stale values.
2754 		 */
2755 		smp_call_function_single(freq->cpu, bounce_off, NULL, 1);
2756 	}
2757 	return 0;
2758 }
2759 
2760 static struct notifier_block kvmclock_cpufreq_notifier_block = {
2761 	.notifier_call  = kvmclock_cpufreq_notifier
2762 };
2763 
2764 int kvm_arch_init(void *opaque)
2765 {
2766 	int r, cpu;
2767 	struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
2768 
2769 	if (kvm_x86_ops) {
2770 		printk(KERN_ERR "kvm: already loaded the other module\n");
2771 		r = -EEXIST;
2772 		goto out;
2773 	}
2774 
2775 	if (!ops->cpu_has_kvm_support()) {
2776 		printk(KERN_ERR "kvm: no hardware support\n");
2777 		r = -EOPNOTSUPP;
2778 		goto out;
2779 	}
2780 	if (ops->disabled_by_bios()) {
2781 		printk(KERN_ERR "kvm: disabled by bios\n");
2782 		r = -EOPNOTSUPP;
2783 		goto out;
2784 	}
2785 
2786 	r = kvm_mmu_module_init();
2787 	if (r)
2788 		goto out;
2789 
2790 	kvm_init_msr_list();
2791 
2792 	kvm_x86_ops = ops;
2793 	kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
2794 	kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
2795 	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
2796 			PT_DIRTY_MASK, PT64_NX_MASK, 0);
2797 
2798 	for_each_possible_cpu(cpu)
2799 		per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
2800 	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
2801 		tsc_khz_ref = tsc_khz;
2802 		cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
2803 					  CPUFREQ_TRANSITION_NOTIFIER);
2804 	}
2805 
2806 	return 0;
2807 
2808 out:
2809 	return r;
2810 }
2811 
2812 void kvm_arch_exit(void)
2813 {
2814 	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
2815 		cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
2816 					    CPUFREQ_TRANSITION_NOTIFIER);
2817 	kvm_x86_ops = NULL;
2818 	kvm_mmu_module_exit();
2819 }
2820 
2821 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2822 {
2823 	++vcpu->stat.halt_exits;
2824 	KVMTRACE_0D(HLT, vcpu, handler);
2825 	if (irqchip_in_kernel(vcpu->kvm)) {
2826 		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
2827 		return 1;
2828 	} else {
2829 		vcpu->run->exit_reason = KVM_EXIT_HLT;
2830 		return 0;
2831 	}
2832 }
2833 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
2834 
2835 static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
2836 			   unsigned long a1)
2837 {
2838 	if (is_long_mode(vcpu))
2839 		return a0;
2840 	else
2841 		return a0 | ((gpa_t)a1 << 32);
2842 }
2843 
2844 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2845 {
2846 	unsigned long nr, a0, a1, a2, a3, ret;
2847 	int r = 1;
2848 
2849 	nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
2850 	a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
2851 	a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
2852 	a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
2853 	a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
2854 
2855 	KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
2856 
2857 	if (!is_long_mode(vcpu)) {
2858 		nr &= 0xFFFFFFFF;
2859 		a0 &= 0xFFFFFFFF;
2860 		a1 &= 0xFFFFFFFF;
2861 		a2 &= 0xFFFFFFFF;
2862 		a3 &= 0xFFFFFFFF;
2863 	}
2864 
2865 	switch (nr) {
2866 	case KVM_HC_VAPIC_POLL_IRQ:
2867 		ret = 0;
2868 		break;
2869 	case KVM_HC_MMU_OP:
2870 		r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
2871 		break;
2872 	default:
2873 		ret = -KVM_ENOSYS;
2874 		break;
2875 	}
2876 	kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
2877 	++vcpu->stat.hypercalls;
2878 	return r;
2879 }
2880 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
2881 
2882 int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2883 {
2884 	char instruction[3];
2885 	int ret = 0;
2886 	unsigned long rip = kvm_rip_read(vcpu);
2887 
2888 
2889 	/*
2890 	 * Blow out the MMU so that no other VCPU has an active mapping; this
2891 	 * ensures that the updated hypercall appears atomically across all
2892 	 * VCPUs.
2893 	 */
2894 	kvm_mmu_zap_all(vcpu->kvm);
2895 
2896 	kvm_x86_ops->patch_hypercall(vcpu, instruction);
2897 	if (emulator_write_emulated(rip, instruction, 3, vcpu)
2898 	    != X86EMUL_CONTINUE)
2899 		ret = -EFAULT;
2900 
2901 	return ret;
2902 }
2903 
2904 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
2905 {
2906 	return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
2907 }
2908 
2909 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2910 {
2911 	struct descriptor_table dt = { limit, base };
2912 
2913 	kvm_x86_ops->set_gdt(vcpu, &dt);
2914 }
2915 
2916 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
2917 {
2918 	struct descriptor_table dt = { limit, base };
2919 
2920 	kvm_x86_ops->set_idt(vcpu, &dt);
2921 }
2922 
2923 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2924 		   unsigned long *rflags)
2925 {
2926 	kvm_lmsw(vcpu, msw);
2927 	*rflags = kvm_x86_ops->get_rflags(vcpu);
2928 }
2929 
2930 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2931 {
2932 	unsigned long value;
2933 
2934 	kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2935 	switch (cr) {
2936 	case 0:
2937 		value = vcpu->arch.cr0;
2938 		break;
2939 	case 2:
2940 		value = vcpu->arch.cr2;
2941 		break;
2942 	case 3:
2943 		value = vcpu->arch.cr3;
2944 		break;
2945 	case 4:
2946 		value = vcpu->arch.cr4;
2947 		break;
2948 	case 8:
2949 		value = kvm_get_cr8(vcpu);
2950 		break;
2951 	default:
2952 		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
2953 		return 0;
2954 	}
2955 	KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
2956 		    (u32)((u64)value >> 32), handler);
2957 
2958 	return value;
2959 }
2960 
2961 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2962 		     unsigned long *rflags)
2963 {
2964 	KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
2965 		    (u32)((u64)val >> 32), handler);
2966 
2967 	switch (cr) {
2968 	case 0:
2969 		kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
2970 		*rflags = kvm_x86_ops->get_rflags(vcpu);
2971 		break;
2972 	case 2:
2973 		vcpu->arch.cr2 = val;
2974 		break;
2975 	case 3:
2976 		kvm_set_cr3(vcpu, val);
2977 		break;
2978 	case 4:
2979 		kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
2980 		break;
2981 	case 8:
2982 		kvm_set_cr8(vcpu, val & 0xfUL);
2983 		break;
2984 	default:
2985 		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
2986 	}
2987 }
2988 
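/*
 * For stateful CPUID functions: clear the READ_NEXT flag on the current
 * entry and set it on the next entry with the same function number
 * (wrapping around), so that repeated reads cycle through the entries.
 */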
2989 static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
2990 {
2991 	struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i];
2992 	int j, nent = vcpu->arch.cpuid_nent;
2993 
2994 	e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
2995 	/* when no next entry is found, the current entry[i] is reselected */
2996 	for (j = i + 1; ; j = (j + 1) % nent) {
2997 		struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
2998 		if (ej->function == e->function) {
2999 			ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
3000 			return j;
3001 		}
3002 	}
3003 	return 0; /* silence gcc, even though control never reaches here */
3004 }
3005 
3006 /* find an entry with a matching function, a matching index (if needed), and
3007  * that should be read next (if it's stateful) */
3008 static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
3009 	u32 function, u32 index)
3010 {
3011 	if (e->function != function)
3012 		return 0;
3013 	if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
3014 		return 0;
3015 	if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
3016 	    !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
3017 		return 0;
3018 	return 1;
3019 }
3020 
3021 struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
3022 					      u32 function, u32 index)
3023 {
3024 	int i;
3025 	struct kvm_cpuid_entry2 *best = NULL;
3026 
3027 	for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
3028 		struct kvm_cpuid_entry2 *e;
3029 
3030 		e = &vcpu->arch.cpuid_entries[i];
3031 		if (is_matching_cpuid_entry(e, function, index)) {
3032 			if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
3033 				move_to_next_stateful_cpuid_entry(vcpu, i);
3034 			best = e;
3035 			break;
3036 		}
3037 		/*
3038 		 * Both basic or both extended?
3039 		 */
3040 		if (((e->function ^ function) & 0x80000000) == 0)
3041 			if (!best || e->function > best->function)
3042 				best = e;
3043 	}
3044 	return best;
3045 }
3046 
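/*
 * Guest physical address width, taken from CPUID 0x80000008 EAX[7:0];
 * fall back to 36 bits if the leaf is not present.
 */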
3047 int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
3048 {
3049 	struct kvm_cpuid_entry2 *best;
3050 
3051 	best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
3052 	if (best)
3053 		return best->eax & 0xff;
3054 	return 36;
3055 }
3056 
3057 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
3058 {
3059 	u32 function, index;
3060 	struct kvm_cpuid_entry2 *best;
3061 
3062 	function = kvm_register_read(vcpu, VCPU_REGS_RAX);
3063 	index = kvm_register_read(vcpu, VCPU_REGS_RCX);
3064 	kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
3065 	kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
3066 	kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
3067 	kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
3068 	best = kvm_find_cpuid_entry(vcpu, function, index);
3069 	if (best) {
3070 		kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
3071 		kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
3072 		kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
3073 		kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
3074 	}
3075 	kvm_x86_ops->skip_emulated_instruction(vcpu);
3076 	KVMTRACE_5D(CPUID, vcpu, function,
3077 		    (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
3078 		    (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
3079 		    (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
3080 		    (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
3081 }
3082 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
3083 
3084 /*
3085  * Check if userspace requested an interrupt window, and that the
3086  * interrupt window is open.
3087  *
3088  * No need to exit to userspace if we already have an interrupt queued.
3089  */
3090 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
3091 					  struct kvm_run *kvm_run)
3092 {
3093 	return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
3094 		kvm_run->request_interrupt_window &&
3095 		kvm_arch_interrupt_allowed(vcpu));
3096 }
3097 
3098 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
3099 			      struct kvm_run *kvm_run)
3100 {
3101 	kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
3102 	kvm_run->cr8 = kvm_get_cr8(vcpu);
3103 	kvm_run->apic_base = kvm_get_apic_base(vcpu);
3104 	if (irqchip_in_kernel(vcpu->kvm))
3105 		kvm_run->ready_for_interrupt_injection = 1;
3106 	else
3107 		kvm_run->ready_for_interrupt_injection =
3108 			kvm_arch_interrupt_allowed(vcpu) &&
3109 			!kvm_cpu_has_interrupt(vcpu) &&
3110 			!kvm_event_needs_reinjection(vcpu);
3111 }
3112 
3113 static void vapic_enter(struct kvm_vcpu *vcpu)
3114 {
3115 	struct kvm_lapic *apic = vcpu->arch.apic;
3116 	struct page *page;
3117 
3118 	if (!apic || !apic->vapic_addr)
3119 		return;
3120 
3121 	page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3122 
3123 	vcpu->arch.apic->vapic_page = page;
3124 }
3125 
3126 static void vapic_exit(struct kvm_vcpu *vcpu)
3127 {
3128 	struct kvm_lapic *apic = vcpu->arch.apic;
3129 
3130 	if (!apic || !apic->vapic_addr)
3131 		return;
3132 
3133 	down_read(&vcpu->kvm->slots_lock);
3134 	kvm_release_page_dirty(apic->vapic_page);
3135 	mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3136 	up_read(&vcpu->kvm->slots_lock);
3137 }
3138 
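/*
 * Tell the backend the current TPR and the priority class of the highest
 * pending interrupt (or -1 when a vapic page is in use) so it can decide
 * whether CR8/TPR accesses need to be intercepted.
 */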
3139 static void update_cr8_intercept(struct kvm_vcpu *vcpu)
3140 {
3141 	int max_irr, tpr;
3142 
3143 	if (!kvm_x86_ops->update_cr8_intercept)
3144 		return;
3145 
3146 	if (!vcpu->arch.apic->vapic_addr)
3147 		max_irr = kvm_lapic_find_highest_irr(vcpu);
3148 	else
3149 		max_irr = -1;
3150 
3151 	if (max_irr != -1)
3152 		max_irr >>= 4;
3153 
3154 	tpr = kvm_lapic_get_cr8(vcpu);
3155 
3156 	kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
3157 }
3158 
3159 static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3160 {
3161 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3162 		kvm_x86_ops->set_interrupt_shadow(vcpu, 0);
3163 
3164 	/* try to reinject previous events if any */
3165 	if (vcpu->arch.nmi_injected) {
3166 		kvm_x86_ops->set_nmi(vcpu);
3167 		return;
3168 	}
3169 
3170 	if (vcpu->arch.interrupt.pending) {
3171 		kvm_x86_ops->set_irq(vcpu);
3172 		return;
3173 	}
3174 
3175 	/* try to inject new event if pending */
3176 	if (vcpu->arch.nmi_pending) {
3177 		if (kvm_x86_ops->nmi_allowed(vcpu)) {
3178 			vcpu->arch.nmi_pending = false;
3179 			vcpu->arch.nmi_injected = true;
3180 			kvm_x86_ops->set_nmi(vcpu);
3181 		}
3182 	} else if (kvm_cpu_has_interrupt(vcpu)) {
3183 		if (kvm_x86_ops->interrupt_allowed(vcpu)) {
3184 			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
3185 					    false);
3186 			kvm_x86_ops->set_irq(vcpu);
3187 		}
3188 	}
3189 }
3190 
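/*
 * One iteration of the vcpu run loop: service pending requests, inject
 * any queued exception or interrupt, swap in guest debug registers if
 * needed, enter the guest with interrupts disabled, and hand the exit
 * back to kvm_x86_ops->handle_exit().
 */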
3191 static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3192 {
3193 	int r;
3194 	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
3195 		kvm_run->request_interrupt_window;
3196 
3197 	if (vcpu->requests)
3198 		if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
3199 			kvm_mmu_unload(vcpu);
3200 
3201 	r = kvm_mmu_reload(vcpu);
3202 	if (unlikely(r))
3203 		goto out;
3204 
3205 	if (vcpu->requests) {
3206 		if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
3207 			__kvm_migrate_timers(vcpu);
3208 		if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
3209 			kvm_write_guest_time(vcpu);
3210 		if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
3211 			kvm_mmu_sync_roots(vcpu);
3212 		if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
3213 			kvm_x86_ops->tlb_flush(vcpu);
3214 		if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
3215 				       &vcpu->requests)) {
3216 			kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS;
3217 			r = 0;
3218 			goto out;
3219 		}
3220 		if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
3221 			kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
3222 			r = 0;
3223 			goto out;
3224 		}
3225 	}
3226 
3227 	preempt_disable();
3228 
3229 	kvm_x86_ops->prepare_guest_switch(vcpu);
3230 	kvm_load_guest_fpu(vcpu);
3231 
3232 	local_irq_disable();
3233 
3234 	clear_bit(KVM_REQ_KICK, &vcpu->requests);
3235 	smp_mb__after_clear_bit();
3236 
3237 	if (vcpu->requests || need_resched() || signal_pending(current)) {
3238 		local_irq_enable();
3239 		preempt_enable();
3240 		r = 1;
3241 		goto out;
3242 	}
3243 
3244 	if (vcpu->arch.exception.pending)
3245 		__queue_exception(vcpu);
3246 	else
3247 		inject_pending_irq(vcpu, kvm_run);
3248 
3249 	/* enable NMI/IRQ window open exits if needed */
3250 	if (vcpu->arch.nmi_pending)
3251 		kvm_x86_ops->enable_nmi_window(vcpu);
3252 	else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
3253 		kvm_x86_ops->enable_irq_window(vcpu);
3254 
3255 	if (kvm_lapic_enabled(vcpu)) {
3256 		update_cr8_intercept(vcpu);
3257 		kvm_lapic_sync_to_vapic(vcpu);
3258 	}
3259 
3260 	up_read(&vcpu->kvm->slots_lock);
3261 
3262 	kvm_guest_enter();
3263 
3264 	get_debugreg(vcpu->arch.host_dr6, 6);
3265 	get_debugreg(vcpu->arch.host_dr7, 7);
3266 	if (unlikely(vcpu->arch.switch_db_regs)) {
3267 		get_debugreg(vcpu->arch.host_db[0], 0);
3268 		get_debugreg(vcpu->arch.host_db[1], 1);
3269 		get_debugreg(vcpu->arch.host_db[2], 2);
3270 		get_debugreg(vcpu->arch.host_db[3], 3);
3271 
3272 		set_debugreg(0, 7);
3273 		set_debugreg(vcpu->arch.eff_db[0], 0);
3274 		set_debugreg(vcpu->arch.eff_db[1], 1);
3275 		set_debugreg(vcpu->arch.eff_db[2], 2);
3276 		set_debugreg(vcpu->arch.eff_db[3], 3);
3277 	}
3278 
3279 	KVMTRACE_0D(VMENTRY, vcpu, entryexit);
3280 	kvm_x86_ops->run(vcpu, kvm_run);
3281 
3282 	if (unlikely(vcpu->arch.switch_db_regs)) {
3283 		set_debugreg(0, 7);
3284 		set_debugreg(vcpu->arch.host_db[0], 0);
3285 		set_debugreg(vcpu->arch.host_db[1], 1);
3286 		set_debugreg(vcpu->arch.host_db[2], 2);
3287 		set_debugreg(vcpu->arch.host_db[3], 3);
3288 	}
3289 	set_debugreg(vcpu->arch.host_dr6, 6);
3290 	set_debugreg(vcpu->arch.host_dr7, 7);
3291 
3292 	set_bit(KVM_REQ_KICK, &vcpu->requests);
3293 	local_irq_enable();
3294 
3295 	++vcpu->stat.exits;
3296 
3297 	/*
3298 	 * We must have an instruction between local_irq_enable() and
3299 	 * kvm_guest_exit(), so the timer interrupt isn't delayed by
3300 	 * the interrupt shadow.  The stat.exits increment will do nicely.
3301 	 * But we need to prevent reordering, hence this barrier():
3302 	 */
3303 	barrier();
3304 
3305 	kvm_guest_exit();
3306 
3307 	preempt_enable();
3308 
3309 	down_read(&vcpu->kvm->slots_lock);
3310 
3311 	/*
3312 	 * Profile KVM exit RIPs:
3313 	 */
3314 	if (unlikely(prof_on == KVM_PROFILING)) {
3315 		unsigned long rip = kvm_rip_read(vcpu);
3316 		profile_hit(KVM_PROFILING, (void *)rip);
3317 	}
3318 
3319 
3320 	kvm_lapic_sync_from_vapic(vcpu);
3321 
3322 	r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
3323 out:
3324 	return r;
3325 }
3326 
3327 
3328 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3329 {
3330 	int r;
3331 
3332 	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
3333 		pr_debug("vcpu %d received sipi with vector # %x\n",
3334 			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
3335 		kvm_lapic_reset(vcpu);
3336 		r = kvm_arch_vcpu_reset(vcpu);
3337 		if (r)
3338 			return r;
3339 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3340 	}
3341 
3342 	down_read(&vcpu->kvm->slots_lock);
3343 	vapic_enter(vcpu);
3344 
3345 	r = 1;
3346 	while (r > 0) {
3347 		if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
3348 			r = vcpu_enter_guest(vcpu, kvm_run);
3349 		else {
3350 			up_read(&vcpu->kvm->slots_lock);
3351 			kvm_vcpu_block(vcpu);
3352 			down_read(&vcpu->kvm->slots_lock);
3353 			if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
3354 			{
3355 				switch (vcpu->arch.mp_state) {
3356 				case KVM_MP_STATE_HALTED:
3357 					vcpu->arch.mp_state =
3358 						KVM_MP_STATE_RUNNABLE;
3359 				case KVM_MP_STATE_RUNNABLE:
3360 					break;
3361 				case KVM_MP_STATE_SIPI_RECEIVED:
3362 				default:
3363 					r = -EINTR;
3364 					break;
3365 				}
3366 			}
3367 		}
3368 
3369 		if (r <= 0)
3370 			break;
3371 
3372 		clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
3373 		if (kvm_cpu_has_pending_timer(vcpu))
3374 			kvm_inject_pending_timer_irqs(vcpu);
3375 
3376 		if (dm_request_for_irq_injection(vcpu, kvm_run)) {
3377 			r = -EINTR;
3378 			kvm_run->exit_reason = KVM_EXIT_INTR;
3379 			++vcpu->stat.request_irq_exits;
3380 		}
3381 		if (signal_pending(current)) {
3382 			r = -EINTR;
3383 			kvm_run->exit_reason = KVM_EXIT_INTR;
3384 			++vcpu->stat.signal_exits;
3385 		}
3386 		if (need_resched()) {
3387 			up_read(&vcpu->kvm->slots_lock);
3388 			kvm_resched(vcpu);
3389 			down_read(&vcpu->kvm->slots_lock);
3390 		}
3391 	}
3392 
3393 	up_read(&vcpu->kvm->slots_lock);
3394 	post_kvm_run_save(vcpu, kvm_run);
3395 
3396 	vapic_exit(vcpu);
3397 
3398 	return r;
3399 }
3400 
3401 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3402 {
3403 	int r;
3404 	sigset_t sigsaved;
3405 
3406 	vcpu_load(vcpu);
3407 
3408 	if (vcpu->sigset_active)
3409 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
3410 
3411 	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
3412 		kvm_vcpu_block(vcpu);
3413 		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
3414 		r = -EAGAIN;
3415 		goto out;
3416 	}
3417 
3418 	/* re-sync apic's tpr */
3419 	if (!irqchip_in_kernel(vcpu->kvm))
3420 		kvm_set_cr8(vcpu, kvm_run->cr8);
3421 
3422 	if (vcpu->arch.pio.cur_count) {
3423 		r = complete_pio(vcpu);
3424 		if (r)
3425 			goto out;
3426 	}
3427 #ifdef CONFIG_HAS_IOMEM
3428 	if (vcpu->mmio_needed) {
3429 		memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
3430 		vcpu->mmio_read_completed = 1;
3431 		vcpu->mmio_needed = 0;
3432 
3433 		down_read(&vcpu->kvm->slots_lock);
3434 		r = emulate_instruction(vcpu, kvm_run,
3435 					vcpu->arch.mmio_fault_cr2, 0,
3436 					EMULTYPE_NO_DECODE);
3437 		up_read(&vcpu->kvm->slots_lock);
3438 		if (r == EMULATE_DO_MMIO) {
3439 			/*
3440 			 * Read-modify-write.  Back to userspace.
3441 			 */
3442 			r = 0;
3443 			goto out;
3444 		}
3445 	}
3446 #endif
3447 	if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
3448 		kvm_register_write(vcpu, VCPU_REGS_RAX,
3449 				     kvm_run->hypercall.ret);
3450 
3451 	r = __vcpu_run(vcpu, kvm_run);
3452 
3453 out:
3454 	if (vcpu->sigset_active)
3455 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
3456 
3457 	vcpu_put(vcpu);
3458 	return r;
3459 }
3460 
3461 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3462 {
3463 	vcpu_load(vcpu);
3464 
3465 	regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3466 	regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3467 	regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3468 	regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3469 	regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3470 	regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3471 	regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3472 	regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3473 #ifdef CONFIG_X86_64
3474 	regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
3475 	regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
3476 	regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
3477 	regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
3478 	regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
3479 	regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
3480 	regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
3481 	regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
3482 #endif
3483 
3484 	regs->rip = kvm_rip_read(vcpu);
3485 	regs->rflags = kvm_x86_ops->get_rflags(vcpu);
3486 
3487 	/*
3488 	 * Don't leak debug flags in case they were set for guest debugging
3489 	 */
3490 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3491 		regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3492 
3493 	vcpu_put(vcpu);
3494 
3495 	return 0;
3496 }
3497 
3498 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3499 {
3500 	vcpu_load(vcpu);
3501 
3502 	kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
3503 	kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
3504 	kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
3505 	kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
3506 	kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
3507 	kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
3508 	kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
3509 	kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
3510 #ifdef CONFIG_X86_64
3511 	kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
3512 	kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
3513 	kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
3514 	kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
3515 	kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
3516 	kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
3517 	kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
3518 	kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
3519 
3520 #endif
3521 
3522 	kvm_rip_write(vcpu, regs->rip);
3523 	kvm_x86_ops->set_rflags(vcpu, regs->rflags);
3524 
3525 
3526 	vcpu->arch.exception.pending = false;
3527 
3528 	vcpu_put(vcpu);
3529 
3530 	return 0;
3531 }
3532 
3533 void kvm_get_segment(struct kvm_vcpu *vcpu,
3534 		     struct kvm_segment *var, int seg)
3535 {
3536 	kvm_x86_ops->get_segment(vcpu, var, seg);
3537 }
3538 
3539 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3540 {
3541 	struct kvm_segment cs;
3542 
3543 	kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
3544 	*db = cs.db;
3545 	*l = cs.l;
3546 }
3547 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
3548 
3549 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3550 				  struct kvm_sregs *sregs)
3551 {
3552 	struct descriptor_table dt;
3553 
3554 	vcpu_load(vcpu);
3555 
3556 	kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3557 	kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3558 	kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3559 	kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3560 	kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3561 	kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3562 
3563 	kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3564 	kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3565 
3566 	kvm_x86_ops->get_idt(vcpu, &dt);
3567 	sregs->idt.limit = dt.limit;
3568 	sregs->idt.base = dt.base;
3569 	kvm_x86_ops->get_gdt(vcpu, &dt);
3570 	sregs->gdt.limit = dt.limit;
3571 	sregs->gdt.base = dt.base;
3572 
3573 	kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3574 	sregs->cr0 = vcpu->arch.cr0;
3575 	sregs->cr2 = vcpu->arch.cr2;
3576 	sregs->cr3 = vcpu->arch.cr3;
3577 	sregs->cr4 = vcpu->arch.cr4;
3578 	sregs->cr8 = kvm_get_cr8(vcpu);
3579 	sregs->efer = vcpu->arch.shadow_efer;
3580 	sregs->apic_base = kvm_get_apic_base(vcpu);
3581 
3582 	memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
3583 
3584 	if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
3585 		set_bit(vcpu->arch.interrupt.nr,
3586 			(unsigned long *)sregs->interrupt_bitmap);
3587 
3588 	vcpu_put(vcpu);
3589 
3590 	return 0;
3591 }
3592 
3593 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
3594 				    struct kvm_mp_state *mp_state)
3595 {
3596 	vcpu_load(vcpu);
3597 	mp_state->mp_state = vcpu->arch.mp_state;
3598 	vcpu_put(vcpu);
3599 	return 0;
3600 }
3601 
3602 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
3603 				    struct kvm_mp_state *mp_state)
3604 {
3605 	vcpu_load(vcpu);
3606 	vcpu->arch.mp_state = mp_state->mp_state;
3607 	vcpu_put(vcpu);
3608 	return 0;
3609 }
3610 
3611 static void kvm_set_segment(struct kvm_vcpu *vcpu,
3612 			struct kvm_segment *var, int seg)
3613 {
3614 	kvm_x86_ops->set_segment(vcpu, var, seg);
3615 }
3616 
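/*
 * Unpack an x86 descriptor-table entry into struct kvm_segment, expanding
 * the limit when the granularity bit is set and marking a null selector
 * unusable.
 */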
3617 static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
3618 				   struct kvm_segment *kvm_desct)
3619 {
3620 	kvm_desct->base = seg_desc->base0;
3621 	kvm_desct->base |= seg_desc->base1 << 16;
3622 	kvm_desct->base |= seg_desc->base2 << 24;
3623 	kvm_desct->limit = seg_desc->limit0;
3624 	kvm_desct->limit |= seg_desc->limit << 16;
3625 	if (seg_desc->g) {
3626 		kvm_desct->limit <<= 12;
3627 		kvm_desct->limit |= 0xfff;
3628 	}
3629 	kvm_desct->selector = selector;
3630 	kvm_desct->type = seg_desc->type;
3631 	kvm_desct->present = seg_desc->p;
3632 	kvm_desct->dpl = seg_desc->dpl;
3633 	kvm_desct->db = seg_desc->d;
3634 	kvm_desct->s = seg_desc->s;
3635 	kvm_desct->l = seg_desc->l;
3636 	kvm_desct->g = seg_desc->g;
3637 	kvm_desct->avl = seg_desc->avl;
3638 	if (!selector)
3639 		kvm_desct->unusable = 1;
3640 	else
3641 		kvm_desct->unusable = 0;
3642 	kvm_desct->padding = 0;
3643 }
3644 
3645 static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
3646 					  u16 selector,
3647 					  struct descriptor_table *dtable)
3648 {
3649 	if (selector & 1 << 2) {
3650 		struct kvm_segment kvm_seg;
3651 
3652 		kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
3653 
3654 		if (kvm_seg.unusable)
3655 			dtable->limit = 0;
3656 		else
3657 			dtable->limit = kvm_seg.limit;
3658 		dtable->base = kvm_seg.base;
3659 	}
3660 	else
3661 		kvm_x86_ops->get_gdt(vcpu, dtable);
3662 }
3663 
3664 /* allowed just for 8-byte segment descriptors */
3665 static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3666 					 struct desc_struct *seg_desc)
3667 {
3668 	gpa_t gpa;
3669 	struct descriptor_table dtable;
3670 	u16 index = selector >> 3;
3671 
3672 	get_segment_descriptor_dtable(vcpu, selector, &dtable);
3673 
3674 	if (dtable.limit < index * 8 + 7) {
3675 		kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
3676 		return 1;
3677 	}
3678 	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3679 	gpa += index * 8;
3680 	return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8);
3681 }
3682 
3683 /* allowed just for 8-byte segment descriptors */
3684 static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3685 					 struct desc_struct *seg_desc)
3686 {
3687 	gpa_t gpa;
3688 	struct descriptor_table dtable;
3689 	u16 index = selector >> 3;
3690 
3691 	get_segment_descriptor_dtable(vcpu, selector, &dtable);
3692 
3693 	if (dtable.limit < index * 8 + 7)
3694 		return 1;
3695 	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3696 	gpa += index * 8;
3697 	return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8);
3698 }
3699 
3700 static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
3701 			     struct desc_struct *seg_desc)
3702 {
3703 	u32 base_addr;
3704 
3705 	base_addr = seg_desc->base0;
3706 	base_addr |= (seg_desc->base1 << 16);
3707 	base_addr |= (seg_desc->base2 << 24);
3708 
3709 	return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
3710 }
3711 
3712 static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
3713 {
3714 	struct kvm_segment kvm_seg;
3715 
3716 	kvm_get_segment(vcpu, &kvm_seg, seg);
3717 	return kvm_seg.selector;
3718 }
3719 
3720 static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
3721 						u16 selector,
3722 						struct kvm_segment *kvm_seg)
3723 {
3724 	struct desc_struct seg_desc;
3725 
3726 	if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
3727 		return 1;
3728 	seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
3729 	return 0;
3730 }
3731 
3732 static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
3733 {
3734 	struct kvm_segment segvar = {
3735 		.base = selector << 4,
3736 		.limit = 0xffff,
3737 		.selector = selector,
3738 		.type = 3,
3739 		.present = 1,
3740 		.dpl = 3,
3741 		.db = 0,
3742 		.s = 1,
3743 		.l = 0,
3744 		.g = 0,
3745 		.avl = 0,
3746 		.unusable = 0,
3747 	};
3748 	kvm_x86_ops->set_segment(vcpu, &segvar, seg);
3749 	return 0;
3750 }
3751 
3752 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3753 				int type_bits, int seg)
3754 {
3755 	struct kvm_segment kvm_seg;
3756 
3757 	if (!(vcpu->arch.cr0 & X86_CR0_PE))
3758 		return kvm_load_realmode_segment(vcpu, selector, seg);
3759 	if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
3760 		return 1;
3761 	kvm_seg.type |= type_bits;
3762 
3763 	if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS &&
3764 	    seg != VCPU_SREG_LDTR)
3765 		if (!kvm_seg.s)
3766 			kvm_seg.unusable = 1;
3767 
3768 	kvm_set_segment(vcpu, &kvm_seg, seg);
3769 	return 0;
3770 }
3771 
3772 static void save_state_to_tss32(struct kvm_vcpu *vcpu,
3773 				struct tss_segment_32 *tss)
3774 {
3775 	tss->cr3 = vcpu->arch.cr3;
3776 	tss->eip = kvm_rip_read(vcpu);
3777 	tss->eflags = kvm_x86_ops->get_rflags(vcpu);
3778 	tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3779 	tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3780 	tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3781 	tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3782 	tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3783 	tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3784 	tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3785 	tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3786 	tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3787 	tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3788 	tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
3789 	tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
3790 	tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
3791 	tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
3792 	tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
3793 }
3794 
3795 static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3796 				  struct tss_segment_32 *tss)
3797 {
3798 	kvm_set_cr3(vcpu, tss->cr3);
3799 
3800 	kvm_rip_write(vcpu, tss->eip);
3801 	kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
3802 
3803 	kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
3804 	kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
3805 	kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
3806 	kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
3807 	kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
3808 	kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
3809 	kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
3810 	kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
3811 
3812 	if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
3813 		return 1;
3814 
3815 	if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3816 		return 1;
3817 
3818 	if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3819 		return 1;
3820 
3821 	if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3822 		return 1;
3823 
3824 	if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3825 		return 1;
3826 
3827 	if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
3828 		return 1;
3829 
3830 	if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
3831 		return 1;
3832 	return 0;
3833 }
3834 
3835 static void save_state_to_tss16(struct kvm_vcpu *vcpu,
3836 				struct tss_segment_16 *tss)
3837 {
3838 	tss->ip = kvm_rip_read(vcpu);
3839 	tss->flag = kvm_x86_ops->get_rflags(vcpu);
3840 	tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3841 	tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3842 	tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3843 	tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3844 	tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3845 	tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3846 	tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
3847 	tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
3848 
3849 	tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3850 	tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3851 	tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
3852 	tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
3853 	tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
3854 	tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
3855 }
3856 
3857 static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3858 				 struct tss_segment_16 *tss)
3859 {
3860 	kvm_rip_write(vcpu, tss->ip);
3861 	kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
3862 	kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
3863 	kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
3864 	kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
3865 	kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
3866 	kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
3867 	kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
3868 	kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
3869 	kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
3870 
3871 	if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
3872 		return 1;
3873 
3874 	if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3875 		return 1;
3876 
3877 	if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3878 		return 1;
3879 
3880 	if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3881 		return 1;
3882 
3883 	if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3884 		return 1;
3885 	return 0;
3886 }
3887 
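/*
 * Memory side of a task switch through a 16-bit TSS: save the outgoing
 * state into the old TSS, read the new TSS, write the back link if
 * requested and load the incoming state.  Returns 1 on success, 0 on
 * failure.
 */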
3888 static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3889 			      u16 old_tss_sel, u32 old_tss_base,
3890 			      struct desc_struct *nseg_desc)
3891 {
3892 	struct tss_segment_16 tss_segment_16;
3893 	int ret = 0;
3894 
3895 	if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3896 			   sizeof tss_segment_16))
3897 		goto out;
3898 
3899 	save_state_to_tss16(vcpu, &tss_segment_16);
3900 
3901 	if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3902 			    sizeof tss_segment_16))
3903 		goto out;
3904 
3905 	if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3906 			   &tss_segment_16, sizeof tss_segment_16))
3907 		goto out;
3908 
3909 	if (old_tss_sel != 0xffff) {
3910 		tss_segment_16.prev_task_link = old_tss_sel;
3911 
3912 		if (kvm_write_guest(vcpu->kvm,
3913 				    get_tss_base_addr(vcpu, nseg_desc),
3914 				    &tss_segment_16.prev_task_link,
3915 				    sizeof tss_segment_16.prev_task_link))
3916 			goto out;
3917 	}
3918 
3919 	if (load_state_from_tss16(vcpu, &tss_segment_16))
3920 		goto out;
3921 
3922 	ret = 1;
3923 out:
3924 	return ret;
3925 }
3926 
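/* Same as kvm_task_switch_16(), but for a 32-bit TSS. */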
3927 static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3928 		       u16 old_tss_sel, u32 old_tss_base,
3929 		       struct desc_struct *nseg_desc)
3930 {
3931 	struct tss_segment_32 tss_segment_32;
3932 	int ret = 0;
3933 
3934 	if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3935 			   sizeof tss_segment_32))
3936 		goto out;
3937 
3938 	save_state_to_tss32(vcpu, &tss_segment_32);
3939 
3940 	if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3941 			    sizeof tss_segment_32))
3942 		goto out;
3943 
3944 	if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3945 			   &tss_segment_32, sizeof tss_segment_32))
3946 		goto out;
3947 
3948 	if (old_tss_sel != 0xffff) {
3949 		tss_segment_32.prev_task_link = old_tss_sel;
3950 
3951 		if (kvm_write_guest(vcpu->kvm,
3952 				    get_tss_base_addr(vcpu, nseg_desc),
3953 				    &tss_segment_32.prev_task_link,
3954 				    sizeof tss_segment_32.prev_task_link))
3955 			goto out;
3956 	}
3957 
3958 	if (load_state_from_tss32(vcpu, &tss_segment_32))
3959 		goto out;
3960 
3961 	ret = 1;
3962 out:
3963 	return ret;
3964 }
3965 
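/*
 * Emulate a task switch: validate the new TSS descriptor, update the
 * busy bits and EFLAGS.NT as the switch reason requires, transfer state
 * through the old and new TSS images, then load TR and set CR0.TS.
 */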
3966 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3967 {
3968 	struct kvm_segment tr_seg;
3969 	struct desc_struct cseg_desc;
3970 	struct desc_struct nseg_desc;
3971 	int ret = 0;
3972 	u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
3973 	u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
3974 
3975 	old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
3976 
3977 	/* FIXME: Handle errors. Failure to read either the old or the new
3978 	 * TSS, or their descriptors, should generate a page fault.
3979 	 */
3980 	if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
3981 		goto out;
3982 
3983 	if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
3984 		goto out;
3985 
3986 	if (reason != TASK_SWITCH_IRET) {
3987 		int cpl;
3988 
3989 		cpl = kvm_x86_ops->get_cpl(vcpu);
3990 		if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
3991 			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
3992 			return 1;
3993 		}
3994 	}
3995 
3996 	if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) {
3997 		kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
3998 		return 1;
3999 	}
4000 
4001 	if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
4002 		cseg_desc.type &= ~(1 << 1); /* clear the B flag */
4003 		save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
4004 	}
4005 
4006 	if (reason == TASK_SWITCH_IRET) {
4007 		u32 eflags = kvm_x86_ops->get_rflags(vcpu);
4008 		kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
4009 	}
4010 
4011 	/* set back link to prev task only if NT bit is set in eflags;
4012 	   note that old_tss_sel is not used after this point */
4013 	if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4014 		old_tss_sel = 0xffff;
4020 
4021 	if (nseg_desc.type & 8)
4022 		ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
4023 					 old_tss_base, &nseg_desc);
4024 	else
4025 		ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
4026 					 old_tss_base, &nseg_desc);
4027 
4028 	if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
4029 		u32 eflags = kvm_x86_ops->get_rflags(vcpu);
4030 		kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT);
4031 	}
4032 
4033 	if (reason != TASK_SWITCH_IRET) {
4034 		nseg_desc.type |= (1 << 1);
4035 		save_guest_segment_descriptor(vcpu, tss_selector,
4036 					      &nseg_desc);
4037 	}
4038 
4039 	kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
4040 	seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
4041 	tr_seg.type = 11;
4042 	kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
4043 out:
4044 	return ret;
4045 }
4046 EXPORT_SYMBOL_GPL(kvm_task_switch);
4047 
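/*
 * Load the special registers supplied by userspace (KVM_SET_SREGS):
 * descriptor tables, control registers, EFER, APIC base and segment
 * registers.  Resets the MMU context and re-injects a pending interrupt
 * when necessary.
 */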
4048 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4049 				  struct kvm_sregs *sregs)
4050 {
4051 	int mmu_reset_needed = 0;
4052 	int pending_vec, max_bits;
4053 	struct descriptor_table dt;
4054 
4055 	vcpu_load(vcpu);
4056 
4057 	dt.limit = sregs->idt.limit;
4058 	dt.base = sregs->idt.base;
4059 	kvm_x86_ops->set_idt(vcpu, &dt);
4060 	dt.limit = sregs->gdt.limit;
4061 	dt.base = sregs->gdt.base;
4062 	kvm_x86_ops->set_gdt(vcpu, &dt);
4063 
4064 	vcpu->arch.cr2 = sregs->cr2;
4065 	mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
4066 
4067 	down_read(&vcpu->kvm->slots_lock);
4068 	if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT))
4069 		vcpu->arch.cr3 = sregs->cr3;
4070 	else
4071 		set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
4072 	up_read(&vcpu->kvm->slots_lock);
4073 
4074 	kvm_set_cr8(vcpu, sregs->cr8);
4075 
4076 	mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
4077 	kvm_x86_ops->set_efer(vcpu, sregs->efer);
4078 	kvm_set_apic_base(vcpu, sregs->apic_base);
4079 
4080 	kvm_x86_ops->decache_cr4_guest_bits(vcpu);
4081 
4082 	mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
4083 	kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
4084 	vcpu->arch.cr0 = sregs->cr0;
4085 
4086 	mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4;
4087 	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
4088 	if (!is_long_mode(vcpu) && is_pae(vcpu))
4089 		load_pdptrs(vcpu, vcpu->arch.cr3);
4090 
4091 	if (mmu_reset_needed)
4092 		kvm_mmu_reset_context(vcpu);
4093 
4094 	max_bits = (sizeof sregs->interrupt_bitmap) << 3;
4095 	pending_vec = find_first_bit(
4096 		(const unsigned long *)sregs->interrupt_bitmap, max_bits);
4097 	if (pending_vec < max_bits) {
4098 		kvm_queue_interrupt(vcpu, pending_vec, false);
4099 		pr_debug("Set back pending irq %d\n", pending_vec);
4100 		if (irqchip_in_kernel(vcpu->kvm))
4101 			kvm_pic_clear_isr_ack(vcpu->kvm);
4102 	}
4103 
4104 	kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
4105 	kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
4106 	kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
4107 	kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
4108 	kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
4109 	kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
4110 
4111 	kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
4112 	kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
4113 
4114 	/* Older userspace won't unhalt the vcpu on reset. */
4115 	if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
4116 	    sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
4117 	    !(vcpu->arch.cr0 & X86_CR0_PE))
4118 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4119 
4120 	vcpu_put(vcpu);
4121 
4122 	return 0;
4123 }
4124 
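/*
 * Configure guest debugging (KVM_SET_GUEST_DEBUG): select the effective
 * debug registers, pass the control flags down to the vendor module and
 * optionally inject a #DB or #BP exception.
 */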
4125 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4126 					struct kvm_guest_debug *dbg)
4127 {
4128 	int i, r;
4129 
4130 	vcpu_load(vcpu);
4131 
4132 	if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) ==
4133 	    (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) {
4134 		for (i = 0; i < KVM_NR_DB_REGS; ++i)
4135 			vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
4136 		vcpu->arch.switch_db_regs =
4137 			(dbg->arch.debugreg[7] & DR7_BP_EN_MASK);
4138 	} else {
4139 		for (i = 0; i < KVM_NR_DB_REGS; i++)
4140 			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
4141 		vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
4142 	}
4143 
4144 	r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
4145 
4146 	if (dbg->control & KVM_GUESTDBG_INJECT_DB)
4147 		kvm_queue_exception(vcpu, DB_VECTOR);
4148 	else if (dbg->control & KVM_GUESTDBG_INJECT_BP)
4149 		kvm_queue_exception(vcpu, BP_VECTOR);
4150 
4151 	vcpu_put(vcpu);
4152 
4153 	return r;
4154 }
4155 
4156 /*
4157  * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
4158  * we have asm/x86/processor.h
4159  */
4160 struct fxsave {
4161 	u16	cwd;
4162 	u16	swd;
4163 	u16	twd;
4164 	u16	fop;
4165 	u64	rip;
4166 	u64	rdp;
4167 	u32	mxcsr;
4168 	u32	mxcsr_mask;
4169 	u32	st_space[32];	/* 8*16 bytes for each FP-reg = 128 bytes */
4170 #ifdef CONFIG_X86_64
4171 	u32	xmm_space[64];	/* 16*16 bytes for each XMM-reg = 256 bytes */
4172 #else
4173 	u32	xmm_space[32];	/* 8*16 bytes for each XMM-reg = 128 bytes */
4174 #endif
4175 };
4176 
4177 /*
4178  * Translate a guest virtual address to a guest physical address.
4179  */
4180 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
4181 				    struct kvm_translation *tr)
4182 {
4183 	unsigned long vaddr = tr->linear_address;
4184 	gpa_t gpa;
4185 
4186 	vcpu_load(vcpu);
4187 	down_read(&vcpu->kvm->slots_lock);
4188 	gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr);
4189 	up_read(&vcpu->kvm->slots_lock);
4190 	tr->physical_address = gpa;
4191 	tr->valid = gpa != UNMAPPED_GVA;
4192 	tr->writeable = 1;
4193 	tr->usermode = 0;
4194 	vcpu_put(vcpu);
4195 
4196 	return 0;
4197 }
4198 
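/* Copy the guest fxsave image into the userspace kvm_fpu structure. */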
4199 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
4200 {
4201 	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
4202 
4203 	vcpu_load(vcpu);
4204 
4205 	memcpy(fpu->fpr, fxsave->st_space, 128);
4206 	fpu->fcw = fxsave->cwd;
4207 	fpu->fsw = fxsave->swd;
4208 	fpu->ftwx = fxsave->twd;
4209 	fpu->last_opcode = fxsave->fop;
4210 	fpu->last_ip = fxsave->rip;
4211 	fpu->last_dp = fxsave->rdp;
4212 	memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
4213 
4214 	vcpu_put(vcpu);
4215 
4216 	return 0;
4217 }
4218 
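/* Load the guest fxsave image from the userspace kvm_fpu structure. */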
4219 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
4220 {
4221 	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
4222 
4223 	vcpu_load(vcpu);
4224 
4225 	memcpy(fxsave->st_space, fpu->fpr, 128);
4226 	fxsave->cwd = fpu->fcw;
4227 	fxsave->swd = fpu->fsw;
4228 	fxsave->twd = fpu->ftwx;
4229 	fxsave->fop = fpu->last_opcode;
4230 	fxsave->rip = fpu->last_ip;
4231 	fxsave->rdp = fpu->last_dp;
4232 	memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
4233 
4234 	vcpu_put(vcpu);
4235 
4236 	return 0;
4237 }
4238 
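/*
 * Initialize the guest FPU image: reset the FPU state with finit, save
 * it into the guest image, restore the host state and set the default
 * MXCSR value.
 */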
4239 void fx_init(struct kvm_vcpu *vcpu)
4240 {
4241 	unsigned after_mxcsr_mask;
4242 
4243 	/*
4244 	 * Touch the fpu for the first time in a non-atomic context: if this
4245 	 * is the first fpu instruction, the exception handler will fire
4246 	 * before the instruction returns, and it will have to allocate ram
4247 	 * with GFP_KERNEL.
4248 	 */
4249 	if (!used_math())
4250 		kvm_fx_save(&vcpu->arch.host_fx_image);
4251 
4252 	/* Initialize guest FPU by resetting ours and saving into guest's */
4253 	preempt_disable();
4254 	kvm_fx_save(&vcpu->arch.host_fx_image);
4255 	kvm_fx_finit();
4256 	kvm_fx_save(&vcpu->arch.guest_fx_image);
4257 	kvm_fx_restore(&vcpu->arch.host_fx_image);
4258 	preempt_enable();
4259 
4260 	vcpu->arch.cr0 |= X86_CR0_ET;
4261 	after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
4262 	vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
4263 	memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
4264 	       0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
4265 }
4266 EXPORT_SYMBOL_GPL(fx_init);
4267 
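/* Save the host FPU state and make the guest FPU state live. */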
4268 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
4269 {
4270 	if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
4271 		return;
4272 
4273 	vcpu->guest_fpu_loaded = 1;
4274 	kvm_fx_save(&vcpu->arch.host_fx_image);
4275 	kvm_fx_restore(&vcpu->arch.guest_fx_image);
4276 }
4277 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
4278 
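/* Save the guest FPU state back into its image and reload the host state. */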
4279 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
4280 {
4281 	if (!vcpu->guest_fpu_loaded)
4282 		return;
4283 
4284 	vcpu->guest_fpu_loaded = 0;
4285 	kvm_fx_save(&vcpu->arch.guest_fx_image);
4286 	kvm_fx_restore(&vcpu->arch.host_fx_image);
4287 	++vcpu->stat.fpu_reload;
4288 }
4289 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
4290 
4291 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
4292 {
4293 	if (vcpu->arch.time_page) {
4294 		kvm_release_page_dirty(vcpu->arch.time_page);
4295 		vcpu->arch.time_page = NULL;
4296 	}
4297 
4298 	kvm_x86_ops->vcpu_free(vcpu);
4299 }
4300 
4301 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
4302 						unsigned int id)
4303 {
4304 	return kvm_x86_ops->vcpu_create(kvm, id);
4305 }
4306 
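/* Second-stage vcpu setup: reset the vcpu and set up its MMU. */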
4307 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
4308 {
4309 	int r;
4310 
4311 	/* We do fxsave: this must be aligned. */
4312 	BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
4313 
4314 	vcpu->arch.mtrr_state.have_fixed = 1;
4315 	vcpu_load(vcpu);
4316 	r = kvm_arch_vcpu_reset(vcpu);
4317 	if (r == 0)
4318 		r = kvm_mmu_setup(vcpu);
4319 	vcpu_put(vcpu);
4320 	if (r < 0)
4321 		goto free_vcpu;
4322 
4323 	return 0;
4324 free_vcpu:
4325 	kvm_x86_ops->vcpu_free(vcpu);
4326 	return r;
4327 }
4328 
4329 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
4330 {
4331 	vcpu_load(vcpu);
4332 	kvm_mmu_unload(vcpu);
4333 	vcpu_put(vcpu);
4334 
4335 	kvm_x86_ops->vcpu_free(vcpu);
4336 }
4337 
4338 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
4339 {
4340 	vcpu->arch.nmi_pending = false;
4341 	vcpu->arch.nmi_injected = false;
4342 
4343 	vcpu->arch.switch_db_regs = 0;
4344 	memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
4345 	vcpu->arch.dr6 = DR6_FIXED_1;
4346 	vcpu->arch.dr7 = DR7_FIXED_1;
4347 
4348 	return kvm_x86_ops->vcpu_reset(vcpu);
4349 }
4350 
4351 void kvm_arch_hardware_enable(void *garbage)
4352 {
4353 	kvm_x86_ops->hardware_enable(garbage);
4354 }
4355 
4356 void kvm_arch_hardware_disable(void *garbage)
4357 {
4358 	kvm_x86_ops->hardware_disable(garbage);
4359 }
4360 
4361 int kvm_arch_hardware_setup(void)
4362 {
4363 	return kvm_x86_ops->hardware_setup();
4364 }
4365 
4366 void kvm_arch_hardware_unsetup(void)
4367 {
4368 	kvm_x86_ops->hardware_unsetup();
4369 }
4370 
4371 void kvm_arch_check_processor_compat(void *rtn)
4372 {
4373 	kvm_x86_ops->check_processor_compatibility(rtn);
4374 }
4375 
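/*
 * Arch-specific vcpu construction: allocate the pio scratch page, create
 * the MMU and, if the interrupt controller is emulated in the kernel,
 * the local APIC.
 */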
4376 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
4377 {
4378 	struct page *page;
4379 	struct kvm *kvm;
4380 	int r;
4381 
4382 	BUG_ON(vcpu->kvm == NULL);
4383 	kvm = vcpu->kvm;
4384 
4385 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
4386 	if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
4387 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4388 	else
4389 		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
4390 
4391 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
4392 	if (!page) {
4393 		r = -ENOMEM;
4394 		goto fail;
4395 	}
4396 	vcpu->arch.pio_data = page_address(page);
4397 
4398 	r = kvm_mmu_create(vcpu);
4399 	if (r < 0)
4400 		goto fail_free_pio_data;
4401 
4402 	if (irqchip_in_kernel(kvm)) {
4403 		r = kvm_create_lapic(vcpu);
4404 		if (r < 0)
4405 			goto fail_mmu_destroy;
4406 	}
4407 
4408 	return 0;
4409 
4410 fail_mmu_destroy:
4411 	kvm_mmu_destroy(vcpu);
4412 fail_free_pio_data:
4413 	free_page((unsigned long)vcpu->arch.pio_data);
4414 fail:
4415 	return r;
4416 }
4417 
4418 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
4419 {
4420 	kvm_free_lapic(vcpu);
4421 	down_read(&vcpu->kvm->slots_lock);
4422 	kvm_mmu_destroy(vcpu);
4423 	up_read(&vcpu->kvm->slots_lock);
4424 	free_page((unsigned long)vcpu->arch.pio_data);
4425 }
4426 
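/*
 * Allocate the arch-specific part of a new VM and initialize its MMU and
 * assigned-device lists, the userspace irq source id and the VM init TSC.
 */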
4427 struct  kvm *kvm_arch_create_vm(void)
4428 {
4429 	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
4430 
4431 	if (!kvm)
4432 		return ERR_PTR(-ENOMEM);
4433 
4434 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4435 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
4436 
4437 	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
4438 	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
4439 
4440 	rdtscll(kvm->arch.vm_init_tsc);
4441 
4442 	return kvm;
4443 }
4444 
4445 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
4446 {
4447 	vcpu_load(vcpu);
4448 	kvm_mmu_unload(vcpu);
4449 	vcpu_put(vcpu);
4450 }
4451 
4452 static void kvm_free_vcpus(struct kvm *kvm)
4453 {
4454 	unsigned int i;
4455 
4456 	/*
4457 	 * Unpin any mmu pages first.
4458 	 */
4459 	for (i = 0; i < KVM_MAX_VCPUS; ++i)
4460 		if (kvm->vcpus[i])
4461 			kvm_unload_vcpu_mmu(kvm->vcpus[i]);
4462 	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
4463 		if (kvm->vcpus[i]) {
4464 			kvm_arch_vcpu_free(kvm->vcpus[i]);
4465 			kvm->vcpus[i] = NULL;
4466 		}
4467 	}
4468 
4469 }
4470 
4471 void kvm_arch_sync_events(struct kvm *kvm)
4472 {
4473 	kvm_free_all_assigned_devices(kvm);
4474 }
4475 
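/*
 * Tear down a VM: release the iommu mapping, the in-kernel PIT, PIC and
 * IOAPIC, all vcpus and the guest memory slots.
 */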
4476 void kvm_arch_destroy_vm(struct kvm *kvm)
4477 {
4478 	kvm_iommu_unmap_guest(kvm);
4479 	kvm_free_pit(kvm);
4480 	kfree(kvm->arch.vpic);
4481 	kfree(kvm->arch.vioapic);
4482 	kvm_free_vcpus(kvm);
4483 	kvm_free_physmem(kvm);
4484 	if (kvm->arch.apic_access_page)
4485 		put_page(kvm->arch.apic_access_page);
4486 	if (kvm->arch.ept_identity_pagetable)
4487 		put_page(kvm->arch.ept_identity_pagetable);
4488 	kfree(kvm);
4489 }
4490 
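/*
 * Arch hook for memory slot updates: map (or unmap) anonymous memory for
 * legacy !user_alloc slots, recompute the shadow page limit and remove
 * write access from the slot's shadow ptes.
 */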
4491 int kvm_arch_set_memory_region(struct kvm *kvm,
4492 				struct kvm_userspace_memory_region *mem,
4493 				struct kvm_memory_slot old,
4494 				int user_alloc)
4495 {
4496 	int npages = mem->memory_size >> PAGE_SHIFT;
4497 	struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
4498 
4499 	/* To keep backward compatibility with older userspace,
4500 	 * x86 needs to handle the !user_alloc case.
4501 	 */
4502 	if (!user_alloc) {
4503 		if (npages && !old.rmap) {
4504 			unsigned long userspace_addr;
4505 
4506 			down_write(&current->mm->mmap_sem);
4507 			userspace_addr = do_mmap(NULL, 0,
4508 						 npages * PAGE_SIZE,
4509 						 PROT_READ | PROT_WRITE,
4510 						 MAP_PRIVATE | MAP_ANONYMOUS,
4511 						 0);
4512 			up_write(&current->mm->mmap_sem);
4513 
4514 			if (IS_ERR((void *)userspace_addr))
4515 				return PTR_ERR((void *)userspace_addr);
4516 
4517 			/* set userspace_addr atomically for kvm_hva_to_rmapp */
4518 			spin_lock(&kvm->mmu_lock);
4519 			memslot->userspace_addr = userspace_addr;
4520 			spin_unlock(&kvm->mmu_lock);
4521 		} else {
4522 			if (!old.user_alloc && old.rmap) {
4523 				int ret;
4524 
4525 				down_write(&current->mm->mmap_sem);
4526 				ret = do_munmap(current->mm, old.userspace_addr,
4527 						old.npages * PAGE_SIZE);
4528 				up_write(&current->mm->mmap_sem);
4529 				if (ret < 0)
4530 					printk(KERN_WARNING
4531 				       "kvm_vm_ioctl_set_memory_region: "
4532 				       "failed to munmap memory\n");
4533 			}
4534 		}
4535 	}
4536 
4537 	spin_lock(&kvm->mmu_lock);
4538 	if (!kvm->arch.n_requested_mmu_pages) {
4539 		unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
4540 		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
4541 	}
4542 
4543 	kvm_mmu_slot_remove_write_access(kvm, mem->slot);
4544 	spin_unlock(&kvm->mmu_lock);
4545 	kvm_flush_remote_tlbs(kvm);
4546 
4547 	return 0;
4548 }
4549 
4550 void kvm_arch_flush_shadow(struct kvm *kvm)
4551 {
4552 	kvm_mmu_zap_all(kvm);
4553 	kvm_reload_remote_mmus(kvm);
4554 }
4555 
4556 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4557 {
4558 	return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
4559 	       || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
4560 	       || vcpu->arch.nmi_pending;
4561 }
4562 
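/*
 * Wake up a halted vcpu and, if it is running in guest mode on another
 * cpu, send a reschedule IPI to force an exit.
 */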
4563 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
4564 {
4565 	int me;
4566 	int cpu = vcpu->cpu;
4567 
4568 	if (waitqueue_active(&vcpu->wq)) {
4569 		wake_up_interruptible(&vcpu->wq);
4570 		++vcpu->stat.halt_wakeup;
4571 	}
4572 
4573 	me = get_cpu();
4574 	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
4575 		if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
4576 			smp_send_reschedule(cpu);
4577 	put_cpu();
4578 }
4579 
4580 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
4581 {
4582 	return kvm_x86_ops->interrupt_allowed(vcpu);
4583 }
4584