xref: /openbmc/linux/arch/x86/kvm/x86.c (revision c819e2cf)
1  /*
2   * Kernel-based Virtual Machine driver for Linux
3   *
4   * derived from drivers/kvm/kvm_main.c
5   *
6   * Copyright (C) 2006 Qumranet, Inc.
7   * Copyright (C) 2008 Qumranet, Inc.
8   * Copyright IBM Corporation, 2008
9   * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10   *
11   * Authors:
12   *   Avi Kivity   <avi@qumranet.com>
13   *   Yaniv Kamay  <yaniv@qumranet.com>
14   *   Amit Shah    <amit.shah@qumranet.com>
15   *   Ben-Ami Yassour <benami@il.ibm.com>
16   *
17   * This work is licensed under the terms of the GNU GPL, version 2.  See
18   * the COPYING file in the top-level directory.
19   *
20   */
21  
22  #include <linux/kvm_host.h>
23  #include "irq.h"
24  #include "mmu.h"
25  #include "i8254.h"
26  #include "tss.h"
27  #include "kvm_cache_regs.h"
28  #include "x86.h"
29  #include "cpuid.h"
30  #include "assigned-dev.h"
31  
32  #include <linux/clocksource.h>
33  #include <linux/interrupt.h>
34  #include <linux/kvm.h>
35  #include <linux/fs.h>
36  #include <linux/vmalloc.h>
37  #include <linux/module.h>
38  #include <linux/mman.h>
39  #include <linux/highmem.h>
40  #include <linux/iommu.h>
41  #include <linux/intel-iommu.h>
42  #include <linux/cpufreq.h>
43  #include <linux/user-return-notifier.h>
44  #include <linux/srcu.h>
45  #include <linux/slab.h>
46  #include <linux/perf_event.h>
47  #include <linux/uaccess.h>
48  #include <linux/hash.h>
49  #include <linux/pci.h>
50  #include <linux/timekeeper_internal.h>
51  #include <linux/pvclock_gtod.h>
52  #include <trace/events/kvm.h>
53  
54  #define CREATE_TRACE_POINTS
55  #include "trace.h"
56  
57  #include <asm/debugreg.h>
58  #include <asm/msr.h>
59  #include <asm/desc.h>
60  #include <asm/mtrr.h>
61  #include <asm/mce.h>
62  #include <asm/i387.h>
63  #include <asm/fpu-internal.h> /* Ugh! */
64  #include <asm/xcr.h>
65  #include <asm/pvclock.h>
66  #include <asm/div64.h>
67  
68  #define MAX_IO_MSRS 256
69  #define KVM_MAX_MCE_BANKS 32
70  #define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
71  
72  #define emul_to_vcpu(ctxt) \
73  	container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
74  
75  /* EFER defaults:
76   * - enable syscall per default because its emulated by KVM
77   * - enable LME and LMA per default on 64 bit KVM
78   */
79  #ifdef CONFIG_X86_64
80  static
81  u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
82  #else
83  static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
84  #endif
85  
86  #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
87  #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
88  
89  static void update_cr8_intercept(struct kvm_vcpu *vcpu);
90  static void process_nmi(struct kvm_vcpu *vcpu);
91  static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
92  
93  struct kvm_x86_ops *kvm_x86_ops;
94  EXPORT_SYMBOL_GPL(kvm_x86_ops);
95  
96  static bool ignore_msrs = 0;
97  module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
98  
99  unsigned int min_timer_period_us = 500;
100  module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
101  
102  bool kvm_has_tsc_control;
103  EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
104  u32  kvm_max_guest_tsc_khz;
105  EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
106  
107  /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
108  static u32 tsc_tolerance_ppm = 250;
109  module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
110  
111  static bool backwards_tsc_observed = false;
112  
113  #define KVM_NR_SHARED_MSRS 16
114  
115  struct kvm_shared_msrs_global {
116  	int nr;
117  	u32 msrs[KVM_NR_SHARED_MSRS];
118  };
119  
120  struct kvm_shared_msrs {
121  	struct user_return_notifier urn;
122  	bool registered;
123  	struct kvm_shared_msr_values {
124  		u64 host;
125  		u64 curr;
126  	} values[KVM_NR_SHARED_MSRS];
127  };
128  
129  static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
130  static struct kvm_shared_msrs __percpu *shared_msrs;
131  
132  struct kvm_stats_debugfs_item debugfs_entries[] = {
133  	{ "pf_fixed", VCPU_STAT(pf_fixed) },
134  	{ "pf_guest", VCPU_STAT(pf_guest) },
135  	{ "tlb_flush", VCPU_STAT(tlb_flush) },
136  	{ "invlpg", VCPU_STAT(invlpg) },
137  	{ "exits", VCPU_STAT(exits) },
138  	{ "io_exits", VCPU_STAT(io_exits) },
139  	{ "mmio_exits", VCPU_STAT(mmio_exits) },
140  	{ "signal_exits", VCPU_STAT(signal_exits) },
141  	{ "irq_window", VCPU_STAT(irq_window_exits) },
142  	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
143  	{ "halt_exits", VCPU_STAT(halt_exits) },
144  	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
145  	{ "hypercalls", VCPU_STAT(hypercalls) },
146  	{ "request_irq", VCPU_STAT(request_irq_exits) },
147  	{ "irq_exits", VCPU_STAT(irq_exits) },
148  	{ "host_state_reload", VCPU_STAT(host_state_reload) },
149  	{ "efer_reload", VCPU_STAT(efer_reload) },
150  	{ "fpu_reload", VCPU_STAT(fpu_reload) },
151  	{ "insn_emulation", VCPU_STAT(insn_emulation) },
152  	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
153  	{ "irq_injections", VCPU_STAT(irq_injections) },
154  	{ "nmi_injections", VCPU_STAT(nmi_injections) },
155  	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
156  	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
157  	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
158  	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
159  	{ "mmu_flooded", VM_STAT(mmu_flooded) },
160  	{ "mmu_recycled", VM_STAT(mmu_recycled) },
161  	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
162  	{ "mmu_unsync", VM_STAT(mmu_unsync) },
163  	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
164  	{ "largepages", VM_STAT(lpages) },
165  	{ NULL }
166  };
167  
168  u64 __read_mostly host_xcr0;
169  
170  static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
171  
172  static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
173  {
174  	int i;
175  	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
176  		vcpu->arch.apf.gfns[i] = ~0;
177  }
178  
179  static void kvm_on_user_return(struct user_return_notifier *urn)
180  {
181  	unsigned slot;
182  	struct kvm_shared_msrs *locals
183  		= container_of(urn, struct kvm_shared_msrs, urn);
184  	struct kvm_shared_msr_values *values;
185  
186  	for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
187  		values = &locals->values[slot];
188  		if (values->host != values->curr) {
189  			wrmsrl(shared_msrs_global.msrs[slot], values->host);
190  			values->curr = values->host;
191  		}
192  	}
193  	locals->registered = false;
194  	user_return_notifier_unregister(urn);
195  }
196  
197  static void shared_msr_update(unsigned slot, u32 msr)
198  {
199  	u64 value;
200  	unsigned int cpu = smp_processor_id();
201  	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
202  
203  	/* only read, and nobody should modify it at this time,
204  	 * so don't need lock */
205  	if (slot >= shared_msrs_global.nr) {
206  		printk(KERN_ERR "kvm: invalid MSR slot!");
207  		return;
208  	}
209  	rdmsrl_safe(msr, &value);
210  	smsr->values[slot].host = value;
211  	smsr->values[slot].curr = value;
212  }
213  
214  void kvm_define_shared_msr(unsigned slot, u32 msr)
215  {
216  	BUG_ON(slot >= KVM_NR_SHARED_MSRS);
217  	if (slot >= shared_msrs_global.nr)
218  		shared_msrs_global.nr = slot + 1;
219  	shared_msrs_global.msrs[slot] = msr;
220  	/* we need ensured the shared_msr_global have been updated */
221  	smp_wmb();
222  }
223  EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
224  
225  static void kvm_shared_msr_cpu_online(void)
226  {
227  	unsigned i;
228  
229  	for (i = 0; i < shared_msrs_global.nr; ++i)
230  		shared_msr_update(i, shared_msrs_global.msrs[i]);
231  }
232  
233  int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
234  {
235  	unsigned int cpu = smp_processor_id();
236  	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
237  	int err;
238  
239  	if (((value ^ smsr->values[slot].curr) & mask) == 0)
240  		return 0;
241  	smsr->values[slot].curr = value;
242  	err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
243  	if (err)
244  		return 1;
245  
246  	if (!smsr->registered) {
247  		smsr->urn.on_user_return = kvm_on_user_return;
248  		user_return_notifier_register(&smsr->urn);
249  		smsr->registered = true;
250  	}
251  	return 0;
252  }
253  EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
254  
255  static void drop_user_return_notifiers(void)
256  {
257  	unsigned int cpu = smp_processor_id();
258  	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
259  
260  	if (smsr->registered)
261  		kvm_on_user_return(&smsr->urn);
262  }
263  
264  u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
265  {
266  	return vcpu->arch.apic_base;
267  }
268  EXPORT_SYMBOL_GPL(kvm_get_apic_base);
269  
270  int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
271  {
272  	u64 old_state = vcpu->arch.apic_base &
273  		(MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
274  	u64 new_state = msr_info->data &
275  		(MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
276  	u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) |
277  		0x2ff | (guest_cpuid_has_x2apic(vcpu) ? 0 : X2APIC_ENABLE);
278  
279  	if (!msr_info->host_initiated &&
280  	    ((msr_info->data & reserved_bits) != 0 ||
281  	     new_state == X2APIC_ENABLE ||
282  	     (new_state == MSR_IA32_APICBASE_ENABLE &&
283  	      old_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) ||
284  	     (new_state == (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE) &&
285  	      old_state == 0)))
286  		return 1;
287  
288  	kvm_lapic_set_base(vcpu, msr_info->data);
289  	return 0;
290  }
291  EXPORT_SYMBOL_GPL(kvm_set_apic_base);
292  
293  asmlinkage __visible void kvm_spurious_fault(void)
294  {
295  	/* Fault while not rebooting.  We want the trace. */
296  	BUG();
297  }
298  EXPORT_SYMBOL_GPL(kvm_spurious_fault);
299  
300  #define EXCPT_BENIGN		0
301  #define EXCPT_CONTRIBUTORY	1
302  #define EXCPT_PF		2
303  
304  static int exception_class(int vector)
305  {
306  	switch (vector) {
307  	case PF_VECTOR:
308  		return EXCPT_PF;
309  	case DE_VECTOR:
310  	case TS_VECTOR:
311  	case NP_VECTOR:
312  	case SS_VECTOR:
313  	case GP_VECTOR:
314  		return EXCPT_CONTRIBUTORY;
315  	default:
316  		break;
317  	}
318  	return EXCPT_BENIGN;
319  }
320  
321  #define EXCPT_FAULT		0
322  #define EXCPT_TRAP		1
323  #define EXCPT_ABORT		2
324  #define EXCPT_INTERRUPT		3
325  
326  static int exception_type(int vector)
327  {
328  	unsigned int mask;
329  
330  	if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
331  		return EXCPT_INTERRUPT;
332  
333  	mask = 1 << vector;
334  
335  	/* #DB is trap, as instruction watchpoints are handled elsewhere */
336  	if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
337  		return EXCPT_TRAP;
338  
339  	if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
340  		return EXCPT_ABORT;
341  
342  	/* Reserved exceptions will result in fault */
343  	return EXCPT_FAULT;
344  }
345  
346  static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
347  		unsigned nr, bool has_error, u32 error_code,
348  		bool reinject)
349  {
350  	u32 prev_nr;
351  	int class1, class2;
352  
353  	kvm_make_request(KVM_REQ_EVENT, vcpu);
354  
355  	if (!vcpu->arch.exception.pending) {
356  	queue:
357  		if (has_error && !is_protmode(vcpu))
358  			has_error = false;
359  		vcpu->arch.exception.pending = true;
360  		vcpu->arch.exception.has_error_code = has_error;
361  		vcpu->arch.exception.nr = nr;
362  		vcpu->arch.exception.error_code = error_code;
363  		vcpu->arch.exception.reinject = reinject;
364  		return;
365  	}
366  
367  	/* to check exception */
368  	prev_nr = vcpu->arch.exception.nr;
369  	if (prev_nr == DF_VECTOR) {
370  		/* triple fault -> shutdown */
371  		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
372  		return;
373  	}
374  	class1 = exception_class(prev_nr);
375  	class2 = exception_class(nr);
376  	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
377  		|| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
378  		/* generate double fault per SDM Table 5-5 */
379  		vcpu->arch.exception.pending = true;
380  		vcpu->arch.exception.has_error_code = true;
381  		vcpu->arch.exception.nr = DF_VECTOR;
382  		vcpu->arch.exception.error_code = 0;
383  	} else
384  		/* replace previous exception with a new one in a hope
385  		   that instruction re-execution will regenerate lost
386  		   exception */
387  		goto queue;
388  }
389  
390  void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
391  {
392  	kvm_multiple_exception(vcpu, nr, false, 0, false);
393  }
394  EXPORT_SYMBOL_GPL(kvm_queue_exception);
395  
396  void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
397  {
398  	kvm_multiple_exception(vcpu, nr, false, 0, true);
399  }
400  EXPORT_SYMBOL_GPL(kvm_requeue_exception);
401  
402  void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
403  {
404  	if (err)
405  		kvm_inject_gp(vcpu, 0);
406  	else
407  		kvm_x86_ops->skip_emulated_instruction(vcpu);
408  }
409  EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
410  
411  void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
412  {
413  	++vcpu->stat.pf_guest;
414  	vcpu->arch.cr2 = fault->address;
415  	kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
416  }
417  EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
418  
419  static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
420  {
421  	if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
422  		vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
423  	else
424  		vcpu->arch.mmu.inject_page_fault(vcpu, fault);
425  
426  	return fault->nested_page_fault;
427  }
428  
429  void kvm_inject_nmi(struct kvm_vcpu *vcpu)
430  {
431  	atomic_inc(&vcpu->arch.nmi_queued);
432  	kvm_make_request(KVM_REQ_NMI, vcpu);
433  }
434  EXPORT_SYMBOL_GPL(kvm_inject_nmi);
435  
436  void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
437  {
438  	kvm_multiple_exception(vcpu, nr, true, error_code, false);
439  }
440  EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
441  
442  void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
443  {
444  	kvm_multiple_exception(vcpu, nr, true, error_code, true);
445  }
446  EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
447  
448  /*
449   * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
450   * a #GP and return false.
451   */
452  bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
453  {
454  	if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
455  		return true;
456  	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
457  	return false;
458  }
459  EXPORT_SYMBOL_GPL(kvm_require_cpl);
460  
461  bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
462  {
463  	if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
464  		return true;
465  
466  	kvm_queue_exception(vcpu, UD_VECTOR);
467  	return false;
468  }
469  EXPORT_SYMBOL_GPL(kvm_require_dr);
470  
471  /*
472   * This function will be used to read from the physical memory of the currently
473   * running guest. The difference to kvm_read_guest_page is that this function
474   * can read from guest physical or from the guest's guest physical memory.
475   */
476  int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
477  			    gfn_t ngfn, void *data, int offset, int len,
478  			    u32 access)
479  {
480  	struct x86_exception exception;
481  	gfn_t real_gfn;
482  	gpa_t ngpa;
483  
484  	ngpa     = gfn_to_gpa(ngfn);
485  	real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
486  	if (real_gfn == UNMAPPED_GVA)
487  		return -EFAULT;
488  
489  	real_gfn = gpa_to_gfn(real_gfn);
490  
491  	return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
492  }
493  EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
494  
495  int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
496  			       void *data, int offset, int len, u32 access)
497  {
498  	return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
499  				       data, offset, len, access);
500  }
501  
502  /*
503   * Load the pae pdptrs.  Return true is they are all valid.
504   */
505  int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
506  {
507  	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
508  	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
509  	int i;
510  	int ret;
511  	u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
512  
513  	ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
514  				      offset * sizeof(u64), sizeof(pdpte),
515  				      PFERR_USER_MASK|PFERR_WRITE_MASK);
516  	if (ret < 0) {
517  		ret = 0;
518  		goto out;
519  	}
520  	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
521  		if (is_present_gpte(pdpte[i]) &&
522  		    (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
523  			ret = 0;
524  			goto out;
525  		}
526  	}
527  	ret = 1;
528  
529  	memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
530  	__set_bit(VCPU_EXREG_PDPTR,
531  		  (unsigned long *)&vcpu->arch.regs_avail);
532  	__set_bit(VCPU_EXREG_PDPTR,
533  		  (unsigned long *)&vcpu->arch.regs_dirty);
534  out:
535  
536  	return ret;
537  }
538  EXPORT_SYMBOL_GPL(load_pdptrs);
539  
540  static bool pdptrs_changed(struct kvm_vcpu *vcpu)
541  {
542  	u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
543  	bool changed = true;
544  	int offset;
545  	gfn_t gfn;
546  	int r;
547  
548  	if (is_long_mode(vcpu) || !is_pae(vcpu))
549  		return false;
550  
551  	if (!test_bit(VCPU_EXREG_PDPTR,
552  		      (unsigned long *)&vcpu->arch.regs_avail))
553  		return true;
554  
555  	gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
556  	offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
557  	r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
558  				       PFERR_USER_MASK | PFERR_WRITE_MASK);
559  	if (r < 0)
560  		goto out;
561  	changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
562  out:
563  
564  	return changed;
565  }
566  
567  int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
568  {
569  	unsigned long old_cr0 = kvm_read_cr0(vcpu);
570  	unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
571  				    X86_CR0_CD | X86_CR0_NW;
572  
573  	cr0 |= X86_CR0_ET;
574  
575  #ifdef CONFIG_X86_64
576  	if (cr0 & 0xffffffff00000000UL)
577  		return 1;
578  #endif
579  
580  	cr0 &= ~CR0_RESERVED_BITS;
581  
582  	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
583  		return 1;
584  
585  	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
586  		return 1;
587  
588  	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
589  #ifdef CONFIG_X86_64
590  		if ((vcpu->arch.efer & EFER_LME)) {
591  			int cs_db, cs_l;
592  
593  			if (!is_pae(vcpu))
594  				return 1;
595  			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
596  			if (cs_l)
597  				return 1;
598  		} else
599  #endif
600  		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
601  						 kvm_read_cr3(vcpu)))
602  			return 1;
603  	}
604  
605  	if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
606  		return 1;
607  
608  	kvm_x86_ops->set_cr0(vcpu, cr0);
609  
610  	if ((cr0 ^ old_cr0) & X86_CR0_PG) {
611  		kvm_clear_async_pf_completion_queue(vcpu);
612  		kvm_async_pf_hash_reset(vcpu);
613  	}
614  
615  	if ((cr0 ^ old_cr0) & update_bits)
616  		kvm_mmu_reset_context(vcpu);
617  	return 0;
618  }
619  EXPORT_SYMBOL_GPL(kvm_set_cr0);
620  
621  void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
622  {
623  	(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
624  }
625  EXPORT_SYMBOL_GPL(kvm_lmsw);
626  
627  static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
628  {
629  	if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
630  			!vcpu->guest_xcr0_loaded) {
631  		/* kvm_set_xcr() also depends on this */
632  		xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
633  		vcpu->guest_xcr0_loaded = 1;
634  	}
635  }
636  
637  static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
638  {
639  	if (vcpu->guest_xcr0_loaded) {
640  		if (vcpu->arch.xcr0 != host_xcr0)
641  			xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
642  		vcpu->guest_xcr0_loaded = 0;
643  	}
644  }
645  
646  int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
647  {
648  	u64 xcr0 = xcr;
649  	u64 old_xcr0 = vcpu->arch.xcr0;
650  	u64 valid_bits;
651  
652  	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
653  	if (index != XCR_XFEATURE_ENABLED_MASK)
654  		return 1;
655  	if (!(xcr0 & XSTATE_FP))
656  		return 1;
657  	if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
658  		return 1;
659  
660  	/*
661  	 * Do not allow the guest to set bits that we do not support
662  	 * saving.  However, xcr0 bit 0 is always set, even if the
663  	 * emulated CPU does not support XSAVE (see fx_init).
664  	 */
665  	valid_bits = vcpu->arch.guest_supported_xcr0 | XSTATE_FP;
666  	if (xcr0 & ~valid_bits)
667  		return 1;
668  
669  	if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR)))
670  		return 1;
671  
672  	if (xcr0 & XSTATE_AVX512) {
673  		if (!(xcr0 & XSTATE_YMM))
674  			return 1;
675  		if ((xcr0 & XSTATE_AVX512) != XSTATE_AVX512)
676  			return 1;
677  	}
678  	kvm_put_guest_xcr0(vcpu);
679  	vcpu->arch.xcr0 = xcr0;
680  
681  	if ((xcr0 ^ old_xcr0) & XSTATE_EXTEND_MASK)
682  		kvm_update_cpuid(vcpu);
683  	return 0;
684  }
685  
686  int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
687  {
688  	if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
689  	    __kvm_set_xcr(vcpu, index, xcr)) {
690  		kvm_inject_gp(vcpu, 0);
691  		return 1;
692  	}
693  	return 0;
694  }
695  EXPORT_SYMBOL_GPL(kvm_set_xcr);
696  
697  int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
698  {
699  	unsigned long old_cr4 = kvm_read_cr4(vcpu);
700  	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE |
701  				   X86_CR4_PAE | X86_CR4_SMEP;
702  	if (cr4 & CR4_RESERVED_BITS)
703  		return 1;
704  
705  	if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
706  		return 1;
707  
708  	if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
709  		return 1;
710  
711  	if (!guest_cpuid_has_smap(vcpu) && (cr4 & X86_CR4_SMAP))
712  		return 1;
713  
714  	if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_FSGSBASE))
715  		return 1;
716  
717  	if (is_long_mode(vcpu)) {
718  		if (!(cr4 & X86_CR4_PAE))
719  			return 1;
720  	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
721  		   && ((cr4 ^ old_cr4) & pdptr_bits)
722  		   && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
723  				   kvm_read_cr3(vcpu)))
724  		return 1;
725  
726  	if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
727  		if (!guest_cpuid_has_pcid(vcpu))
728  			return 1;
729  
730  		/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
731  		if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
732  			return 1;
733  	}
734  
735  	if (kvm_x86_ops->set_cr4(vcpu, cr4))
736  		return 1;
737  
738  	if (((cr4 ^ old_cr4) & pdptr_bits) ||
739  	    (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
740  		kvm_mmu_reset_context(vcpu);
741  
742  	if ((cr4 ^ old_cr4) & X86_CR4_SMAP)
743  		update_permission_bitmask(vcpu, vcpu->arch.walk_mmu, false);
744  
745  	if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
746  		kvm_update_cpuid(vcpu);
747  
748  	return 0;
749  }
750  EXPORT_SYMBOL_GPL(kvm_set_cr4);
751  
752  int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
753  {
754  #ifdef CONFIG_X86_64
755  	cr3 &= ~CR3_PCID_INVD;
756  #endif
757  
758  	if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
759  		kvm_mmu_sync_roots(vcpu);
760  		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
761  		return 0;
762  	}
763  
764  	if (is_long_mode(vcpu)) {
765  		if (cr3 & CR3_L_MODE_RESERVED_BITS)
766  			return 1;
767  	} else if (is_pae(vcpu) && is_paging(vcpu) &&
768  		   !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
769  		return 1;
770  
771  	vcpu->arch.cr3 = cr3;
772  	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
773  	kvm_mmu_new_cr3(vcpu);
774  	return 0;
775  }
776  EXPORT_SYMBOL_GPL(kvm_set_cr3);
777  
778  int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
779  {
780  	if (cr8 & CR8_RESERVED_BITS)
781  		return 1;
782  	if (irqchip_in_kernel(vcpu->kvm))
783  		kvm_lapic_set_tpr(vcpu, cr8);
784  	else
785  		vcpu->arch.cr8 = cr8;
786  	return 0;
787  }
788  EXPORT_SYMBOL_GPL(kvm_set_cr8);
789  
790  unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
791  {
792  	if (irqchip_in_kernel(vcpu->kvm))
793  		return kvm_lapic_get_cr8(vcpu);
794  	else
795  		return vcpu->arch.cr8;
796  }
797  EXPORT_SYMBOL_GPL(kvm_get_cr8);
798  
799  static void kvm_update_dr6(struct kvm_vcpu *vcpu)
800  {
801  	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
802  		kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
803  }
804  
805  static void kvm_update_dr7(struct kvm_vcpu *vcpu)
806  {
807  	unsigned long dr7;
808  
809  	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
810  		dr7 = vcpu->arch.guest_debug_dr7;
811  	else
812  		dr7 = vcpu->arch.dr7;
813  	kvm_x86_ops->set_dr7(vcpu, dr7);
814  	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
815  	if (dr7 & DR7_BP_EN_MASK)
816  		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
817  }
818  
819  static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
820  {
821  	u64 fixed = DR6_FIXED_1;
822  
823  	if (!guest_cpuid_has_rtm(vcpu))
824  		fixed |= DR6_RTM;
825  	return fixed;
826  }
827  
828  static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
829  {
830  	switch (dr) {
831  	case 0 ... 3:
832  		vcpu->arch.db[dr] = val;
833  		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
834  			vcpu->arch.eff_db[dr] = val;
835  		break;
836  	case 4:
837  		/* fall through */
838  	case 6:
839  		if (val & 0xffffffff00000000ULL)
840  			return -1; /* #GP */
841  		vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
842  		kvm_update_dr6(vcpu);
843  		break;
844  	case 5:
845  		/* fall through */
846  	default: /* 7 */
847  		if (val & 0xffffffff00000000ULL)
848  			return -1; /* #GP */
849  		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
850  		kvm_update_dr7(vcpu);
851  		break;
852  	}
853  
854  	return 0;
855  }
856  
857  int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
858  {
859  	if (__kvm_set_dr(vcpu, dr, val)) {
860  		kvm_inject_gp(vcpu, 0);
861  		return 1;
862  	}
863  	return 0;
864  }
865  EXPORT_SYMBOL_GPL(kvm_set_dr);
866  
867  int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
868  {
869  	switch (dr) {
870  	case 0 ... 3:
871  		*val = vcpu->arch.db[dr];
872  		break;
873  	case 4:
874  		/* fall through */
875  	case 6:
876  		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
877  			*val = vcpu->arch.dr6;
878  		else
879  			*val = kvm_x86_ops->get_dr6(vcpu);
880  		break;
881  	case 5:
882  		/* fall through */
883  	default: /* 7 */
884  		*val = vcpu->arch.dr7;
885  		break;
886  	}
887  	return 0;
888  }
889  EXPORT_SYMBOL_GPL(kvm_get_dr);
890  
891  bool kvm_rdpmc(struct kvm_vcpu *vcpu)
892  {
893  	u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
894  	u64 data;
895  	int err;
896  
897  	err = kvm_pmu_read_pmc(vcpu, ecx, &data);
898  	if (err)
899  		return err;
900  	kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
901  	kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
902  	return err;
903  }
904  EXPORT_SYMBOL_GPL(kvm_rdpmc);
905  
906  /*
907   * List of msr numbers which we expose to userspace through KVM_GET_MSRS
908   * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
909   *
910   * This list is modified at module load time to reflect the
911   * capabilities of the host cpu. This capabilities test skips MSRs that are
912   * kvm-specific. Those are put in the beginning of the list.
913   */
914  
915  #define KVM_SAVE_MSRS_BEGIN	12
916  static u32 msrs_to_save[] = {
917  	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
918  	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
919  	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
920  	HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
921  	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
922  	MSR_KVM_PV_EOI_EN,
923  	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
924  	MSR_STAR,
925  #ifdef CONFIG_X86_64
926  	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
927  #endif
928  	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
929  	MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS
930  };
931  
932  static unsigned num_msrs_to_save;
933  
934  static const u32 emulated_msrs[] = {
935  	MSR_IA32_TSC_ADJUST,
936  	MSR_IA32_TSCDEADLINE,
937  	MSR_IA32_MISC_ENABLE,
938  	MSR_IA32_MCG_STATUS,
939  	MSR_IA32_MCG_CTL,
940  };
941  
942  bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
943  {
944  	if (efer & efer_reserved_bits)
945  		return false;
946  
947  	if (efer & EFER_FFXSR) {
948  		struct kvm_cpuid_entry2 *feat;
949  
950  		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
951  		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
952  			return false;
953  	}
954  
955  	if (efer & EFER_SVME) {
956  		struct kvm_cpuid_entry2 *feat;
957  
958  		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
959  		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
960  			return false;
961  	}
962  
963  	return true;
964  }
965  EXPORT_SYMBOL_GPL(kvm_valid_efer);
966  
967  static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
968  {
969  	u64 old_efer = vcpu->arch.efer;
970  
971  	if (!kvm_valid_efer(vcpu, efer))
972  		return 1;
973  
974  	if (is_paging(vcpu)
975  	    && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
976  		return 1;
977  
978  	efer &= ~EFER_LMA;
979  	efer |= vcpu->arch.efer & EFER_LMA;
980  
981  	kvm_x86_ops->set_efer(vcpu, efer);
982  
983  	/* Update reserved bits */
984  	if ((efer ^ old_efer) & EFER_NX)
985  		kvm_mmu_reset_context(vcpu);
986  
987  	return 0;
988  }
989  
990  void kvm_enable_efer_bits(u64 mask)
991  {
992         efer_reserved_bits &= ~mask;
993  }
994  EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
995  
996  /*
997   * Writes msr value into into the appropriate "register".
998   * Returns 0 on success, non-0 otherwise.
999   * Assumes vcpu_load() was already called.
1000   */
1001  int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
1002  {
1003  	switch (msr->index) {
1004  	case MSR_FS_BASE:
1005  	case MSR_GS_BASE:
1006  	case MSR_KERNEL_GS_BASE:
1007  	case MSR_CSTAR:
1008  	case MSR_LSTAR:
1009  		if (is_noncanonical_address(msr->data))
1010  			return 1;
1011  		break;
1012  	case MSR_IA32_SYSENTER_EIP:
1013  	case MSR_IA32_SYSENTER_ESP:
1014  		/*
1015  		 * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
1016  		 * non-canonical address is written on Intel but not on
1017  		 * AMD (which ignores the top 32-bits, because it does
1018  		 * not implement 64-bit SYSENTER).
1019  		 *
1020  		 * 64-bit code should hence be able to write a non-canonical
1021  		 * value on AMD.  Making the address canonical ensures that
1022  		 * vmentry does not fail on Intel after writing a non-canonical
1023  		 * value, and that something deterministic happens if the guest
1024  		 * invokes 64-bit SYSENTER.
1025  		 */
1026  		msr->data = get_canonical(msr->data);
1027  	}
1028  	return kvm_x86_ops->set_msr(vcpu, msr);
1029  }
1030  EXPORT_SYMBOL_GPL(kvm_set_msr);
1031  
1032  /*
1033   * Adapt set_msr() to msr_io()'s calling convention
1034   */
1035  static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1036  {
1037  	struct msr_data msr;
1038  
1039  	msr.data = *data;
1040  	msr.index = index;
1041  	msr.host_initiated = true;
1042  	return kvm_set_msr(vcpu, &msr);
1043  }
1044  
1045  #ifdef CONFIG_X86_64
1046  struct pvclock_gtod_data {
1047  	seqcount_t	seq;
1048  
1049  	struct { /* extract of a clocksource struct */
1050  		int vclock_mode;
1051  		cycle_t	cycle_last;
1052  		cycle_t	mask;
1053  		u32	mult;
1054  		u32	shift;
1055  	} clock;
1056  
1057  	u64		boot_ns;
1058  	u64		nsec_base;
1059  };
1060  
1061  static struct pvclock_gtod_data pvclock_gtod_data;
1062  
1063  static void update_pvclock_gtod(struct timekeeper *tk)
1064  {
1065  	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
1066  	u64 boot_ns;
1067  
1068  	boot_ns = ktime_to_ns(ktime_add(tk->tkr.base_mono, tk->offs_boot));
1069  
1070  	write_seqcount_begin(&vdata->seq);
1071  
1072  	/* copy pvclock gtod data */
1073  	vdata->clock.vclock_mode	= tk->tkr.clock->archdata.vclock_mode;
1074  	vdata->clock.cycle_last		= tk->tkr.cycle_last;
1075  	vdata->clock.mask		= tk->tkr.mask;
1076  	vdata->clock.mult		= tk->tkr.mult;
1077  	vdata->clock.shift		= tk->tkr.shift;
1078  
1079  	vdata->boot_ns			= boot_ns;
1080  	vdata->nsec_base		= tk->tkr.xtime_nsec;
1081  
1082  	write_seqcount_end(&vdata->seq);
1083  }
1084  #endif
1085  
1086  
1087  static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
1088  {
1089  	int version;
1090  	int r;
1091  	struct pvclock_wall_clock wc;
1092  	struct timespec boot;
1093  
1094  	if (!wall_clock)
1095  		return;
1096  
1097  	r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
1098  	if (r)
1099  		return;
1100  
1101  	if (version & 1)
1102  		++version;  /* first time write, random junk */
1103  
1104  	++version;
1105  
1106  	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
1107  
1108  	/*
1109  	 * The guest calculates current wall clock time by adding
1110  	 * system time (updated by kvm_guest_time_update below) to the
1111  	 * wall clock specified here.  guest system time equals host
1112  	 * system time for us, thus we must fill in host boot time here.
1113  	 */
1114  	getboottime(&boot);
1115  
1116  	if (kvm->arch.kvmclock_offset) {
1117  		struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset);
1118  		boot = timespec_sub(boot, ts);
1119  	}
1120  	wc.sec = boot.tv_sec;
1121  	wc.nsec = boot.tv_nsec;
1122  	wc.version = version;
1123  
1124  	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
1125  
1126  	version++;
1127  	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
1128  }
1129  
1130  static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
1131  {
1132  	uint32_t quotient, remainder;
1133  
1134  	/* Don't try to replace with do_div(), this one calculates
1135  	 * "(dividend << 32) / divisor" */
1136  	__asm__ ( "divl %4"
1137  		  : "=a" (quotient), "=d" (remainder)
1138  		  : "0" (0), "1" (dividend), "r" (divisor) );
1139  	return quotient;
1140  }
1141  
1142  static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
1143  			       s8 *pshift, u32 *pmultiplier)
1144  {
1145  	uint64_t scaled64;
1146  	int32_t  shift = 0;
1147  	uint64_t tps64;
1148  	uint32_t tps32;
1149  
1150  	tps64 = base_khz * 1000LL;
1151  	scaled64 = scaled_khz * 1000LL;
1152  	while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
1153  		tps64 >>= 1;
1154  		shift--;
1155  	}
1156  
1157  	tps32 = (uint32_t)tps64;
1158  	while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
1159  		if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
1160  			scaled64 >>= 1;
1161  		else
1162  			tps32 <<= 1;
1163  		shift++;
1164  	}
1165  
1166  	*pshift = shift;
1167  	*pmultiplier = div_frac(scaled64, tps32);
1168  
1169  	pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
1170  		 __func__, base_khz, scaled_khz, shift, *pmultiplier);
1171  }
1172  
1173  static inline u64 get_kernel_ns(void)
1174  {
1175  	return ktime_get_boot_ns();
1176  }
1177  
1178  #ifdef CONFIG_X86_64
1179  static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
1180  #endif
1181  
1182  static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
1183  unsigned long max_tsc_khz;
1184  
1185  static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
1186  {
1187  	return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult,
1188  				   vcpu->arch.virtual_tsc_shift);
1189  }
1190  
1191  static u32 adjust_tsc_khz(u32 khz, s32 ppm)
1192  {
1193  	u64 v = (u64)khz * (1000000 + ppm);
1194  	do_div(v, 1000000);
1195  	return v;
1196  }
1197  
1198  static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
1199  {
1200  	u32 thresh_lo, thresh_hi;
1201  	int use_scaling = 0;
1202  
1203  	/* tsc_khz can be zero if TSC calibration fails */
1204  	if (this_tsc_khz == 0)
1205  		return;
1206  
1207  	/* Compute a scale to convert nanoseconds in TSC cycles */
1208  	kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
1209  			   &vcpu->arch.virtual_tsc_shift,
1210  			   &vcpu->arch.virtual_tsc_mult);
1211  	vcpu->arch.virtual_tsc_khz = this_tsc_khz;
1212  
1213  	/*
1214  	 * Compute the variation in TSC rate which is acceptable
1215  	 * within the range of tolerance and decide if the
1216  	 * rate being applied is within that bounds of the hardware
1217  	 * rate.  If so, no scaling or compensation need be done.
1218  	 */
1219  	thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
1220  	thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
1221  	if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) {
1222  		pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi);
1223  		use_scaling = 1;
1224  	}
1225  	kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling);
1226  }
1227  
1228  static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1229  {
1230  	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
1231  				      vcpu->arch.virtual_tsc_mult,
1232  				      vcpu->arch.virtual_tsc_shift);
1233  	tsc += vcpu->arch.this_tsc_write;
1234  	return tsc;
1235  }
1236  
1237  void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
1238  {
1239  #ifdef CONFIG_X86_64
1240  	bool vcpus_matched;
1241  	struct kvm_arch *ka = &vcpu->kvm->arch;
1242  	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1243  
1244  	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1245  			 atomic_read(&vcpu->kvm->online_vcpus));
1246  
1247  	/*
1248  	 * Once the masterclock is enabled, always perform request in
1249  	 * order to update it.
1250  	 *
1251  	 * In order to enable masterclock, the host clocksource must be TSC
1252  	 * and the vcpus need to have matched TSCs.  When that happens,
1253  	 * perform request to enable masterclock.
1254  	 */
1255  	if (ka->use_master_clock ||
1256  	    (gtod->clock.vclock_mode == VCLOCK_TSC && vcpus_matched))
1257  		kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
1258  
1259  	trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
1260  			    atomic_read(&vcpu->kvm->online_vcpus),
1261  		            ka->use_master_clock, gtod->clock.vclock_mode);
1262  #endif
1263  }
1264  
1265  static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
1266  {
1267  	u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
1268  	vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
1269  }
1270  
1271  void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
1272  {
1273  	struct kvm *kvm = vcpu->kvm;
1274  	u64 offset, ns, elapsed;
1275  	unsigned long flags;
1276  	s64 usdiff;
1277  	bool matched;
1278  	bool already_matched;
1279  	u64 data = msr->data;
1280  
1281  	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1282  	offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1283  	ns = get_kernel_ns();
1284  	elapsed = ns - kvm->arch.last_tsc_nsec;
1285  
1286  	if (vcpu->arch.virtual_tsc_khz) {
1287  		int faulted = 0;
1288  
1289  		/* n.b - signed multiplication and division required */
1290  		usdiff = data - kvm->arch.last_tsc_write;
1291  #ifdef CONFIG_X86_64
1292  		usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
1293  #else
1294  		/* do_div() only does unsigned */
1295  		asm("1: idivl %[divisor]\n"
1296  		    "2: xor %%edx, %%edx\n"
1297  		    "   movl $0, %[faulted]\n"
1298  		    "3:\n"
1299  		    ".section .fixup,\"ax\"\n"
1300  		    "4: movl $1, %[faulted]\n"
1301  		    "   jmp  3b\n"
1302  		    ".previous\n"
1303  
1304  		_ASM_EXTABLE(1b, 4b)
1305  
1306  		: "=A"(usdiff), [faulted] "=r" (faulted)
1307  		: "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz));
1308  
1309  #endif
1310  		do_div(elapsed, 1000);
1311  		usdiff -= elapsed;
1312  		if (usdiff < 0)
1313  			usdiff = -usdiff;
1314  
1315  		/* idivl overflow => difference is larger than USEC_PER_SEC */
1316  		if (faulted)
1317  			usdiff = USEC_PER_SEC;
1318  	} else
1319  		usdiff = USEC_PER_SEC; /* disable TSC match window below */
1320  
1321  	/*
1322  	 * Special case: TSC write with a small delta (1 second) of virtual
1323  	 * cycle time against real time is interpreted as an attempt to
1324  	 * synchronize the CPU.
1325           *
1326  	 * For a reliable TSC, we can match TSC offsets, and for an unstable
1327  	 * TSC, we add elapsed time in this computation.  We could let the
1328  	 * compensation code attempt to catch up if we fall behind, but
1329  	 * it's better to try to match offsets from the beginning.
1330           */
1331  	if (usdiff < USEC_PER_SEC &&
1332  	    vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
1333  		if (!check_tsc_unstable()) {
1334  			offset = kvm->arch.cur_tsc_offset;
1335  			pr_debug("kvm: matched tsc offset for %llu\n", data);
1336  		} else {
1337  			u64 delta = nsec_to_cycles(vcpu, elapsed);
1338  			data += delta;
1339  			offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1340  			pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1341  		}
1342  		matched = true;
1343  		already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
1344  	} else {
1345  		/*
1346  		 * We split periods of matched TSC writes into generations.
1347  		 * For each generation, we track the original measured
1348  		 * nanosecond time, offset, and write, so if TSCs are in
1349  		 * sync, we can match exact offset, and if not, we can match
1350  		 * exact software computation in compute_guest_tsc()
1351  		 *
1352  		 * These values are tracked in kvm->arch.cur_xxx variables.
1353  		 */
1354  		kvm->arch.cur_tsc_generation++;
1355  		kvm->arch.cur_tsc_nsec = ns;
1356  		kvm->arch.cur_tsc_write = data;
1357  		kvm->arch.cur_tsc_offset = offset;
1358  		matched = false;
1359  		pr_debug("kvm: new tsc generation %llu, clock %llu\n",
1360  			 kvm->arch.cur_tsc_generation, data);
1361  	}
1362  
1363  	/*
1364  	 * We also track th most recent recorded KHZ, write and time to
1365  	 * allow the matching interval to be extended at each write.
1366  	 */
1367  	kvm->arch.last_tsc_nsec = ns;
1368  	kvm->arch.last_tsc_write = data;
1369  	kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
1370  
1371  	vcpu->arch.last_guest_tsc = data;
1372  
1373  	/* Keep track of which generation this VCPU has synchronized to */
1374  	vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
1375  	vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
1376  	vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
1377  
1378  	if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
1379  		update_ia32_tsc_adjust_msr(vcpu, offset);
1380  	kvm_x86_ops->write_tsc_offset(vcpu, offset);
1381  	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1382  
1383  	spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
1384  	if (!matched) {
1385  		kvm->arch.nr_vcpus_matched_tsc = 0;
1386  	} else if (!already_matched) {
1387  		kvm->arch.nr_vcpus_matched_tsc++;
1388  	}
1389  
1390  	kvm_track_tsc_matching(vcpu);
1391  	spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
1392  }
1393  
1394  EXPORT_SYMBOL_GPL(kvm_write_tsc);
1395  
1396  #ifdef CONFIG_X86_64
1397  
1398  static cycle_t read_tsc(void)
1399  {
1400  	cycle_t ret;
1401  	u64 last;
1402  
1403  	/*
1404  	 * Empirically, a fence (of type that depends on the CPU)
1405  	 * before rdtsc is enough to ensure that rdtsc is ordered
1406  	 * with respect to loads.  The various CPU manuals are unclear
1407  	 * as to whether rdtsc can be reordered with later loads,
1408  	 * but no one has ever seen it happen.
1409  	 */
1410  	rdtsc_barrier();
1411  	ret = (cycle_t)vget_cycles();
1412  
1413  	last = pvclock_gtod_data.clock.cycle_last;
1414  
1415  	if (likely(ret >= last))
1416  		return ret;
1417  
1418  	/*
1419  	 * GCC likes to generate cmov here, but this branch is extremely
1420  	 * predictable (it's just a funciton of time and the likely is
1421  	 * very likely) and there's a data dependence, so force GCC
1422  	 * to generate a branch instead.  I don't barrier() because
1423  	 * we don't actually need a barrier, and if this function
1424  	 * ever gets inlined it will generate worse code.
1425  	 */
1426  	asm volatile ("");
1427  	return last;
1428  }
1429  
1430  static inline u64 vgettsc(cycle_t *cycle_now)
1431  {
1432  	long v;
1433  	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1434  
1435  	*cycle_now = read_tsc();
1436  
1437  	v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
1438  	return v * gtod->clock.mult;
1439  }
1440  
1441  static int do_monotonic_boot(s64 *t, cycle_t *cycle_now)
1442  {
1443  	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
1444  	unsigned long seq;
1445  	int mode;
1446  	u64 ns;
1447  
1448  	do {
1449  		seq = read_seqcount_begin(&gtod->seq);
1450  		mode = gtod->clock.vclock_mode;
1451  		ns = gtod->nsec_base;
1452  		ns += vgettsc(cycle_now);
1453  		ns >>= gtod->clock.shift;
1454  		ns += gtod->boot_ns;
1455  	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
1456  	*t = ns;
1457  
1458  	return mode;
1459  }
1460  
1461  /* returns true if host is using tsc clocksource */
1462  static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
1463  {
1464  	/* checked again under seqlock below */
1465  	if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
1466  		return false;
1467  
1468  	return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC;
1469  }
1470  #endif
1471  
1472  /*
1473   *
1474   * Assuming a stable TSC across physical CPUS, and a stable TSC
1475   * across virtual CPUs, the following condition is possible.
1476   * Each numbered line represents an event visible to both
1477   * CPUs at the next numbered event.
1478   *
1479   * "timespecX" represents host monotonic time. "tscX" represents
1480   * RDTSC value.
1481   *
1482   * 		VCPU0 on CPU0		|	VCPU1 on CPU1
1483   *
1484   * 1.  read timespec0,tsc0
1485   * 2.					| timespec1 = timespec0 + N
1486   * 					| tsc1 = tsc0 + M
1487   * 3. transition to guest		| transition to guest
1488   * 4. ret0 = timespec0 + (rdtsc - tsc0) |
1489   * 5.				        | ret1 = timespec1 + (rdtsc - tsc1)
1490   * 				        | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
1491   *
1492   * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
1493   *
1494   * 	- ret0 < ret1
1495   *	- timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
1496   *		...
1497   *	- 0 < N - M => M < N
1498   *
1499   * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
1500   * always the case (the difference between two distinct xtime instances
1501   * might be smaller then the difference between corresponding TSC reads,
1502   * when updating guest vcpus pvclock areas).
1503   *
1504   * To avoid that problem, do not allow visibility of distinct
1505   * system_timestamp/tsc_timestamp values simultaneously: use a master
1506   * copy of host monotonic time values. Update that master copy
1507   * in lockstep.
1508   *
1509   * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
1510   *
1511   */
1512  
1513  static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
1514  {
1515  #ifdef CONFIG_X86_64
1516  	struct kvm_arch *ka = &kvm->arch;
1517  	int vclock_mode;
1518  	bool host_tsc_clocksource, vcpus_matched;
1519  
1520  	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
1521  			atomic_read(&kvm->online_vcpus));
1522  
1523  	/*
1524  	 * If the host uses TSC clock, then passthrough TSC as stable
1525  	 * to the guest.
1526  	 */
1527  	host_tsc_clocksource = kvm_get_time_and_clockread(
1528  					&ka->master_kernel_ns,
1529  					&ka->master_cycle_now);
1530  
1531  	ka->use_master_clock = host_tsc_clocksource && vcpus_matched
1532  				&& !backwards_tsc_observed;
1533  
1534  	if (ka->use_master_clock)
1535  		atomic_set(&kvm_guest_has_master_clock, 1);
1536  
1537  	vclock_mode = pvclock_gtod_data.clock.vclock_mode;
1538  	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
1539  					vcpus_matched);
1540  #endif
1541  }
1542  
1543  static void kvm_gen_update_masterclock(struct kvm *kvm)
1544  {
1545  #ifdef CONFIG_X86_64
1546  	int i;
1547  	struct kvm_vcpu *vcpu;
1548  	struct kvm_arch *ka = &kvm->arch;
1549  
1550  	spin_lock(&ka->pvclock_gtod_sync_lock);
1551  	kvm_make_mclock_inprogress_request(kvm);
1552  	/* no guest entries from this point */
1553  	pvclock_update_vm_gtod_copy(kvm);
1554  
1555  	kvm_for_each_vcpu(i, vcpu, kvm)
1556  		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
1557  
1558  	/* guest entries allowed */
1559  	kvm_for_each_vcpu(i, vcpu, kvm)
1560  		clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
1561  
1562  	spin_unlock(&ka->pvclock_gtod_sync_lock);
1563  #endif
1564  }
1565  
1566  static int kvm_guest_time_update(struct kvm_vcpu *v)
1567  {
1568  	unsigned long flags, this_tsc_khz;
1569  	struct kvm_vcpu_arch *vcpu = &v->arch;
1570  	struct kvm_arch *ka = &v->kvm->arch;
1571  	s64 kernel_ns;
1572  	u64 tsc_timestamp, host_tsc;
1573  	struct pvclock_vcpu_time_info guest_hv_clock;
1574  	u8 pvclock_flags;
1575  	bool use_master_clock;
1576  
1577  	kernel_ns = 0;
1578  	host_tsc = 0;
1579  
1580  	/*
1581  	 * If the host uses TSC clock, then passthrough TSC as stable
1582  	 * to the guest.
1583  	 */
1584  	spin_lock(&ka->pvclock_gtod_sync_lock);
1585  	use_master_clock = ka->use_master_clock;
1586  	if (use_master_clock) {
1587  		host_tsc = ka->master_cycle_now;
1588  		kernel_ns = ka->master_kernel_ns;
1589  	}
1590  	spin_unlock(&ka->pvclock_gtod_sync_lock);
1591  
1592  	/* Keep irq disabled to prevent changes to the clock */
1593  	local_irq_save(flags);
1594  	this_tsc_khz = __this_cpu_read(cpu_tsc_khz);
1595  	if (unlikely(this_tsc_khz == 0)) {
1596  		local_irq_restore(flags);
1597  		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
1598  		return 1;
1599  	}
1600  	if (!use_master_clock) {
1601  		host_tsc = native_read_tsc();
1602  		kernel_ns = get_kernel_ns();
1603  	}
1604  
1605  	tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc);
1606  
1607  	/*
1608  	 * We may have to catch up the TSC to match elapsed wall clock
1609  	 * time for two reasons, even if kvmclock is used.
1610  	 *   1) CPU could have been running below the maximum TSC rate
1611  	 *   2) Broken TSC compensation resets the base at each VCPU
1612  	 *      entry to avoid unknown leaps of TSC even when running
1613  	 *      again on the same CPU.  This may cause apparent elapsed
1614  	 *      time to disappear, and the guest to stand still or run
1615  	 *	very slowly.
1616  	 */
1617  	if (vcpu->tsc_catchup) {
1618  		u64 tsc = compute_guest_tsc(v, kernel_ns);
1619  		if (tsc > tsc_timestamp) {
1620  			adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
1621  			tsc_timestamp = tsc;
1622  		}
1623  	}
1624  
1625  	local_irq_restore(flags);
1626  
1627  	if (!vcpu->pv_time_enabled)
1628  		return 0;
1629  
1630  	if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
1631  		kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
1632  				   &vcpu->hv_clock.tsc_shift,
1633  				   &vcpu->hv_clock.tsc_to_system_mul);
1634  		vcpu->hw_tsc_khz = this_tsc_khz;
1635  	}
1636  
1637  	/* With all the info we got, fill in the values */
1638  	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
1639  	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
1640  	vcpu->last_guest_tsc = tsc_timestamp;
1641  
1642  	if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
1643  		&guest_hv_clock, sizeof(guest_hv_clock))))
1644  		return 0;
1645  
1646  	/*
1647  	 * The interface expects us to write an even number signaling that the
1648  	 * update is finished. Since the guest won't see the intermediate
1649  	 * state, we just increase by 2 at the end.
1650  	 */
1651  	vcpu->hv_clock.version = guest_hv_clock.version + 2;
1652  
1653  	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
1654  	pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
1655  
1656  	if (vcpu->pvclock_set_guest_stopped_request) {
1657  		pvclock_flags |= PVCLOCK_GUEST_STOPPED;
1658  		vcpu->pvclock_set_guest_stopped_request = false;
1659  	}
1660  
1661  	/* If the host uses TSC clocksource, then it is stable */
1662  	if (use_master_clock)
1663  		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
1664  
1665  	vcpu->hv_clock.flags = pvclock_flags;
1666  
1667  	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
1668  
1669  	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
1670  				&vcpu->hv_clock,
1671  				sizeof(vcpu->hv_clock));
1672  	return 0;
1673  }
1674  
1675  /*
1676   * kvmclock updates which are isolated to a given vcpu, such as
1677   * vcpu->cpu migration, should not allow system_timestamp from
1678   * the rest of the vcpus to remain static. Otherwise ntp frequency
1679   * correction applies to one vcpu's system_timestamp but not
1680   * the others.
1681   *
1682   * So in those cases, request a kvmclock update for all vcpus.
1683   * We need to rate-limit these requests though, as they can
1684   * considerably slow guests that have a large number of vcpus.
1685   * The time for a remote vcpu to update its kvmclock is bound
1686   * by the delay we use to rate-limit the updates.
1687   */
1688  
1689  #define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
1690  
1691  static void kvmclock_update_fn(struct work_struct *work)
1692  {
1693  	int i;
1694  	struct delayed_work *dwork = to_delayed_work(work);
1695  	struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
1696  					   kvmclock_update_work);
1697  	struct kvm *kvm = container_of(ka, struct kvm, arch);
1698  	struct kvm_vcpu *vcpu;
1699  
1700  	kvm_for_each_vcpu(i, vcpu, kvm) {
1701  		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
1702  		kvm_vcpu_kick(vcpu);
1703  	}
1704  }
1705  
1706  static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
1707  {
1708  	struct kvm *kvm = v->kvm;
1709  
1710  	kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
1711  	schedule_delayed_work(&kvm->arch.kvmclock_update_work,
1712  					KVMCLOCK_UPDATE_DELAY);
1713  }
1714  
1715  #define KVMCLOCK_SYNC_PERIOD (300 * HZ)
1716  
1717  static void kvmclock_sync_fn(struct work_struct *work)
1718  {
1719  	struct delayed_work *dwork = to_delayed_work(work);
1720  	struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
1721  					   kvmclock_sync_work);
1722  	struct kvm *kvm = container_of(ka, struct kvm, arch);
1723  
1724  	schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
1725  	schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
1726  					KVMCLOCK_SYNC_PERIOD);
1727  }
1728  
1729  static bool msr_mtrr_valid(unsigned msr)
1730  {
1731  	switch (msr) {
1732  	case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
1733  	case MSR_MTRRfix64K_00000:
1734  	case MSR_MTRRfix16K_80000:
1735  	case MSR_MTRRfix16K_A0000:
1736  	case MSR_MTRRfix4K_C0000:
1737  	case MSR_MTRRfix4K_C8000:
1738  	case MSR_MTRRfix4K_D0000:
1739  	case MSR_MTRRfix4K_D8000:
1740  	case MSR_MTRRfix4K_E0000:
1741  	case MSR_MTRRfix4K_E8000:
1742  	case MSR_MTRRfix4K_F0000:
1743  	case MSR_MTRRfix4K_F8000:
1744  	case MSR_MTRRdefType:
1745  	case MSR_IA32_CR_PAT:
1746  		return true;
1747  	case 0x2f8:
1748  		return true;
1749  	}
1750  	return false;
1751  }
1752  
1753  static bool valid_pat_type(unsigned t)
1754  {
1755  	return t < 8 && (1 << t) & 0xf3; /* 0, 1, 4, 5, 6, 7 */
1756  }
1757  
1758  static bool valid_mtrr_type(unsigned t)
1759  {
1760  	return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */
1761  }
1762  
1763  bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1764  {
1765  	int i;
1766  	u64 mask;
1767  
1768  	if (!msr_mtrr_valid(msr))
1769  		return false;
1770  
1771  	if (msr == MSR_IA32_CR_PAT) {
1772  		for (i = 0; i < 8; i++)
1773  			if (!valid_pat_type((data >> (i * 8)) & 0xff))
1774  				return false;
1775  		return true;
1776  	} else if (msr == MSR_MTRRdefType) {
1777  		if (data & ~0xcff)
1778  			return false;
1779  		return valid_mtrr_type(data & 0xff);
1780  	} else if (msr >= MSR_MTRRfix64K_00000 && msr <= MSR_MTRRfix4K_F8000) {
1781  		for (i = 0; i < 8 ; i++)
1782  			if (!valid_mtrr_type((data >> (i * 8)) & 0xff))
1783  				return false;
1784  		return true;
1785  	}
1786  
1787  	/* variable MTRRs */
1788  	WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR));
1789  
1790  	mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
1791  	if ((msr & 1) == 0) {
1792  		/* MTRR base */
1793  		if (!valid_mtrr_type(data & 0xff))
1794  			return false;
1795  		mask |= 0xf00;
1796  	} else
1797  		/* MTRR mask */
1798  		mask |= 0x7ff;
1799  	if (data & mask) {
1800  		kvm_inject_gp(vcpu, 0);
1801  		return false;
1802  	}
1803  
1804  	return true;
1805  }
1806  EXPORT_SYMBOL_GPL(kvm_mtrr_valid);
1807  
1808  static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1809  {
1810  	u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
1811  
1812  	if (!kvm_mtrr_valid(vcpu, msr, data))
1813  		return 1;
1814  
1815  	if (msr == MSR_MTRRdefType) {
1816  		vcpu->arch.mtrr_state.def_type = data;
1817  		vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
1818  	} else if (msr == MSR_MTRRfix64K_00000)
1819  		p[0] = data;
1820  	else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
1821  		p[1 + msr - MSR_MTRRfix16K_80000] = data;
1822  	else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
1823  		p[3 + msr - MSR_MTRRfix4K_C0000] = data;
1824  	else if (msr == MSR_IA32_CR_PAT)
1825  		vcpu->arch.pat = data;
1826  	else {	/* Variable MTRRs */
1827  		int idx, is_mtrr_mask;
1828  		u64 *pt;
1829  
1830  		idx = (msr - 0x200) / 2;
1831  		is_mtrr_mask = msr - 0x200 - 2 * idx;
1832  		if (!is_mtrr_mask)
1833  			pt =
1834  			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
1835  		else
1836  			pt =
1837  			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
1838  		*pt = data;
1839  	}
1840  
1841  	kvm_mmu_reset_context(vcpu);
1842  	return 0;
1843  }
1844  
1845  static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1846  {
1847  	u64 mcg_cap = vcpu->arch.mcg_cap;
1848  	unsigned bank_num = mcg_cap & 0xff;
1849  
1850  	switch (msr) {
1851  	case MSR_IA32_MCG_STATUS:
1852  		vcpu->arch.mcg_status = data;
1853  		break;
1854  	case MSR_IA32_MCG_CTL:
1855  		if (!(mcg_cap & MCG_CTL_P))
1856  			return 1;
1857  		if (data != 0 && data != ~(u64)0)
1858  			return -1;
1859  		vcpu->arch.mcg_ctl = data;
1860  		break;
1861  	default:
1862  		if (msr >= MSR_IA32_MC0_CTL &&
1863  		    msr < MSR_IA32_MCx_CTL(bank_num)) {
1864  			u32 offset = msr - MSR_IA32_MC0_CTL;
1865  			/* only 0 or all 1s can be written to IA32_MCi_CTL
1866  			 * some Linux kernels though clear bit 10 in bank 4 to
1867  			 * workaround a BIOS/GART TBL issue on AMD K8s, ignore
1868  			 * this to avoid an uncatched #GP in the guest
1869  			 */
1870  			if ((offset & 0x3) == 0 &&
1871  			    data != 0 && (data | (1 << 10)) != ~(u64)0)
1872  				return -1;
1873  			vcpu->arch.mce_banks[offset] = data;
1874  			break;
1875  		}
1876  		return 1;
1877  	}
1878  	return 0;
1879  }
1880  
1881  static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
1882  {
1883  	struct kvm *kvm = vcpu->kvm;
1884  	int lm = is_long_mode(vcpu);
1885  	u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
1886  		: (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
1887  	u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
1888  		: kvm->arch.xen_hvm_config.blob_size_32;
1889  	u32 page_num = data & ~PAGE_MASK;
1890  	u64 page_addr = data & PAGE_MASK;
1891  	u8 *page;
1892  	int r;
1893  
1894  	r = -E2BIG;
1895  	if (page_num >= blob_size)
1896  		goto out;
1897  	r = -ENOMEM;
1898  	page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
1899  	if (IS_ERR(page)) {
1900  		r = PTR_ERR(page);
1901  		goto out;
1902  	}
1903  	if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
1904  		goto out_free;
1905  	r = 0;
1906  out_free:
1907  	kfree(page);
1908  out:
1909  	return r;
1910  }
1911  
1912  static bool kvm_hv_hypercall_enabled(struct kvm *kvm)
1913  {
1914  	return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
1915  }
1916  
1917  static bool kvm_hv_msr_partition_wide(u32 msr)
1918  {
1919  	bool r = false;
1920  	switch (msr) {
1921  	case HV_X64_MSR_GUEST_OS_ID:
1922  	case HV_X64_MSR_HYPERCALL:
1923  	case HV_X64_MSR_REFERENCE_TSC:
1924  	case HV_X64_MSR_TIME_REF_COUNT:
1925  		r = true;
1926  		break;
1927  	}
1928  
1929  	return r;
1930  }
1931  
1932  static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1933  {
1934  	struct kvm *kvm = vcpu->kvm;
1935  
1936  	switch (msr) {
1937  	case HV_X64_MSR_GUEST_OS_ID:
1938  		kvm->arch.hv_guest_os_id = data;
1939  		/* setting guest os id to zero disables hypercall page */
1940  		if (!kvm->arch.hv_guest_os_id)
1941  			kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
1942  		break;
1943  	case HV_X64_MSR_HYPERCALL: {
1944  		u64 gfn;
1945  		unsigned long addr;
1946  		u8 instructions[4];
1947  
1948  		/* if guest os id is not set hypercall should remain disabled */
1949  		if (!kvm->arch.hv_guest_os_id)
1950  			break;
1951  		if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
1952  			kvm->arch.hv_hypercall = data;
1953  			break;
1954  		}
1955  		gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
1956  		addr = gfn_to_hva(kvm, gfn);
1957  		if (kvm_is_error_hva(addr))
1958  			return 1;
1959  		kvm_x86_ops->patch_hypercall(vcpu, instructions);
1960  		((unsigned char *)instructions)[3] = 0xc3; /* ret */
1961  		if (__copy_to_user((void __user *)addr, instructions, 4))
1962  			return 1;
1963  		kvm->arch.hv_hypercall = data;
1964  		mark_page_dirty(kvm, gfn);
1965  		break;
1966  	}
1967  	case HV_X64_MSR_REFERENCE_TSC: {
1968  		u64 gfn;
1969  		HV_REFERENCE_TSC_PAGE tsc_ref;
1970  		memset(&tsc_ref, 0, sizeof(tsc_ref));
1971  		kvm->arch.hv_tsc_page = data;
1972  		if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
1973  			break;
1974  		gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
1975  		if (kvm_write_guest(kvm, gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
1976  			&tsc_ref, sizeof(tsc_ref)))
1977  			return 1;
1978  		mark_page_dirty(kvm, gfn);
1979  		break;
1980  	}
1981  	default:
1982  		vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1983  			    "data 0x%llx\n", msr, data);
1984  		return 1;
1985  	}
1986  	return 0;
1987  }
1988  
1989  static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1990  {
1991  	switch (msr) {
1992  	case HV_X64_MSR_APIC_ASSIST_PAGE: {
1993  		u64 gfn;
1994  		unsigned long addr;
1995  
1996  		if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
1997  			vcpu->arch.hv_vapic = data;
1998  			if (kvm_lapic_enable_pv_eoi(vcpu, 0))
1999  				return 1;
2000  			break;
2001  		}
2002  		gfn = data >> HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT;
2003  		addr = gfn_to_hva(vcpu->kvm, gfn);
2004  		if (kvm_is_error_hva(addr))
2005  			return 1;
2006  		if (__clear_user((void __user *)addr, PAGE_SIZE))
2007  			return 1;
2008  		vcpu->arch.hv_vapic = data;
2009  		mark_page_dirty(vcpu->kvm, gfn);
2010  		if (kvm_lapic_enable_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED))
2011  			return 1;
2012  		break;
2013  	}
2014  	case HV_X64_MSR_EOI:
2015  		return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
2016  	case HV_X64_MSR_ICR:
2017  		return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
2018  	case HV_X64_MSR_TPR:
2019  		return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
2020  	default:
2021  		vcpu_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
2022  			    "data 0x%llx\n", msr, data);
2023  		return 1;
2024  	}
2025  
2026  	return 0;
2027  }
2028  
2029  static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
2030  {
2031  	gpa_t gpa = data & ~0x3f;
2032  
2033  	/* Bits 2:5 are reserved, Should be zero */
2034  	if (data & 0x3c)
2035  		return 1;
2036  
2037  	vcpu->arch.apf.msr_val = data;
2038  
2039  	if (!(data & KVM_ASYNC_PF_ENABLED)) {
2040  		kvm_clear_async_pf_completion_queue(vcpu);
2041  		kvm_async_pf_hash_reset(vcpu);
2042  		return 0;
2043  	}
2044  
2045  	if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
2046  					sizeof(u32)))
2047  		return 1;
2048  
2049  	vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
2050  	kvm_async_pf_wakeup_all(vcpu);
2051  	return 0;
2052  }
2053  
2054  static void kvmclock_reset(struct kvm_vcpu *vcpu)
2055  {
2056  	vcpu->arch.pv_time_enabled = false;
2057  }
2058  
2059  static void accumulate_steal_time(struct kvm_vcpu *vcpu)
2060  {
2061  	u64 delta;
2062  
2063  	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
2064  		return;
2065  
2066  	delta = current->sched_info.run_delay - vcpu->arch.st.last_steal;
2067  	vcpu->arch.st.last_steal = current->sched_info.run_delay;
2068  	vcpu->arch.st.accum_steal = delta;
2069  }
2070  
2071  static void record_steal_time(struct kvm_vcpu *vcpu)
2072  {
2073  	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
2074  		return;
2075  
2076  	if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2077  		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
2078  		return;
2079  
2080  	vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal;
2081  	vcpu->arch.st.steal.version += 2;
2082  	vcpu->arch.st.accum_steal = 0;
2083  
2084  	kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
2085  		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
2086  }
2087  
2088  int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
2089  {
2090  	bool pr = false;
2091  	u32 msr = msr_info->index;
2092  	u64 data = msr_info->data;
2093  
2094  	switch (msr) {
2095  	case MSR_AMD64_NB_CFG:
2096  	case MSR_IA32_UCODE_REV:
2097  	case MSR_IA32_UCODE_WRITE:
2098  	case MSR_VM_HSAVE_PA:
2099  	case MSR_AMD64_PATCH_LOADER:
2100  	case MSR_AMD64_BU_CFG2:
2101  		break;
2102  
2103  	case MSR_EFER:
2104  		return set_efer(vcpu, data);
2105  	case MSR_K7_HWCR:
2106  		data &= ~(u64)0x40;	/* ignore flush filter disable */
2107  		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
2108  		data &= ~(u64)0x8;	/* ignore TLB cache disable */
2109  		data &= ~(u64)0x40000;  /* ignore Mc status write enable */
2110  		if (data != 0) {
2111  			vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
2112  				    data);
2113  			return 1;
2114  		}
2115  		break;
2116  	case MSR_FAM10H_MMIO_CONF_BASE:
2117  		if (data != 0) {
2118  			vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
2119  				    "0x%llx\n", data);
2120  			return 1;
2121  		}
2122  		break;
2123  	case MSR_IA32_DEBUGCTLMSR:
2124  		if (!data) {
2125  			/* We support the non-activated case already */
2126  			break;
2127  		} else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
2128  			/* Values other than LBR and BTF are vendor-specific,
2129  			   thus reserved and should throw a #GP */
2130  			return 1;
2131  		}
2132  		vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
2133  			    __func__, data);
2134  		break;
2135  	case 0x200 ... 0x2ff:
2136  		return set_msr_mtrr(vcpu, msr, data);
2137  	case MSR_IA32_APICBASE:
2138  		return kvm_set_apic_base(vcpu, msr_info);
2139  	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
2140  		return kvm_x2apic_msr_write(vcpu, msr, data);
2141  	case MSR_IA32_TSCDEADLINE:
2142  		kvm_set_lapic_tscdeadline_msr(vcpu, data);
2143  		break;
2144  	case MSR_IA32_TSC_ADJUST:
2145  		if (guest_cpuid_has_tsc_adjust(vcpu)) {
2146  			if (!msr_info->host_initiated) {
2147  				s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
2148  				kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true);
2149  			}
2150  			vcpu->arch.ia32_tsc_adjust_msr = data;
2151  		}
2152  		break;
2153  	case MSR_IA32_MISC_ENABLE:
2154  		vcpu->arch.ia32_misc_enable_msr = data;
2155  		break;
2156  	case MSR_KVM_WALL_CLOCK_NEW:
2157  	case MSR_KVM_WALL_CLOCK:
2158  		vcpu->kvm->arch.wall_clock = data;
2159  		kvm_write_wall_clock(vcpu->kvm, data);
2160  		break;
2161  	case MSR_KVM_SYSTEM_TIME_NEW:
2162  	case MSR_KVM_SYSTEM_TIME: {
2163  		u64 gpa_offset;
2164  		kvmclock_reset(vcpu);
2165  
2166  		vcpu->arch.time = data;
2167  		kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2168  
2169  		/* we verify if the enable bit is set... */
2170  		if (!(data & 1))
2171  			break;
2172  
2173  		gpa_offset = data & ~(PAGE_MASK | 1);
2174  
2175  		if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
2176  		     &vcpu->arch.pv_time, data & ~1ULL,
2177  		     sizeof(struct pvclock_vcpu_time_info)))
2178  			vcpu->arch.pv_time_enabled = false;
2179  		else
2180  			vcpu->arch.pv_time_enabled = true;
2181  
2182  		break;
2183  	}
2184  	case MSR_KVM_ASYNC_PF_EN:
2185  		if (kvm_pv_enable_async_pf(vcpu, data))
2186  			return 1;
2187  		break;
2188  	case MSR_KVM_STEAL_TIME:
2189  
2190  		if (unlikely(!sched_info_on()))
2191  			return 1;
2192  
2193  		if (data & KVM_STEAL_RESERVED_MASK)
2194  			return 1;
2195  
2196  		if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
2197  						data & KVM_STEAL_VALID_BITS,
2198  						sizeof(struct kvm_steal_time)))
2199  			return 1;
2200  
2201  		vcpu->arch.st.msr_val = data;
2202  
2203  		if (!(data & KVM_MSR_ENABLED))
2204  			break;
2205  
2206  		vcpu->arch.st.last_steal = current->sched_info.run_delay;
2207  
2208  		preempt_disable();
2209  		accumulate_steal_time(vcpu);
2210  		preempt_enable();
2211  
2212  		kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2213  
2214  		break;
2215  	case MSR_KVM_PV_EOI_EN:
2216  		if (kvm_lapic_enable_pv_eoi(vcpu, data))
2217  			return 1;
2218  		break;
2219  
2220  	case MSR_IA32_MCG_CTL:
2221  	case MSR_IA32_MCG_STATUS:
2222  	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2223  		return set_msr_mce(vcpu, msr, data);
2224  
2225  	/* Performance counters are not protected by a CPUID bit,
2226  	 * so we should check all of them in the generic path for the sake of
2227  	 * cross vendor migration.
2228  	 * Writing a zero into the event select MSRs disables them,
2229  	 * which we perfectly emulate ;-). Any other value should be at least
2230  	 * reported, some guests depend on them.
2231  	 */
2232  	case MSR_K7_EVNTSEL0:
2233  	case MSR_K7_EVNTSEL1:
2234  	case MSR_K7_EVNTSEL2:
2235  	case MSR_K7_EVNTSEL3:
2236  		if (data != 0)
2237  			vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
2238  				    "0x%x data 0x%llx\n", msr, data);
2239  		break;
2240  	/* at least RHEL 4 unconditionally writes to the perfctr registers,
2241  	 * so we ignore writes to make it happy.
2242  	 */
2243  	case MSR_K7_PERFCTR0:
2244  	case MSR_K7_PERFCTR1:
2245  	case MSR_K7_PERFCTR2:
2246  	case MSR_K7_PERFCTR3:
2247  		vcpu_unimpl(vcpu, "unimplemented perfctr wrmsr: "
2248  			    "0x%x data 0x%llx\n", msr, data);
2249  		break;
2250  	case MSR_P6_PERFCTR0:
2251  	case MSR_P6_PERFCTR1:
2252  		pr = true;
2253  	case MSR_P6_EVNTSEL0:
2254  	case MSR_P6_EVNTSEL1:
2255  		if (kvm_pmu_msr(vcpu, msr))
2256  			return kvm_pmu_set_msr(vcpu, msr_info);
2257  
2258  		if (pr || data != 0)
2259  			vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
2260  				    "0x%x data 0x%llx\n", msr, data);
2261  		break;
2262  	case MSR_K7_CLK_CTL:
2263  		/*
2264  		 * Ignore all writes to this no longer documented MSR.
2265  		 * Writes are only relevant for old K7 processors,
2266  		 * all pre-dating SVM, but a recommended workaround from
2267  		 * AMD for these chips. It is possible to specify the
2268  		 * affected processor models on the command line, hence
2269  		 * the need to ignore the workaround.
2270  		 */
2271  		break;
2272  	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
2273  		if (kvm_hv_msr_partition_wide(msr)) {
2274  			int r;
2275  			mutex_lock(&vcpu->kvm->lock);
2276  			r = set_msr_hyperv_pw(vcpu, msr, data);
2277  			mutex_unlock(&vcpu->kvm->lock);
2278  			return r;
2279  		} else
2280  			return set_msr_hyperv(vcpu, msr, data);
2281  		break;
2282  	case MSR_IA32_BBL_CR_CTL3:
2283  		/* Drop writes to this legacy MSR -- see rdmsr
2284  		 * counterpart for further detail.
2285  		 */
2286  		vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
2287  		break;
2288  	case MSR_AMD64_OSVW_ID_LENGTH:
2289  		if (!guest_cpuid_has_osvw(vcpu))
2290  			return 1;
2291  		vcpu->arch.osvw.length = data;
2292  		break;
2293  	case MSR_AMD64_OSVW_STATUS:
2294  		if (!guest_cpuid_has_osvw(vcpu))
2295  			return 1;
2296  		vcpu->arch.osvw.status = data;
2297  		break;
2298  	default:
2299  		if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
2300  			return xen_hvm_config(vcpu, data);
2301  		if (kvm_pmu_msr(vcpu, msr))
2302  			return kvm_pmu_set_msr(vcpu, msr_info);
2303  		if (!ignore_msrs) {
2304  			vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
2305  				    msr, data);
2306  			return 1;
2307  		} else {
2308  			vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n",
2309  				    msr, data);
2310  			break;
2311  		}
2312  	}
2313  	return 0;
2314  }
2315  EXPORT_SYMBOL_GPL(kvm_set_msr_common);
2316  
2317  
2318  /*
2319   * Reads an msr value (of 'msr_index') into 'pdata'.
2320   * Returns 0 on success, non-0 otherwise.
2321   * Assumes vcpu_load() was already called.
2322   */
2323  int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
2324  {
2325  	return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
2326  }
2327  
2328  static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2329  {
2330  	u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
2331  
2332  	if (!msr_mtrr_valid(msr))
2333  		return 1;
2334  
2335  	if (msr == MSR_MTRRdefType)
2336  		*pdata = vcpu->arch.mtrr_state.def_type +
2337  			 (vcpu->arch.mtrr_state.enabled << 10);
2338  	else if (msr == MSR_MTRRfix64K_00000)
2339  		*pdata = p[0];
2340  	else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
2341  		*pdata = p[1 + msr - MSR_MTRRfix16K_80000];
2342  	else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
2343  		*pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
2344  	else if (msr == MSR_IA32_CR_PAT)
2345  		*pdata = vcpu->arch.pat;
2346  	else {	/* Variable MTRRs */
2347  		int idx, is_mtrr_mask;
2348  		u64 *pt;
2349  
2350  		idx = (msr - 0x200) / 2;
2351  		is_mtrr_mask = msr - 0x200 - 2 * idx;
2352  		if (!is_mtrr_mask)
2353  			pt =
2354  			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
2355  		else
2356  			pt =
2357  			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
2358  		*pdata = *pt;
2359  	}
2360  
2361  	return 0;
2362  }
2363  
2364  static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2365  {
2366  	u64 data;
2367  	u64 mcg_cap = vcpu->arch.mcg_cap;
2368  	unsigned bank_num = mcg_cap & 0xff;
2369  
2370  	switch (msr) {
2371  	case MSR_IA32_P5_MC_ADDR:
2372  	case MSR_IA32_P5_MC_TYPE:
2373  		data = 0;
2374  		break;
2375  	case MSR_IA32_MCG_CAP:
2376  		data = vcpu->arch.mcg_cap;
2377  		break;
2378  	case MSR_IA32_MCG_CTL:
2379  		if (!(mcg_cap & MCG_CTL_P))
2380  			return 1;
2381  		data = vcpu->arch.mcg_ctl;
2382  		break;
2383  	case MSR_IA32_MCG_STATUS:
2384  		data = vcpu->arch.mcg_status;
2385  		break;
2386  	default:
2387  		if (msr >= MSR_IA32_MC0_CTL &&
2388  		    msr < MSR_IA32_MCx_CTL(bank_num)) {
2389  			u32 offset = msr - MSR_IA32_MC0_CTL;
2390  			data = vcpu->arch.mce_banks[offset];
2391  			break;
2392  		}
2393  		return 1;
2394  	}
2395  	*pdata = data;
2396  	return 0;
2397  }
2398  
2399  static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2400  {
2401  	u64 data = 0;
2402  	struct kvm *kvm = vcpu->kvm;
2403  
2404  	switch (msr) {
2405  	case HV_X64_MSR_GUEST_OS_ID:
2406  		data = kvm->arch.hv_guest_os_id;
2407  		break;
2408  	case HV_X64_MSR_HYPERCALL:
2409  		data = kvm->arch.hv_hypercall;
2410  		break;
2411  	case HV_X64_MSR_TIME_REF_COUNT: {
2412  		data =
2413  		     div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
2414  		break;
2415  	}
2416  	case HV_X64_MSR_REFERENCE_TSC:
2417  		data = kvm->arch.hv_tsc_page;
2418  		break;
2419  	default:
2420  		vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
2421  		return 1;
2422  	}
2423  
2424  	*pdata = data;
2425  	return 0;
2426  }
2427  
2428  static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2429  {
2430  	u64 data = 0;
2431  
2432  	switch (msr) {
2433  	case HV_X64_MSR_VP_INDEX: {
2434  		int r;
2435  		struct kvm_vcpu *v;
2436  		kvm_for_each_vcpu(r, v, vcpu->kvm) {
2437  			if (v == vcpu) {
2438  				data = r;
2439  				break;
2440  			}
2441  		}
2442  		break;
2443  	}
2444  	case HV_X64_MSR_EOI:
2445  		return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
2446  	case HV_X64_MSR_ICR:
2447  		return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
2448  	case HV_X64_MSR_TPR:
2449  		return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
2450  	case HV_X64_MSR_APIC_ASSIST_PAGE:
2451  		data = vcpu->arch.hv_vapic;
2452  		break;
2453  	default:
2454  		vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
2455  		return 1;
2456  	}
2457  	*pdata = data;
2458  	return 0;
2459  }
2460  
2461  int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
2462  {
2463  	u64 data;
2464  
2465  	switch (msr) {
2466  	case MSR_IA32_PLATFORM_ID:
2467  	case MSR_IA32_EBL_CR_POWERON:
2468  	case MSR_IA32_DEBUGCTLMSR:
2469  	case MSR_IA32_LASTBRANCHFROMIP:
2470  	case MSR_IA32_LASTBRANCHTOIP:
2471  	case MSR_IA32_LASTINTFROMIP:
2472  	case MSR_IA32_LASTINTTOIP:
2473  	case MSR_K8_SYSCFG:
2474  	case MSR_K7_HWCR:
2475  	case MSR_VM_HSAVE_PA:
2476  	case MSR_K7_EVNTSEL0:
2477  	case MSR_K7_EVNTSEL1:
2478  	case MSR_K7_EVNTSEL2:
2479  	case MSR_K7_EVNTSEL3:
2480  	case MSR_K7_PERFCTR0:
2481  	case MSR_K7_PERFCTR1:
2482  	case MSR_K7_PERFCTR2:
2483  	case MSR_K7_PERFCTR3:
2484  	case MSR_K8_INT_PENDING_MSG:
2485  	case MSR_AMD64_NB_CFG:
2486  	case MSR_FAM10H_MMIO_CONF_BASE:
2487  	case MSR_AMD64_BU_CFG2:
2488  		data = 0;
2489  		break;
2490  	case MSR_P6_PERFCTR0:
2491  	case MSR_P6_PERFCTR1:
2492  	case MSR_P6_EVNTSEL0:
2493  	case MSR_P6_EVNTSEL1:
2494  		if (kvm_pmu_msr(vcpu, msr))
2495  			return kvm_pmu_get_msr(vcpu, msr, pdata);
2496  		data = 0;
2497  		break;
2498  	case MSR_IA32_UCODE_REV:
2499  		data = 0x100000000ULL;
2500  		break;
2501  	case MSR_MTRRcap:
2502  		data = 0x500 | KVM_NR_VAR_MTRR;
2503  		break;
2504  	case 0x200 ... 0x2ff:
2505  		return get_msr_mtrr(vcpu, msr, pdata);
2506  	case 0xcd: /* fsb frequency */
2507  		data = 3;
2508  		break;
2509  		/*
2510  		 * MSR_EBC_FREQUENCY_ID
2511  		 * Conservative value valid for even the basic CPU models.
2512  		 * Models 0,1: 000 in bits 23:21 indicating a bus speed of
2513  		 * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
2514  		 * and 266MHz for model 3, or 4. Set Core Clock
2515  		 * Frequency to System Bus Frequency Ratio to 1 (bits
2516  		 * 31:24) even though these are only valid for CPU
2517  		 * models > 2, however guests may end up dividing or
2518  		 * multiplying by zero otherwise.
2519  		 */
2520  	case MSR_EBC_FREQUENCY_ID:
2521  		data = 1 << 24;
2522  		break;
2523  	case MSR_IA32_APICBASE:
2524  		data = kvm_get_apic_base(vcpu);
2525  		break;
2526  	case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
2527  		return kvm_x2apic_msr_read(vcpu, msr, pdata);
2528  		break;
2529  	case MSR_IA32_TSCDEADLINE:
2530  		data = kvm_get_lapic_tscdeadline_msr(vcpu);
2531  		break;
2532  	case MSR_IA32_TSC_ADJUST:
2533  		data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
2534  		break;
2535  	case MSR_IA32_MISC_ENABLE:
2536  		data = vcpu->arch.ia32_misc_enable_msr;
2537  		break;
2538  	case MSR_IA32_PERF_STATUS:
2539  		/* TSC increment by tick */
2540  		data = 1000ULL;
2541  		/* CPU multiplier */
2542  		data |= (((uint64_t)4ULL) << 40);
2543  		break;
2544  	case MSR_EFER:
2545  		data = vcpu->arch.efer;
2546  		break;
2547  	case MSR_KVM_WALL_CLOCK:
2548  	case MSR_KVM_WALL_CLOCK_NEW:
2549  		data = vcpu->kvm->arch.wall_clock;
2550  		break;
2551  	case MSR_KVM_SYSTEM_TIME:
2552  	case MSR_KVM_SYSTEM_TIME_NEW:
2553  		data = vcpu->arch.time;
2554  		break;
2555  	case MSR_KVM_ASYNC_PF_EN:
2556  		data = vcpu->arch.apf.msr_val;
2557  		break;
2558  	case MSR_KVM_STEAL_TIME:
2559  		data = vcpu->arch.st.msr_val;
2560  		break;
2561  	case MSR_KVM_PV_EOI_EN:
2562  		data = vcpu->arch.pv_eoi.msr_val;
2563  		break;
2564  	case MSR_IA32_P5_MC_ADDR:
2565  	case MSR_IA32_P5_MC_TYPE:
2566  	case MSR_IA32_MCG_CAP:
2567  	case MSR_IA32_MCG_CTL:
2568  	case MSR_IA32_MCG_STATUS:
2569  	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
2570  		return get_msr_mce(vcpu, msr, pdata);
2571  	case MSR_K7_CLK_CTL:
2572  		/*
2573  		 * Provide expected ramp-up count for K7. All other
2574  		 * are set to zero, indicating minimum divisors for
2575  		 * every field.
2576  		 *
2577  		 * This prevents guest kernels on AMD host with CPU
2578  		 * type 6, model 8 and higher from exploding due to
2579  		 * the rdmsr failing.
2580  		 */
2581  		data = 0x20000000;
2582  		break;
2583  	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
2584  		if (kvm_hv_msr_partition_wide(msr)) {
2585  			int r;
2586  			mutex_lock(&vcpu->kvm->lock);
2587  			r = get_msr_hyperv_pw(vcpu, msr, pdata);
2588  			mutex_unlock(&vcpu->kvm->lock);
2589  			return r;
2590  		} else
2591  			return get_msr_hyperv(vcpu, msr, pdata);
2592  		break;
2593  	case MSR_IA32_BBL_CR_CTL3:
2594  		/* This legacy MSR exists but isn't fully documented in current
2595  		 * silicon.  It is however accessed by winxp in very narrow
2596  		 * scenarios where it sets bit #19, itself documented as
2597  		 * a "reserved" bit.  Best effort attempt to source coherent
2598  		 * read data here should the balance of the register be
2599  		 * interpreted by the guest:
2600  		 *
2601  		 * L2 cache control register 3: 64GB range, 256KB size,
2602  		 * enabled, latency 0x1, configured
2603  		 */
2604  		data = 0xbe702111;
2605  		break;
2606  	case MSR_AMD64_OSVW_ID_LENGTH:
2607  		if (!guest_cpuid_has_osvw(vcpu))
2608  			return 1;
2609  		data = vcpu->arch.osvw.length;
2610  		break;
2611  	case MSR_AMD64_OSVW_STATUS:
2612  		if (!guest_cpuid_has_osvw(vcpu))
2613  			return 1;
2614  		data = vcpu->arch.osvw.status;
2615  		break;
2616  	default:
2617  		if (kvm_pmu_msr(vcpu, msr))
2618  			return kvm_pmu_get_msr(vcpu, msr, pdata);
2619  		if (!ignore_msrs) {
2620  			vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
2621  			return 1;
2622  		} else {
2623  			vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr);
2624  			data = 0;
2625  		}
2626  		break;
2627  	}
2628  	*pdata = data;
2629  	return 0;
2630  }
2631  EXPORT_SYMBOL_GPL(kvm_get_msr_common);
2632  
2633  /*
2634   * Read or write a bunch of msrs. All parameters are kernel addresses.
2635   *
2636   * @return number of msrs set successfully.
2637   */
2638  static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
2639  		    struct kvm_msr_entry *entries,
2640  		    int (*do_msr)(struct kvm_vcpu *vcpu,
2641  				  unsigned index, u64 *data))
2642  {
2643  	int i, idx;
2644  
2645  	idx = srcu_read_lock(&vcpu->kvm->srcu);
2646  	for (i = 0; i < msrs->nmsrs; ++i)
2647  		if (do_msr(vcpu, entries[i].index, &entries[i].data))
2648  			break;
2649  	srcu_read_unlock(&vcpu->kvm->srcu, idx);
2650  
2651  	return i;
2652  }
2653  
2654  /*
2655   * Read or write a bunch of msrs. Parameters are user addresses.
2656   *
2657   * @return number of msrs set successfully.
2658   */
2659  static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
2660  		  int (*do_msr)(struct kvm_vcpu *vcpu,
2661  				unsigned index, u64 *data),
2662  		  int writeback)
2663  {
2664  	struct kvm_msrs msrs;
2665  	struct kvm_msr_entry *entries;
2666  	int r, n;
2667  	unsigned size;
2668  
2669  	r = -EFAULT;
2670  	if (copy_from_user(&msrs, user_msrs, sizeof msrs))
2671  		goto out;
2672  
2673  	r = -E2BIG;
2674  	if (msrs.nmsrs >= MAX_IO_MSRS)
2675  		goto out;
2676  
2677  	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
2678  	entries = memdup_user(user_msrs->entries, size);
2679  	if (IS_ERR(entries)) {
2680  		r = PTR_ERR(entries);
2681  		goto out;
2682  	}
2683  
2684  	r = n = __msr_io(vcpu, &msrs, entries, do_msr);
2685  	if (r < 0)
2686  		goto out_free;
2687  
2688  	r = -EFAULT;
2689  	if (writeback && copy_to_user(user_msrs->entries, entries, size))
2690  		goto out_free;
2691  
2692  	r = n;
2693  
2694  out_free:
2695  	kfree(entries);
2696  out:
2697  	return r;
2698  }
2699  
2700  int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
2701  {
2702  	int r;
2703  
2704  	switch (ext) {
2705  	case KVM_CAP_IRQCHIP:
2706  	case KVM_CAP_HLT:
2707  	case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
2708  	case KVM_CAP_SET_TSS_ADDR:
2709  	case KVM_CAP_EXT_CPUID:
2710  	case KVM_CAP_EXT_EMUL_CPUID:
2711  	case KVM_CAP_CLOCKSOURCE:
2712  	case KVM_CAP_PIT:
2713  	case KVM_CAP_NOP_IO_DELAY:
2714  	case KVM_CAP_MP_STATE:
2715  	case KVM_CAP_SYNC_MMU:
2716  	case KVM_CAP_USER_NMI:
2717  	case KVM_CAP_REINJECT_CONTROL:
2718  	case KVM_CAP_IRQ_INJECT_STATUS:
2719  	case KVM_CAP_IRQFD:
2720  	case KVM_CAP_IOEVENTFD:
2721  	case KVM_CAP_IOEVENTFD_NO_LENGTH:
2722  	case KVM_CAP_PIT2:
2723  	case KVM_CAP_PIT_STATE2:
2724  	case KVM_CAP_SET_IDENTITY_MAP_ADDR:
2725  	case KVM_CAP_XEN_HVM:
2726  	case KVM_CAP_ADJUST_CLOCK:
2727  	case KVM_CAP_VCPU_EVENTS:
2728  	case KVM_CAP_HYPERV:
2729  	case KVM_CAP_HYPERV_VAPIC:
2730  	case KVM_CAP_HYPERV_SPIN:
2731  	case KVM_CAP_PCI_SEGMENT:
2732  	case KVM_CAP_DEBUGREGS:
2733  	case KVM_CAP_X86_ROBUST_SINGLESTEP:
2734  	case KVM_CAP_XSAVE:
2735  	case KVM_CAP_ASYNC_PF:
2736  	case KVM_CAP_GET_TSC_KHZ:
2737  	case KVM_CAP_KVMCLOCK_CTRL:
2738  	case KVM_CAP_READONLY_MEM:
2739  	case KVM_CAP_HYPERV_TIME:
2740  	case KVM_CAP_IOAPIC_POLARITY_IGNORED:
2741  #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
2742  	case KVM_CAP_ASSIGN_DEV_IRQ:
2743  	case KVM_CAP_PCI_2_3:
2744  #endif
2745  		r = 1;
2746  		break;
2747  	case KVM_CAP_COALESCED_MMIO:
2748  		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
2749  		break;
2750  	case KVM_CAP_VAPIC:
2751  		r = !kvm_x86_ops->cpu_has_accelerated_tpr();
2752  		break;
2753  	case KVM_CAP_NR_VCPUS:
2754  		r = KVM_SOFT_MAX_VCPUS;
2755  		break;
2756  	case KVM_CAP_MAX_VCPUS:
2757  		r = KVM_MAX_VCPUS;
2758  		break;
2759  	case KVM_CAP_NR_MEMSLOTS:
2760  		r = KVM_USER_MEM_SLOTS;
2761  		break;
2762  	case KVM_CAP_PV_MMU:	/* obsolete */
2763  		r = 0;
2764  		break;
2765  #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
2766  	case KVM_CAP_IOMMU:
2767  		r = iommu_present(&pci_bus_type);
2768  		break;
2769  #endif
2770  	case KVM_CAP_MCE:
2771  		r = KVM_MAX_MCE_BANKS;
2772  		break;
2773  	case KVM_CAP_XCRS:
2774  		r = cpu_has_xsave;
2775  		break;
2776  	case KVM_CAP_TSC_CONTROL:
2777  		r = kvm_has_tsc_control;
2778  		break;
2779  	case KVM_CAP_TSC_DEADLINE_TIMER:
2780  		r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER);
2781  		break;
2782  	default:
2783  		r = 0;
2784  		break;
2785  	}
2786  	return r;
2787  
2788  }
2789  
2790  long kvm_arch_dev_ioctl(struct file *filp,
2791  			unsigned int ioctl, unsigned long arg)
2792  {
2793  	void __user *argp = (void __user *)arg;
2794  	long r;
2795  
2796  	switch (ioctl) {
2797  	case KVM_GET_MSR_INDEX_LIST: {
2798  		struct kvm_msr_list __user *user_msr_list = argp;
2799  		struct kvm_msr_list msr_list;
2800  		unsigned n;
2801  
2802  		r = -EFAULT;
2803  		if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
2804  			goto out;
2805  		n = msr_list.nmsrs;
2806  		msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
2807  		if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
2808  			goto out;
2809  		r = -E2BIG;
2810  		if (n < msr_list.nmsrs)
2811  			goto out;
2812  		r = -EFAULT;
2813  		if (copy_to_user(user_msr_list->indices, &msrs_to_save,
2814  				 num_msrs_to_save * sizeof(u32)))
2815  			goto out;
2816  		if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
2817  				 &emulated_msrs,
2818  				 ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
2819  			goto out;
2820  		r = 0;
2821  		break;
2822  	}
2823  	case KVM_GET_SUPPORTED_CPUID:
2824  	case KVM_GET_EMULATED_CPUID: {
2825  		struct kvm_cpuid2 __user *cpuid_arg = argp;
2826  		struct kvm_cpuid2 cpuid;
2827  
2828  		r = -EFAULT;
2829  		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
2830  			goto out;
2831  
2832  		r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
2833  					    ioctl);
2834  		if (r)
2835  			goto out;
2836  
2837  		r = -EFAULT;
2838  		if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
2839  			goto out;
2840  		r = 0;
2841  		break;
2842  	}
2843  	case KVM_X86_GET_MCE_CAP_SUPPORTED: {
2844  		u64 mce_cap;
2845  
2846  		mce_cap = KVM_MCE_CAP_SUPPORTED;
2847  		r = -EFAULT;
2848  		if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
2849  			goto out;
2850  		r = 0;
2851  		break;
2852  	}
2853  	default:
2854  		r = -EINVAL;
2855  	}
2856  out:
2857  	return r;
2858  }
2859  
2860  static void wbinvd_ipi(void *garbage)
2861  {
2862  	wbinvd();
2863  }
2864  
2865  static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
2866  {
2867  	return kvm_arch_has_noncoherent_dma(vcpu->kvm);
2868  }
2869  
2870  void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2871  {
2872  	/* Address WBINVD may be executed by guest */
2873  	if (need_emulate_wbinvd(vcpu)) {
2874  		if (kvm_x86_ops->has_wbinvd_exit())
2875  			cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
2876  		else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
2877  			smp_call_function_single(vcpu->cpu,
2878  					wbinvd_ipi, NULL, 1);
2879  	}
2880  
2881  	kvm_x86_ops->vcpu_load(vcpu, cpu);
2882  
2883  	/* Apply any externally detected TSC adjustments (due to suspend) */
2884  	if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
2885  		adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
2886  		vcpu->arch.tsc_offset_adjustment = 0;
2887  		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2888  	}
2889  
2890  	if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
2891  		s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
2892  				native_read_tsc() - vcpu->arch.last_host_tsc;
2893  		if (tsc_delta < 0)
2894  			mark_tsc_unstable("KVM discovered backwards TSC");
2895  		if (check_tsc_unstable()) {
2896  			u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu,
2897  						vcpu->arch.last_guest_tsc);
2898  			kvm_x86_ops->write_tsc_offset(vcpu, offset);
2899  			vcpu->arch.tsc_catchup = 1;
2900  		}
2901  		/*
2902  		 * On a host with synchronized TSC, there is no need to update
2903  		 * kvmclock on vcpu->cpu migration
2904  		 */
2905  		if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
2906  			kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2907  		if (vcpu->cpu != cpu)
2908  			kvm_migrate_timers(vcpu);
2909  		vcpu->cpu = cpu;
2910  	}
2911  
2912  	accumulate_steal_time(vcpu);
2913  	kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
2914  }
2915  
2916  void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
2917  {
2918  	kvm_x86_ops->vcpu_put(vcpu);
2919  	kvm_put_guest_fpu(vcpu);
2920  	vcpu->arch.last_host_tsc = native_read_tsc();
2921  }
2922  
2923  static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
2924  				    struct kvm_lapic_state *s)
2925  {
2926  	kvm_x86_ops->sync_pir_to_irr(vcpu);
2927  	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
2928  
2929  	return 0;
2930  }
2931  
2932  static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
2933  				    struct kvm_lapic_state *s)
2934  {
2935  	kvm_apic_post_state_restore(vcpu, s);
2936  	update_cr8_intercept(vcpu);
2937  
2938  	return 0;
2939  }
2940  
2941  static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2942  				    struct kvm_interrupt *irq)
2943  {
2944  	if (irq->irq >= KVM_NR_INTERRUPTS)
2945  		return -EINVAL;
2946  	if (irqchip_in_kernel(vcpu->kvm))
2947  		return -ENXIO;
2948  
2949  	kvm_queue_interrupt(vcpu, irq->irq, false);
2950  	kvm_make_request(KVM_REQ_EVENT, vcpu);
2951  
2952  	return 0;
2953  }
2954  
2955  static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
2956  {
2957  	kvm_inject_nmi(vcpu);
2958  
2959  	return 0;
2960  }
2961  
2962  static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
2963  					   struct kvm_tpr_access_ctl *tac)
2964  {
2965  	if (tac->flags)
2966  		return -EINVAL;
2967  	vcpu->arch.tpr_access_reporting = !!tac->enabled;
2968  	return 0;
2969  }
2970  
2971  static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
2972  					u64 mcg_cap)
2973  {
2974  	int r;
2975  	unsigned bank_num = mcg_cap & 0xff, bank;
2976  
2977  	r = -EINVAL;
2978  	if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
2979  		goto out;
2980  	if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
2981  		goto out;
2982  	r = 0;
2983  	vcpu->arch.mcg_cap = mcg_cap;
2984  	/* Init IA32_MCG_CTL to all 1s */
2985  	if (mcg_cap & MCG_CTL_P)
2986  		vcpu->arch.mcg_ctl = ~(u64)0;
2987  	/* Init IA32_MCi_CTL to all 1s */
2988  	for (bank = 0; bank < bank_num; bank++)
2989  		vcpu->arch.mce_banks[bank*4] = ~(u64)0;
2990  out:
2991  	return r;
2992  }
2993  
2994  static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
2995  				      struct kvm_x86_mce *mce)
2996  {
2997  	u64 mcg_cap = vcpu->arch.mcg_cap;
2998  	unsigned bank_num = mcg_cap & 0xff;
2999  	u64 *banks = vcpu->arch.mce_banks;
3000  
3001  	if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
3002  		return -EINVAL;
3003  	/*
3004  	 * if IA32_MCG_CTL is not all 1s, the uncorrected error
3005  	 * reporting is disabled
3006  	 */
3007  	if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
3008  	    vcpu->arch.mcg_ctl != ~(u64)0)
3009  		return 0;
3010  	banks += 4 * mce->bank;
3011  	/*
3012  	 * if IA32_MCi_CTL is not all 1s, the uncorrected error
3013  	 * reporting is disabled for the bank
3014  	 */
3015  	if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
3016  		return 0;
3017  	if (mce->status & MCI_STATUS_UC) {
3018  		if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
3019  		    !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
3020  			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
3021  			return 0;
3022  		}
3023  		if (banks[1] & MCI_STATUS_VAL)
3024  			mce->status |= MCI_STATUS_OVER;
3025  		banks[2] = mce->addr;
3026  		banks[3] = mce->misc;
3027  		vcpu->arch.mcg_status = mce->mcg_status;
3028  		banks[1] = mce->status;
3029  		kvm_queue_exception(vcpu, MC_VECTOR);
3030  	} else if (!(banks[1] & MCI_STATUS_VAL)
3031  		   || !(banks[1] & MCI_STATUS_UC)) {
3032  		if (banks[1] & MCI_STATUS_VAL)
3033  			mce->status |= MCI_STATUS_OVER;
3034  		banks[2] = mce->addr;
3035  		banks[3] = mce->misc;
3036  		banks[1] = mce->status;
3037  	} else
3038  		banks[1] |= MCI_STATUS_OVER;
3039  	return 0;
3040  }
3041  
3042  static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
3043  					       struct kvm_vcpu_events *events)
3044  {
3045  	process_nmi(vcpu);
3046  	events->exception.injected =
3047  		vcpu->arch.exception.pending &&
3048  		!kvm_exception_is_soft(vcpu->arch.exception.nr);
3049  	events->exception.nr = vcpu->arch.exception.nr;
3050  	events->exception.has_error_code = vcpu->arch.exception.has_error_code;
3051  	events->exception.pad = 0;
3052  	events->exception.error_code = vcpu->arch.exception.error_code;
3053  
3054  	events->interrupt.injected =
3055  		vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft;
3056  	events->interrupt.nr = vcpu->arch.interrupt.nr;
3057  	events->interrupt.soft = 0;
3058  	events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
3059  
3060  	events->nmi.injected = vcpu->arch.nmi_injected;
3061  	events->nmi.pending = vcpu->arch.nmi_pending != 0;
3062  	events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
3063  	events->nmi.pad = 0;
3064  
3065  	events->sipi_vector = 0; /* never valid when reporting to user space */
3066  
3067  	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
3068  			 | KVM_VCPUEVENT_VALID_SHADOW);
3069  	memset(&events->reserved, 0, sizeof(events->reserved));
3070  }
3071  
3072  static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
3073  					      struct kvm_vcpu_events *events)
3074  {
3075  	if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
3076  			      | KVM_VCPUEVENT_VALID_SIPI_VECTOR
3077  			      | KVM_VCPUEVENT_VALID_SHADOW))
3078  		return -EINVAL;
3079  
3080  	process_nmi(vcpu);
3081  	vcpu->arch.exception.pending = events->exception.injected;
3082  	vcpu->arch.exception.nr = events->exception.nr;
3083  	vcpu->arch.exception.has_error_code = events->exception.has_error_code;
3084  	vcpu->arch.exception.error_code = events->exception.error_code;
3085  
3086  	vcpu->arch.interrupt.pending = events->interrupt.injected;
3087  	vcpu->arch.interrupt.nr = events->interrupt.nr;
3088  	vcpu->arch.interrupt.soft = events->interrupt.soft;
3089  	if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
3090  		kvm_x86_ops->set_interrupt_shadow(vcpu,
3091  						  events->interrupt.shadow);
3092  
3093  	vcpu->arch.nmi_injected = events->nmi.injected;
3094  	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
3095  		vcpu->arch.nmi_pending = events->nmi.pending;
3096  	kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
3097  
3098  	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
3099  	    kvm_vcpu_has_lapic(vcpu))
3100  		vcpu->arch.apic->sipi_vector = events->sipi_vector;
3101  
3102  	kvm_make_request(KVM_REQ_EVENT, vcpu);
3103  
3104  	return 0;
3105  }
3106  
3107  static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
3108  					     struct kvm_debugregs *dbgregs)
3109  {
3110  	unsigned long val;
3111  
3112  	memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
3113  	kvm_get_dr(vcpu, 6, &val);
3114  	dbgregs->dr6 = val;
3115  	dbgregs->dr7 = vcpu->arch.dr7;
3116  	dbgregs->flags = 0;
3117  	memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
3118  }
3119  
3120  static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
3121  					    struct kvm_debugregs *dbgregs)
3122  {
3123  	if (dbgregs->flags)
3124  		return -EINVAL;
3125  
3126  	memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
3127  	vcpu->arch.dr6 = dbgregs->dr6;
3128  	kvm_update_dr6(vcpu);
3129  	vcpu->arch.dr7 = dbgregs->dr7;
3130  	kvm_update_dr7(vcpu);
3131  
3132  	return 0;
3133  }
3134  
3135  #define XSTATE_COMPACTION_ENABLED (1ULL << 63)
3136  
3137  static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
3138  {
3139  	struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave;
3140  	u64 xstate_bv = xsave->xsave_hdr.xstate_bv;
3141  	u64 valid;
3142  
3143  	/*
3144  	 * Copy legacy XSAVE area, to avoid complications with CPUID
3145  	 * leaves 0 and 1 in the loop below.
3146  	 */
3147  	memcpy(dest, xsave, XSAVE_HDR_OFFSET);
3148  
3149  	/* Set XSTATE_BV */
3150  	*(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv;
3151  
3152  	/*
3153  	 * Copy each region from the possibly compacted offset to the
3154  	 * non-compacted offset.
3155  	 */
3156  	valid = xstate_bv & ~XSTATE_FPSSE;
3157  	while (valid) {
3158  		u64 feature = valid & -valid;
3159  		int index = fls64(feature) - 1;
3160  		void *src = get_xsave_addr(xsave, feature);
3161  
3162  		if (src) {
3163  			u32 size, offset, ecx, edx;
3164  			cpuid_count(XSTATE_CPUID, index,
3165  				    &size, &offset, &ecx, &edx);
3166  			memcpy(dest + offset, src, size);
3167  		}
3168  
3169  		valid -= feature;
3170  	}
3171  }
3172  
3173  static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
3174  {
3175  	struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave;
3176  	u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
3177  	u64 valid;
3178  
3179  	/*
3180  	 * Copy legacy XSAVE area, to avoid complications with CPUID
3181  	 * leaves 0 and 1 in the loop below.
3182  	 */
3183  	memcpy(xsave, src, XSAVE_HDR_OFFSET);
3184  
3185  	/* Set XSTATE_BV and possibly XCOMP_BV.  */
3186  	xsave->xsave_hdr.xstate_bv = xstate_bv;
3187  	if (cpu_has_xsaves)
3188  		xsave->xsave_hdr.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
3189  
3190  	/*
3191  	 * Copy each region from the non-compacted offset to the
3192  	 * possibly compacted offset.
3193  	 */
3194  	valid = xstate_bv & ~XSTATE_FPSSE;
3195  	while (valid) {
3196  		u64 feature = valid & -valid;
3197  		int index = fls64(feature) - 1;
3198  		void *dest = get_xsave_addr(xsave, feature);
3199  
3200  		if (dest) {
3201  			u32 size, offset, ecx, edx;
3202  			cpuid_count(XSTATE_CPUID, index,
3203  				    &size, &offset, &ecx, &edx);
3204  			memcpy(dest, src + offset, size);
3205  		} else
3206  			WARN_ON_ONCE(1);
3207  
3208  		valid -= feature;
3209  	}
3210  }
3211  
3212  static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
3213  					 struct kvm_xsave *guest_xsave)
3214  {
3215  	if (cpu_has_xsave) {
3216  		memset(guest_xsave, 0, sizeof(struct kvm_xsave));
3217  		fill_xsave((u8 *) guest_xsave->region, vcpu);
3218  	} else {
3219  		memcpy(guest_xsave->region,
3220  			&vcpu->arch.guest_fpu.state->fxsave,
3221  			sizeof(struct i387_fxsave_struct));
3222  		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
3223  			XSTATE_FPSSE;
3224  	}
3225  }
3226  
3227  static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
3228  					struct kvm_xsave *guest_xsave)
3229  {
3230  	u64 xstate_bv =
3231  		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
3232  
3233  	if (cpu_has_xsave) {
3234  		/*
3235  		 * Here we allow setting states that are not present in
3236  		 * CPUID leaf 0xD, index 0, EDX:EAX.  This is for compatibility
3237  		 * with old userspace.
3238  		 */
3239  		if (xstate_bv & ~kvm_supported_xcr0())
3240  			return -EINVAL;
3241  		load_xsave(vcpu, (u8 *)guest_xsave->region);
3242  	} else {
3243  		if (xstate_bv & ~XSTATE_FPSSE)
3244  			return -EINVAL;
3245  		memcpy(&vcpu->arch.guest_fpu.state->fxsave,
3246  			guest_xsave->region, sizeof(struct i387_fxsave_struct));
3247  	}
3248  	return 0;
3249  }
3250  
3251  static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
3252  					struct kvm_xcrs *guest_xcrs)
3253  {
3254  	if (!cpu_has_xsave) {
3255  		guest_xcrs->nr_xcrs = 0;
3256  		return;
3257  	}
3258  
3259  	guest_xcrs->nr_xcrs = 1;
3260  	guest_xcrs->flags = 0;
3261  	guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
3262  	guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
3263  }
3264  
3265  static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
3266  				       struct kvm_xcrs *guest_xcrs)
3267  {
3268  	int i, r = 0;
3269  
3270  	if (!cpu_has_xsave)
3271  		return -EINVAL;
3272  
3273  	if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
3274  		return -EINVAL;
3275  
3276  	for (i = 0; i < guest_xcrs->nr_xcrs; i++)
3277  		/* Only support XCR0 currently */
3278  		if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
3279  			r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
3280  				guest_xcrs->xcrs[i].value);
3281  			break;
3282  		}
3283  	if (r)
3284  		r = -EINVAL;
3285  	return r;
3286  }
3287  
3288  /*
3289   * kvm_set_guest_paused() indicates to the guest kernel that it has been
3290   * stopped by the hypervisor.  This function will be called from the host only.
3291   * EINVAL is returned when the host attempts to set the flag for a guest that
3292   * does not support pv clocks.
3293   */
3294  static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
3295  {
3296  	if (!vcpu->arch.pv_time_enabled)
3297  		return -EINVAL;
3298  	vcpu->arch.pvclock_set_guest_stopped_request = true;
3299  	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
3300  	return 0;
3301  }
3302  
3303  long kvm_arch_vcpu_ioctl(struct file *filp,
3304  			 unsigned int ioctl, unsigned long arg)
3305  {
3306  	struct kvm_vcpu *vcpu = filp->private_data;
3307  	void __user *argp = (void __user *)arg;
3308  	int r;
3309  	union {
3310  		struct kvm_lapic_state *lapic;
3311  		struct kvm_xsave *xsave;
3312  		struct kvm_xcrs *xcrs;
3313  		void *buffer;
3314  	} u;
3315  
3316  	u.buffer = NULL;
3317  	switch (ioctl) {
3318  	case KVM_GET_LAPIC: {
3319  		r = -EINVAL;
3320  		if (!vcpu->arch.apic)
3321  			goto out;
3322  		u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
3323  
3324  		r = -ENOMEM;
3325  		if (!u.lapic)
3326  			goto out;
3327  		r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
3328  		if (r)
3329  			goto out;
3330  		r = -EFAULT;
3331  		if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
3332  			goto out;
3333  		r = 0;
3334  		break;
3335  	}
3336  	case KVM_SET_LAPIC: {
3337  		r = -EINVAL;
3338  		if (!vcpu->arch.apic)
3339  			goto out;
3340  		u.lapic = memdup_user(argp, sizeof(*u.lapic));
3341  		if (IS_ERR(u.lapic))
3342  			return PTR_ERR(u.lapic);
3343  
3344  		r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
3345  		break;
3346  	}
3347  	case KVM_INTERRUPT: {
3348  		struct kvm_interrupt irq;
3349  
3350  		r = -EFAULT;
3351  		if (copy_from_user(&irq, argp, sizeof irq))
3352  			goto out;
3353  		r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
3354  		break;
3355  	}
3356  	case KVM_NMI: {
3357  		r = kvm_vcpu_ioctl_nmi(vcpu);
3358  		break;
3359  	}
3360  	case KVM_SET_CPUID: {
3361  		struct kvm_cpuid __user *cpuid_arg = argp;
3362  		struct kvm_cpuid cpuid;
3363  
3364  		r = -EFAULT;
3365  		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3366  			goto out;
3367  		r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
3368  		break;
3369  	}
3370  	case KVM_SET_CPUID2: {
3371  		struct kvm_cpuid2 __user *cpuid_arg = argp;
3372  		struct kvm_cpuid2 cpuid;
3373  
3374  		r = -EFAULT;
3375  		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3376  			goto out;
3377  		r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
3378  					      cpuid_arg->entries);
3379  		break;
3380  	}
3381  	case KVM_GET_CPUID2: {
3382  		struct kvm_cpuid2 __user *cpuid_arg = argp;
3383  		struct kvm_cpuid2 cpuid;
3384  
3385  		r = -EFAULT;
3386  		if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
3387  			goto out;
3388  		r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
3389  					      cpuid_arg->entries);
3390  		if (r)
3391  			goto out;
3392  		r = -EFAULT;
3393  		if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
3394  			goto out;
3395  		r = 0;
3396  		break;
3397  	}
3398  	case KVM_GET_MSRS:
3399  		r = msr_io(vcpu, argp, kvm_get_msr, 1);
3400  		break;
3401  	case KVM_SET_MSRS:
3402  		r = msr_io(vcpu, argp, do_set_msr, 0);
3403  		break;
3404  	case KVM_TPR_ACCESS_REPORTING: {
3405  		struct kvm_tpr_access_ctl tac;
3406  
3407  		r = -EFAULT;
3408  		if (copy_from_user(&tac, argp, sizeof tac))
3409  			goto out;
3410  		r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
3411  		if (r)
3412  			goto out;
3413  		r = -EFAULT;
3414  		if (copy_to_user(argp, &tac, sizeof tac))
3415  			goto out;
3416  		r = 0;
3417  		break;
3418  	};
3419  	case KVM_SET_VAPIC_ADDR: {
3420  		struct kvm_vapic_addr va;
3421  
3422  		r = -EINVAL;
3423  		if (!irqchip_in_kernel(vcpu->kvm))
3424  			goto out;
3425  		r = -EFAULT;
3426  		if (copy_from_user(&va, argp, sizeof va))
3427  			goto out;
3428  		r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
3429  		break;
3430  	}
3431  	case KVM_X86_SETUP_MCE: {
3432  		u64 mcg_cap;
3433  
3434  		r = -EFAULT;
3435  		if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap))
3436  			goto out;
3437  		r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
3438  		break;
3439  	}
3440  	case KVM_X86_SET_MCE: {
3441  		struct kvm_x86_mce mce;
3442  
3443  		r = -EFAULT;
3444  		if (copy_from_user(&mce, argp, sizeof mce))
3445  			goto out;
3446  		r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
3447  		break;
3448  	}
3449  	case KVM_GET_VCPU_EVENTS: {
3450  		struct kvm_vcpu_events events;
3451  
3452  		kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
3453  
3454  		r = -EFAULT;
3455  		if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
3456  			break;
3457  		r = 0;
3458  		break;
3459  	}
3460  	case KVM_SET_VCPU_EVENTS: {
3461  		struct kvm_vcpu_events events;
3462  
3463  		r = -EFAULT;
3464  		if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
3465  			break;
3466  
3467  		r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
3468  		break;
3469  	}
3470  	case KVM_GET_DEBUGREGS: {
3471  		struct kvm_debugregs dbgregs;
3472  
3473  		kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
3474  
3475  		r = -EFAULT;
3476  		if (copy_to_user(argp, &dbgregs,
3477  				 sizeof(struct kvm_debugregs)))
3478  			break;
3479  		r = 0;
3480  		break;
3481  	}
3482  	case KVM_SET_DEBUGREGS: {
3483  		struct kvm_debugregs dbgregs;
3484  
3485  		r = -EFAULT;
3486  		if (copy_from_user(&dbgregs, argp,
3487  				   sizeof(struct kvm_debugregs)))
3488  			break;
3489  
3490  		r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
3491  		break;
3492  	}
3493  	case KVM_GET_XSAVE: {
3494  		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
3495  		r = -ENOMEM;
3496  		if (!u.xsave)
3497  			break;
3498  
3499  		kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
3500  
3501  		r = -EFAULT;
3502  		if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
3503  			break;
3504  		r = 0;
3505  		break;
3506  	}
3507  	case KVM_SET_XSAVE: {
3508  		u.xsave = memdup_user(argp, sizeof(*u.xsave));
3509  		if (IS_ERR(u.xsave))
3510  			return PTR_ERR(u.xsave);
3511  
3512  		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
3513  		break;
3514  	}
3515  	case KVM_GET_XCRS: {
3516  		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
3517  		r = -ENOMEM;
3518  		if (!u.xcrs)
3519  			break;
3520  
3521  		kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
3522  
3523  		r = -EFAULT;
3524  		if (copy_to_user(argp, u.xcrs,
3525  				 sizeof(struct kvm_xcrs)))
3526  			break;
3527  		r = 0;
3528  		break;
3529  	}
3530  	case KVM_SET_XCRS: {
3531  		u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
3532  		if (IS_ERR(u.xcrs))
3533  			return PTR_ERR(u.xcrs);
3534  
3535  		r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
3536  		break;
3537  	}
3538  	case KVM_SET_TSC_KHZ: {
3539  		u32 user_tsc_khz;
3540  
3541  		r = -EINVAL;
3542  		user_tsc_khz = (u32)arg;
3543  
3544  		if (user_tsc_khz >= kvm_max_guest_tsc_khz)
3545  			goto out;
3546  
3547  		if (user_tsc_khz == 0)
3548  			user_tsc_khz = tsc_khz;
3549  
3550  		kvm_set_tsc_khz(vcpu, user_tsc_khz);
3551  
3552  		r = 0;
3553  		goto out;
3554  	}
3555  	case KVM_GET_TSC_KHZ: {
3556  		r = vcpu->arch.virtual_tsc_khz;
3557  		goto out;
3558  	}
3559  	case KVM_KVMCLOCK_CTRL: {
3560  		r = kvm_set_guest_paused(vcpu);
3561  		goto out;
3562  	}
3563  	default:
3564  		r = -EINVAL;
3565  	}
3566  out:
3567  	kfree(u.buffer);
3568  	return r;
3569  }
3570  
3571  int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
3572  {
3573  	return VM_FAULT_SIGBUS;
3574  }
3575  
3576  static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
3577  {
3578  	int ret;
3579  
3580  	if (addr > (unsigned int)(-3 * PAGE_SIZE))
3581  		return -EINVAL;
3582  	ret = kvm_x86_ops->set_tss_addr(kvm, addr);
3583  	return ret;
3584  }
3585  
3586  static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
3587  					      u64 ident_addr)
3588  {
3589  	kvm->arch.ept_identity_map_addr = ident_addr;
3590  	return 0;
3591  }
3592  
3593  static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
3594  					  u32 kvm_nr_mmu_pages)
3595  {
3596  	if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
3597  		return -EINVAL;
3598  
3599  	mutex_lock(&kvm->slots_lock);
3600  
3601  	kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
3602  	kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
3603  
3604  	mutex_unlock(&kvm->slots_lock);
3605  	return 0;
3606  }
3607  
3608  static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
3609  {
3610  	return kvm->arch.n_max_mmu_pages;
3611  }
3612  
3613  static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
3614  {
3615  	int r;
3616  
3617  	r = 0;
3618  	switch (chip->chip_id) {
3619  	case KVM_IRQCHIP_PIC_MASTER:
3620  		memcpy(&chip->chip.pic,
3621  			&pic_irqchip(kvm)->pics[0],
3622  			sizeof(struct kvm_pic_state));
3623  		break;
3624  	case KVM_IRQCHIP_PIC_SLAVE:
3625  		memcpy(&chip->chip.pic,
3626  			&pic_irqchip(kvm)->pics[1],
3627  			sizeof(struct kvm_pic_state));
3628  		break;
3629  	case KVM_IRQCHIP_IOAPIC:
3630  		r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
3631  		break;
3632  	default:
3633  		r = -EINVAL;
3634  		break;
3635  	}
3636  	return r;
3637  }
3638  
3639  static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
3640  {
3641  	int r;
3642  
3643  	r = 0;
3644  	switch (chip->chip_id) {
3645  	case KVM_IRQCHIP_PIC_MASTER:
3646  		spin_lock(&pic_irqchip(kvm)->lock);
3647  		memcpy(&pic_irqchip(kvm)->pics[0],
3648  			&chip->chip.pic,
3649  			sizeof(struct kvm_pic_state));
3650  		spin_unlock(&pic_irqchip(kvm)->lock);
3651  		break;
3652  	case KVM_IRQCHIP_PIC_SLAVE:
3653  		spin_lock(&pic_irqchip(kvm)->lock);
3654  		memcpy(&pic_irqchip(kvm)->pics[1],
3655  			&chip->chip.pic,
3656  			sizeof(struct kvm_pic_state));
3657  		spin_unlock(&pic_irqchip(kvm)->lock);
3658  		break;
3659  	case KVM_IRQCHIP_IOAPIC:
3660  		r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
3661  		break;
3662  	default:
3663  		r = -EINVAL;
3664  		break;
3665  	}
3666  	kvm_pic_update_irq(pic_irqchip(kvm));
3667  	return r;
3668  }
3669  
3670  static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
3671  {
3672  	int r = 0;
3673  
3674  	mutex_lock(&kvm->arch.vpit->pit_state.lock);
3675  	memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
3676  	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3677  	return r;
3678  }
3679  
3680  static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
3681  {
3682  	int r = 0;
3683  
3684  	mutex_lock(&kvm->arch.vpit->pit_state.lock);
3685  	memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
3686  	kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
3687  	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3688  	return r;
3689  }
3690  
3691  static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
3692  {
3693  	int r = 0;
3694  
3695  	mutex_lock(&kvm->arch.vpit->pit_state.lock);
3696  	memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
3697  		sizeof(ps->channels));
3698  	ps->flags = kvm->arch.vpit->pit_state.flags;
3699  	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3700  	memset(&ps->reserved, 0, sizeof(ps->reserved));
3701  	return r;
3702  }
3703  
3704  static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
3705  {
3706  	int r = 0, start = 0;
3707  	u32 prev_legacy, cur_legacy;
3708  	mutex_lock(&kvm->arch.vpit->pit_state.lock);
3709  	prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
3710  	cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
3711  	if (!prev_legacy && cur_legacy)
3712  		start = 1;
3713  	memcpy(&kvm->arch.vpit->pit_state.channels, &ps->channels,
3714  	       sizeof(kvm->arch.vpit->pit_state.channels));
3715  	kvm->arch.vpit->pit_state.flags = ps->flags;
3716  	kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
3717  	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3718  	return r;
3719  }
3720  
3721  static int kvm_vm_ioctl_reinject(struct kvm *kvm,
3722  				 struct kvm_reinject_control *control)
3723  {
3724  	if (!kvm->arch.vpit)
3725  		return -ENXIO;
3726  	mutex_lock(&kvm->arch.vpit->pit_state.lock);
3727  	kvm->arch.vpit->pit_state.reinject = control->pit_reinject;
3728  	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3729  	return 0;
3730  }
3731  
3732  /**
3733   * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
3734   * @kvm: kvm instance
3735   * @log: slot id and address to which we copy the log
3736   *
3737   * We need to keep it in mind that VCPU threads can write to the bitmap
3738   * concurrently.  So, to avoid losing data, we keep the following order for
3739   * each bit:
3740   *
3741   *   1. Take a snapshot of the bit and clear it if needed.
3742   *   2. Write protect the corresponding page.
3743   *   3. Flush TLB's if needed.
3744   *   4. Copy the snapshot to the userspace.
3745   *
3746   * Between 2 and 3, the guest may write to the page using the remaining TLB
3747   * entry.  This is not a problem because the page will be reported dirty at
3748   * step 4 using the snapshot taken before and step 3 ensures that successive
3749   * writes will be logged for the next call.
3750   */
3751  int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
3752  {
3753  	int r;
3754  	struct kvm_memory_slot *memslot;
3755  	unsigned long n, i;
3756  	unsigned long *dirty_bitmap;
3757  	unsigned long *dirty_bitmap_buffer;
3758  	bool is_dirty = false;
3759  
3760  	mutex_lock(&kvm->slots_lock);
3761  
3762  	r = -EINVAL;
3763  	if (log->slot >= KVM_USER_MEM_SLOTS)
3764  		goto out;
3765  
3766  	memslot = id_to_memslot(kvm->memslots, log->slot);
3767  
3768  	dirty_bitmap = memslot->dirty_bitmap;
3769  	r = -ENOENT;
3770  	if (!dirty_bitmap)
3771  		goto out;
3772  
3773  	n = kvm_dirty_bitmap_bytes(memslot);
3774  
3775  	dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
3776  	memset(dirty_bitmap_buffer, 0, n);
3777  
3778  	spin_lock(&kvm->mmu_lock);
3779  
3780  	for (i = 0; i < n / sizeof(long); i++) {
3781  		unsigned long mask;
3782  		gfn_t offset;
3783  
3784  		if (!dirty_bitmap[i])
3785  			continue;
3786  
3787  		is_dirty = true;
3788  
3789  		mask = xchg(&dirty_bitmap[i], 0);
3790  		dirty_bitmap_buffer[i] = mask;
3791  
3792  		offset = i * BITS_PER_LONG;
3793  		kvm_mmu_write_protect_pt_masked(kvm, memslot, offset, mask);
3794  	}
3795  
3796  	spin_unlock(&kvm->mmu_lock);
3797  
3798  	/* See the comments in kvm_mmu_slot_remove_write_access(). */
3799  	lockdep_assert_held(&kvm->slots_lock);
3800  
3801  	/*
3802  	 * All the TLBs can be flushed out of mmu lock, see the comments in
3803  	 * kvm_mmu_slot_remove_write_access().
3804  	 */
3805  	if (is_dirty)
3806  		kvm_flush_remote_tlbs(kvm);
3807  
3808  	r = -EFAULT;
3809  	if (copy_to_user(log->dirty_bitmap, dirty_bitmap_buffer, n))
3810  		goto out;
3811  
3812  	r = 0;
3813  out:
3814  	mutex_unlock(&kvm->slots_lock);
3815  	return r;
3816  }
3817  
3818  int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
3819  			bool line_status)
3820  {
3821  	if (!irqchip_in_kernel(kvm))
3822  		return -ENXIO;
3823  
3824  	irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
3825  					irq_event->irq, irq_event->level,
3826  					line_status);
3827  	return 0;
3828  }
3829  
3830  long kvm_arch_vm_ioctl(struct file *filp,
3831  		       unsigned int ioctl, unsigned long arg)
3832  {
3833  	struct kvm *kvm = filp->private_data;
3834  	void __user *argp = (void __user *)arg;
3835  	int r = -ENOTTY;
3836  	/*
3837  	 * This union makes it completely explicit to gcc-3.x
3838  	 * that these two variables' stack usage should be
3839  	 * combined, not added together.
3840  	 */
3841  	union {
3842  		struct kvm_pit_state ps;
3843  		struct kvm_pit_state2 ps2;
3844  		struct kvm_pit_config pit_config;
3845  	} u;
3846  
3847  	switch (ioctl) {
3848  	case KVM_SET_TSS_ADDR:
3849  		r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
3850  		break;
3851  	case KVM_SET_IDENTITY_MAP_ADDR: {
3852  		u64 ident_addr;
3853  
3854  		r = -EFAULT;
3855  		if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
3856  			goto out;
3857  		r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
3858  		break;
3859  	}
3860  	case KVM_SET_NR_MMU_PAGES:
3861  		r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
3862  		break;
3863  	case KVM_GET_NR_MMU_PAGES:
3864  		r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
3865  		break;
3866  	case KVM_CREATE_IRQCHIP: {
3867  		struct kvm_pic *vpic;
3868  
3869  		mutex_lock(&kvm->lock);
3870  		r = -EEXIST;
3871  		if (kvm->arch.vpic)
3872  			goto create_irqchip_unlock;
3873  		r = -EINVAL;
3874  		if (atomic_read(&kvm->online_vcpus))
3875  			goto create_irqchip_unlock;
3876  		r = -ENOMEM;
3877  		vpic = kvm_create_pic(kvm);
3878  		if (vpic) {
3879  			r = kvm_ioapic_init(kvm);
3880  			if (r) {
3881  				mutex_lock(&kvm->slots_lock);
3882  				kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3883  							  &vpic->dev_master);
3884  				kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3885  							  &vpic->dev_slave);
3886  				kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3887  							  &vpic->dev_eclr);
3888  				mutex_unlock(&kvm->slots_lock);
3889  				kfree(vpic);
3890  				goto create_irqchip_unlock;
3891  			}
3892  		} else
3893  			goto create_irqchip_unlock;
3894  		smp_wmb();
3895  		kvm->arch.vpic = vpic;
3896  		smp_wmb();
3897  		r = kvm_setup_default_irq_routing(kvm);
3898  		if (r) {
3899  			mutex_lock(&kvm->slots_lock);
3900  			mutex_lock(&kvm->irq_lock);
3901  			kvm_ioapic_destroy(kvm);
3902  			kvm_destroy_pic(kvm);
3903  			mutex_unlock(&kvm->irq_lock);
3904  			mutex_unlock(&kvm->slots_lock);
3905  		}
3906  	create_irqchip_unlock:
3907  		mutex_unlock(&kvm->lock);
3908  		break;
3909  	}
3910  	case KVM_CREATE_PIT:
3911  		u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
3912  		goto create_pit;
3913  	case KVM_CREATE_PIT2:
3914  		r = -EFAULT;
3915  		if (copy_from_user(&u.pit_config, argp,
3916  				   sizeof(struct kvm_pit_config)))
3917  			goto out;
3918  	create_pit:
3919  		mutex_lock(&kvm->slots_lock);
3920  		r = -EEXIST;
3921  		if (kvm->arch.vpit)
3922  			goto create_pit_unlock;
3923  		r = -ENOMEM;
3924  		kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
3925  		if (kvm->arch.vpit)
3926  			r = 0;
3927  	create_pit_unlock:
3928  		mutex_unlock(&kvm->slots_lock);
3929  		break;
3930  	case KVM_GET_IRQCHIP: {
3931  		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3932  		struct kvm_irqchip *chip;
3933  
3934  		chip = memdup_user(argp, sizeof(*chip));
3935  		if (IS_ERR(chip)) {
3936  			r = PTR_ERR(chip);
3937  			goto out;
3938  		}
3939  
3940  		r = -ENXIO;
3941  		if (!irqchip_in_kernel(kvm))
3942  			goto get_irqchip_out;
3943  		r = kvm_vm_ioctl_get_irqchip(kvm, chip);
3944  		if (r)
3945  			goto get_irqchip_out;
3946  		r = -EFAULT;
3947  		if (copy_to_user(argp, chip, sizeof *chip))
3948  			goto get_irqchip_out;
3949  		r = 0;
3950  	get_irqchip_out:
3951  		kfree(chip);
3952  		break;
3953  	}
3954  	case KVM_SET_IRQCHIP: {
3955  		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3956  		struct kvm_irqchip *chip;
3957  
3958  		chip = memdup_user(argp, sizeof(*chip));
3959  		if (IS_ERR(chip)) {
3960  			r = PTR_ERR(chip);
3961  			goto out;
3962  		}
3963  
3964  		r = -ENXIO;
3965  		if (!irqchip_in_kernel(kvm))
3966  			goto set_irqchip_out;
3967  		r = kvm_vm_ioctl_set_irqchip(kvm, chip);
3968  		if (r)
3969  			goto set_irqchip_out;
3970  		r = 0;
3971  	set_irqchip_out:
3972  		kfree(chip);
3973  		break;
3974  	}
3975  	case KVM_GET_PIT: {
3976  		r = -EFAULT;
3977  		if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
3978  			goto out;
3979  		r = -ENXIO;
3980  		if (!kvm->arch.vpit)
3981  			goto out;
3982  		r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
3983  		if (r)
3984  			goto out;
3985  		r = -EFAULT;
3986  		if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
3987  			goto out;
3988  		r = 0;
3989  		break;
3990  	}
3991  	case KVM_SET_PIT: {
3992  		r = -EFAULT;
3993  		if (copy_from_user(&u.ps, argp, sizeof u.ps))
3994  			goto out;
3995  		r = -ENXIO;
3996  		if (!kvm->arch.vpit)
3997  			goto out;
3998  		r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
3999  		break;
4000  	}
4001  	case KVM_GET_PIT2: {
4002  		r = -ENXIO;
4003  		if (!kvm->arch.vpit)
4004  			goto out;
4005  		r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
4006  		if (r)
4007  			goto out;
4008  		r = -EFAULT;
4009  		if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
4010  			goto out;
4011  		r = 0;
4012  		break;
4013  	}
4014  	case KVM_SET_PIT2: {
4015  		r = -EFAULT;
4016  		if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
4017  			goto out;
4018  		r = -ENXIO;
4019  		if (!kvm->arch.vpit)
4020  			goto out;
4021  		r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
4022  		break;
4023  	}
4024  	case KVM_REINJECT_CONTROL: {
4025  		struct kvm_reinject_control control;
4026  		r =  -EFAULT;
4027  		if (copy_from_user(&control, argp, sizeof(control)))
4028  			goto out;
4029  		r = kvm_vm_ioctl_reinject(kvm, &control);
4030  		break;
4031  	}
4032  	case KVM_XEN_HVM_CONFIG: {
4033  		r = -EFAULT;
4034  		if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
4035  				   sizeof(struct kvm_xen_hvm_config)))
4036  			goto out;
4037  		r = -EINVAL;
4038  		if (kvm->arch.xen_hvm_config.flags)
4039  			goto out;
4040  		r = 0;
4041  		break;
4042  	}
4043  	case KVM_SET_CLOCK: {
4044  		struct kvm_clock_data user_ns;
4045  		u64 now_ns;
4046  		s64 delta;
4047  
4048  		r = -EFAULT;
4049  		if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
4050  			goto out;
4051  
4052  		r = -EINVAL;
4053  		if (user_ns.flags)
4054  			goto out;
4055  
4056  		r = 0;
4057  		local_irq_disable();
4058  		now_ns = get_kernel_ns();
4059  		delta = user_ns.clock - now_ns;
4060  		local_irq_enable();
4061  		kvm->arch.kvmclock_offset = delta;
4062  		kvm_gen_update_masterclock(kvm);
4063  		break;
4064  	}
4065  	case KVM_GET_CLOCK: {
4066  		struct kvm_clock_data user_ns;
4067  		u64 now_ns;
4068  
4069  		local_irq_disable();
4070  		now_ns = get_kernel_ns();
4071  		user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
4072  		local_irq_enable();
4073  		user_ns.flags = 0;
4074  		memset(&user_ns.pad, 0, sizeof(user_ns.pad));
4075  
4076  		r = -EFAULT;
4077  		if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
4078  			goto out;
4079  		r = 0;
4080  		break;
4081  	}
4082  
4083  	default:
4084  		r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
4085  	}
4086  out:
4087  	return r;
4088  }
4089  
4090  static void kvm_init_msr_list(void)
4091  {
4092  	u32 dummy[2];
4093  	unsigned i, j;
4094  
4095  	/* skip the first msrs in the list. KVM-specific */
4096  	for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
4097  		if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
4098  			continue;
4099  
4100  		/*
4101  		 * Even MSRs that are valid in the host may not be exposed
4102  		 * to the guests in some cases.  We could work around this
4103  		 * in VMX with the generic MSR save/load machinery, but it
4104  		 * is not really worthwhile since it will really only
4105  		 * happen with nested virtualization.
4106  		 */
4107  		switch (msrs_to_save[i]) {
4108  		case MSR_IA32_BNDCFGS:
4109  			if (!kvm_x86_ops->mpx_supported())
4110  				continue;
4111  			break;
4112  		default:
4113  			break;
4114  		}
4115  
4116  		if (j < i)
4117  			msrs_to_save[j] = msrs_to_save[i];
4118  		j++;
4119  	}
4120  	num_msrs_to_save = j;
4121  }
4122  
4123  static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
4124  			   const void *v)
4125  {
4126  	int handled = 0;
4127  	int n;
4128  
4129  	do {
4130  		n = min(len, 8);
4131  		if (!(vcpu->arch.apic &&
4132  		      !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v))
4133  		    && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
4134  			break;
4135  		handled += n;
4136  		addr += n;
4137  		len -= n;
4138  		v += n;
4139  	} while (len);
4140  
4141  	return handled;
4142  }
4143  
4144  static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
4145  {
4146  	int handled = 0;
4147  	int n;
4148  
4149  	do {
4150  		n = min(len, 8);
4151  		if (!(vcpu->arch.apic &&
4152  		      !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v))
4153  		    && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
4154  			break;
4155  		trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
4156  		handled += n;
4157  		addr += n;
4158  		len -= n;
4159  		v += n;
4160  	} while (len);
4161  
4162  	return handled;
4163  }
4164  
4165  static void kvm_set_segment(struct kvm_vcpu *vcpu,
4166  			struct kvm_segment *var, int seg)
4167  {
4168  	kvm_x86_ops->set_segment(vcpu, var, seg);
4169  }
4170  
4171  void kvm_get_segment(struct kvm_vcpu *vcpu,
4172  		     struct kvm_segment *var, int seg)
4173  {
4174  	kvm_x86_ops->get_segment(vcpu, var, seg);
4175  }
4176  
4177  gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
4178  			   struct x86_exception *exception)
4179  {
4180  	gpa_t t_gpa;
4181  
4182  	BUG_ON(!mmu_is_nested(vcpu));
4183  
4184  	/* NPT walks are always user-walks */
4185  	access |= PFERR_USER_MASK;
4186  	t_gpa  = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, exception);
4187  
4188  	return t_gpa;
4189  }
4190  
4191  gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
4192  			      struct x86_exception *exception)
4193  {
4194  	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
4195  	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
4196  }
4197  
4198   gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
4199  				struct x86_exception *exception)
4200  {
4201  	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
4202  	access |= PFERR_FETCH_MASK;
4203  	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
4204  }
4205  
4206  gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
4207  			       struct x86_exception *exception)
4208  {
4209  	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
4210  	access |= PFERR_WRITE_MASK;
4211  	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
4212  }
4213  
4214  /* uses this to access any guest's mapped memory without checking CPL */
4215  gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
4216  				struct x86_exception *exception)
4217  {
4218  	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
4219  }
4220  
4221  static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
4222  				      struct kvm_vcpu *vcpu, u32 access,
4223  				      struct x86_exception *exception)
4224  {
4225  	void *data = val;
4226  	int r = X86EMUL_CONTINUE;
4227  
4228  	while (bytes) {
4229  		gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
4230  							    exception);
4231  		unsigned offset = addr & (PAGE_SIZE-1);
4232  		unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
4233  		int ret;
4234  
4235  		if (gpa == UNMAPPED_GVA)
4236  			return X86EMUL_PROPAGATE_FAULT;
4237  		ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, data,
4238  					  offset, toread);
4239  		if (ret < 0) {
4240  			r = X86EMUL_IO_NEEDED;
4241  			goto out;
4242  		}
4243  
4244  		bytes -= toread;
4245  		data += toread;
4246  		addr += toread;
4247  	}
4248  out:
4249  	return r;
4250  }
4251  
4252  /* used for instruction fetching */
4253  static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
4254  				gva_t addr, void *val, unsigned int bytes,
4255  				struct x86_exception *exception)
4256  {
4257  	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4258  	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
4259  	unsigned offset;
4260  	int ret;
4261  
4262  	/* Inline kvm_read_guest_virt_helper for speed.  */
4263  	gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK,
4264  						    exception);
4265  	if (unlikely(gpa == UNMAPPED_GVA))
4266  		return X86EMUL_PROPAGATE_FAULT;
4267  
4268  	offset = addr & (PAGE_SIZE-1);
4269  	if (WARN_ON(offset + bytes > PAGE_SIZE))
4270  		bytes = (unsigned)PAGE_SIZE - offset;
4271  	ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, val,
4272  				  offset, bytes);
4273  	if (unlikely(ret < 0))
4274  		return X86EMUL_IO_NEEDED;
4275  
4276  	return X86EMUL_CONTINUE;
4277  }
4278  
4279  int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
4280  			       gva_t addr, void *val, unsigned int bytes,
4281  			       struct x86_exception *exception)
4282  {
4283  	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4284  	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
4285  
4286  	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
4287  					  exception);
4288  }
4289  EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
4290  
4291  static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
4292  				      gva_t addr, void *val, unsigned int bytes,
4293  				      struct x86_exception *exception)
4294  {
4295  	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4296  	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
4297  }
4298  
4299  int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
4300  				       gva_t addr, void *val,
4301  				       unsigned int bytes,
4302  				       struct x86_exception *exception)
4303  {
4304  	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4305  	void *data = val;
4306  	int r = X86EMUL_CONTINUE;
4307  
4308  	while (bytes) {
4309  		gpa_t gpa =  vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
4310  							     PFERR_WRITE_MASK,
4311  							     exception);
4312  		unsigned offset = addr & (PAGE_SIZE-1);
4313  		unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
4314  		int ret;
4315  
4316  		if (gpa == UNMAPPED_GVA)
4317  			return X86EMUL_PROPAGATE_FAULT;
4318  		ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
4319  		if (ret < 0) {
4320  			r = X86EMUL_IO_NEEDED;
4321  			goto out;
4322  		}
4323  
4324  		bytes -= towrite;
4325  		data += towrite;
4326  		addr += towrite;
4327  	}
4328  out:
4329  	return r;
4330  }
4331  EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
4332  
4333  static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
4334  				gpa_t *gpa, struct x86_exception *exception,
4335  				bool write)
4336  {
4337  	u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
4338  		| (write ? PFERR_WRITE_MASK : 0);
4339  
4340  	if (vcpu_match_mmio_gva(vcpu, gva)
4341  	    && !permission_fault(vcpu, vcpu->arch.walk_mmu,
4342  				 vcpu->arch.access, access)) {
4343  		*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
4344  					(gva & (PAGE_SIZE - 1));
4345  		trace_vcpu_match_mmio(gva, *gpa, write, false);
4346  		return 1;
4347  	}
4348  
4349  	*gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
4350  
4351  	if (*gpa == UNMAPPED_GVA)
4352  		return -1;
4353  
4354  	/* For APIC access vmexit */
4355  	if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
4356  		return 1;
4357  
4358  	if (vcpu_match_mmio_gpa(vcpu, *gpa)) {
4359  		trace_vcpu_match_mmio(gva, *gpa, write, true);
4360  		return 1;
4361  	}
4362  
4363  	return 0;
4364  }
4365  
4366  int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
4367  			const void *val, int bytes)
4368  {
4369  	int ret;
4370  
4371  	ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
4372  	if (ret < 0)
4373  		return 0;
4374  	kvm_mmu_pte_write(vcpu, gpa, val, bytes);
4375  	return 1;
4376  }
4377  
4378  struct read_write_emulator_ops {
4379  	int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
4380  				  int bytes);
4381  	int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
4382  				  void *val, int bytes);
4383  	int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
4384  			       int bytes, void *val);
4385  	int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
4386  				    void *val, int bytes);
4387  	bool write;
4388  };
4389  
4390  static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
4391  {
4392  	if (vcpu->mmio_read_completed) {
4393  		trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
4394  			       vcpu->mmio_fragments[0].gpa, *(u64 *)val);
4395  		vcpu->mmio_read_completed = 0;
4396  		return 1;
4397  	}
4398  
4399  	return 0;
4400  }
4401  
4402  static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
4403  			void *val, int bytes)
4404  {
4405  	return !kvm_read_guest(vcpu->kvm, gpa, val, bytes);
4406  }
4407  
4408  static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
4409  			 void *val, int bytes)
4410  {
4411  	return emulator_write_phys(vcpu, gpa, val, bytes);
4412  }
4413  
4414  static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
4415  {
4416  	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val);
4417  	return vcpu_mmio_write(vcpu, gpa, bytes, val);
4418  }
4419  
4420  static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
4421  			  void *val, int bytes)
4422  {
4423  	trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
4424  	return X86EMUL_IO_NEEDED;
4425  }
4426  
4427  static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
4428  			   void *val, int bytes)
4429  {
4430  	struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
4431  
4432  	memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
4433  	return X86EMUL_CONTINUE;
4434  }
4435  
4436  static const struct read_write_emulator_ops read_emultor = {
4437  	.read_write_prepare = read_prepare,
4438  	.read_write_emulate = read_emulate,
4439  	.read_write_mmio = vcpu_mmio_read,
4440  	.read_write_exit_mmio = read_exit_mmio,
4441  };
4442  
4443  static const struct read_write_emulator_ops write_emultor = {
4444  	.read_write_emulate = write_emulate,
4445  	.read_write_mmio = write_mmio,
4446  	.read_write_exit_mmio = write_exit_mmio,
4447  	.write = true,
4448  };
4449  
4450  static int emulator_read_write_onepage(unsigned long addr, void *val,
4451  				       unsigned int bytes,
4452  				       struct x86_exception *exception,
4453  				       struct kvm_vcpu *vcpu,
4454  				       const struct read_write_emulator_ops *ops)
4455  {
4456  	gpa_t gpa;
4457  	int handled, ret;
4458  	bool write = ops->write;
4459  	struct kvm_mmio_fragment *frag;
4460  
4461  	ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
4462  
4463  	if (ret < 0)
4464  		return X86EMUL_PROPAGATE_FAULT;
4465  
4466  	/* For APIC access vmexit */
4467  	if (ret)
4468  		goto mmio;
4469  
4470  	if (ops->read_write_emulate(vcpu, gpa, val, bytes))
4471  		return X86EMUL_CONTINUE;
4472  
4473  mmio:
4474  	/*
4475  	 * Is this MMIO handled locally?
4476  	 */
4477  	handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
4478  	if (handled == bytes)
4479  		return X86EMUL_CONTINUE;
4480  
4481  	gpa += handled;
4482  	bytes -= handled;
4483  	val += handled;
4484  
4485  	WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
4486  	frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
4487  	frag->gpa = gpa;
4488  	frag->data = val;
4489  	frag->len = bytes;
4490  	return X86EMUL_CONTINUE;
4491  }
4492  
4493  int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
4494  			void *val, unsigned int bytes,
4495  			struct x86_exception *exception,
4496  			const struct read_write_emulator_ops *ops)
4497  {
4498  	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4499  	gpa_t gpa;
4500  	int rc;
4501  
4502  	if (ops->read_write_prepare &&
4503  		  ops->read_write_prepare(vcpu, val, bytes))
4504  		return X86EMUL_CONTINUE;
4505  
4506  	vcpu->mmio_nr_fragments = 0;
4507  
4508  	/* Crossing a page boundary? */
4509  	if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
4510  		int now;
4511  
4512  		now = -addr & ~PAGE_MASK;
4513  		rc = emulator_read_write_onepage(addr, val, now, exception,
4514  						 vcpu, ops);
4515  
4516  		if (rc != X86EMUL_CONTINUE)
4517  			return rc;
4518  		addr += now;
4519  		val += now;
4520  		bytes -= now;
4521  	}
4522  
4523  	rc = emulator_read_write_onepage(addr, val, bytes, exception,
4524  					 vcpu, ops);
4525  	if (rc != X86EMUL_CONTINUE)
4526  		return rc;
4527  
4528  	if (!vcpu->mmio_nr_fragments)
4529  		return rc;
4530  
4531  	gpa = vcpu->mmio_fragments[0].gpa;
4532  
4533  	vcpu->mmio_needed = 1;
4534  	vcpu->mmio_cur_fragment = 0;
4535  
4536  	vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
4537  	vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
4538  	vcpu->run->exit_reason = KVM_EXIT_MMIO;
4539  	vcpu->run->mmio.phys_addr = gpa;
4540  
4541  	return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
4542  }
4543  
4544  static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
4545  				  unsigned long addr,
4546  				  void *val,
4547  				  unsigned int bytes,
4548  				  struct x86_exception *exception)
4549  {
4550  	return emulator_read_write(ctxt, addr, val, bytes,
4551  				   exception, &read_emultor);
4552  }
4553  
4554  int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
4555  			    unsigned long addr,
4556  			    const void *val,
4557  			    unsigned int bytes,
4558  			    struct x86_exception *exception)
4559  {
4560  	return emulator_read_write(ctxt, addr, (void *)val, bytes,
4561  				   exception, &write_emultor);
4562  }
4563  
4564  #define CMPXCHG_TYPE(t, ptr, old, new) \
4565  	(cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
4566  
4567  #ifdef CONFIG_X86_64
4568  #  define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
4569  #else
4570  #  define CMPXCHG64(ptr, old, new) \
4571  	(cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
4572  #endif
4573  
4574  static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
4575  				     unsigned long addr,
4576  				     const void *old,
4577  				     const void *new,
4578  				     unsigned int bytes,
4579  				     struct x86_exception *exception)
4580  {
4581  	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4582  	gpa_t gpa;
4583  	struct page *page;
4584  	char *kaddr;
4585  	bool exchanged;
4586  
4587  	/* guests cmpxchg8b have to be emulated atomically */
4588  	if (bytes > 8 || (bytes & (bytes - 1)))
4589  		goto emul_write;
4590  
4591  	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
4592  
4593  	if (gpa == UNMAPPED_GVA ||
4594  	    (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
4595  		goto emul_write;
4596  
4597  	if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
4598  		goto emul_write;
4599  
4600  	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
4601  	if (is_error_page(page))
4602  		goto emul_write;
4603  
4604  	kaddr = kmap_atomic(page);
4605  	kaddr += offset_in_page(gpa);
4606  	switch (bytes) {
4607  	case 1:
4608  		exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
4609  		break;
4610  	case 2:
4611  		exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
4612  		break;
4613  	case 4:
4614  		exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
4615  		break;
4616  	case 8:
4617  		exchanged = CMPXCHG64(kaddr, old, new);
4618  		break;
4619  	default:
4620  		BUG();
4621  	}
4622  	kunmap_atomic(kaddr);
4623  	kvm_release_page_dirty(page);
4624  
4625  	if (!exchanged)
4626  		return X86EMUL_CMPXCHG_FAILED;
4627  
4628  	mark_page_dirty(vcpu->kvm, gpa >> PAGE_SHIFT);
4629  	kvm_mmu_pte_write(vcpu, gpa, new, bytes);
4630  
4631  	return X86EMUL_CONTINUE;
4632  
4633  emul_write:
4634  	printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
4635  
4636  	return emulator_write_emulated(ctxt, addr, new, bytes, exception);
4637  }
4638  
4639  static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
4640  {
4641  	/* TODO: String I/O for in kernel device */
4642  	int r;
4643  
4644  	if (vcpu->arch.pio.in)
4645  		r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
4646  				    vcpu->arch.pio.size, pd);
4647  	else
4648  		r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
4649  				     vcpu->arch.pio.port, vcpu->arch.pio.size,
4650  				     pd);
4651  	return r;
4652  }
4653  
4654  static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
4655  			       unsigned short port, void *val,
4656  			       unsigned int count, bool in)
4657  {
4658  	vcpu->arch.pio.port = port;
4659  	vcpu->arch.pio.in = in;
4660  	vcpu->arch.pio.count  = count;
4661  	vcpu->arch.pio.size = size;
4662  
4663  	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
4664  		vcpu->arch.pio.count = 0;
4665  		return 1;
4666  	}
4667  
4668  	vcpu->run->exit_reason = KVM_EXIT_IO;
4669  	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
4670  	vcpu->run->io.size = size;
4671  	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
4672  	vcpu->run->io.count = count;
4673  	vcpu->run->io.port = port;
4674  
4675  	return 0;
4676  }
4677  
4678  static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
4679  				    int size, unsigned short port, void *val,
4680  				    unsigned int count)
4681  {
4682  	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4683  	int ret;
4684  
4685  	if (vcpu->arch.pio.count)
4686  		goto data_avail;
4687  
4688  	ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
4689  	if (ret) {
4690  data_avail:
4691  		memcpy(val, vcpu->arch.pio_data, size * count);
4692  		trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
4693  		vcpu->arch.pio.count = 0;
4694  		return 1;
4695  	}
4696  
4697  	return 0;
4698  }
4699  
4700  static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
4701  				     int size, unsigned short port,
4702  				     const void *val, unsigned int count)
4703  {
4704  	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4705  
4706  	memcpy(vcpu->arch.pio_data, val, size * count);
4707  	trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
4708  	return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
4709  }
4710  
4711  static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
4712  {
4713  	return kvm_x86_ops->get_segment_base(vcpu, seg);
4714  }
4715  
4716  static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
4717  {
4718  	kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
4719  }
4720  
4721  int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
4722  {
4723  	if (!need_emulate_wbinvd(vcpu))
4724  		return X86EMUL_CONTINUE;
4725  
4726  	if (kvm_x86_ops->has_wbinvd_exit()) {
4727  		int cpu = get_cpu();
4728  
4729  		cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
4730  		smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
4731  				wbinvd_ipi, NULL, 1);
4732  		put_cpu();
4733  		cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
4734  	} else
4735  		wbinvd();
4736  	return X86EMUL_CONTINUE;
4737  }
4738  EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
4739  
4740  static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
4741  {
4742  	kvm_emulate_wbinvd(emul_to_vcpu(ctxt));
4743  }
4744  
4745  int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
4746  {
4747  	return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
4748  }
4749  
4750  int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
4751  {
4752  
4753  	return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
4754  }
4755  
4756  static u64 mk_cr_64(u64 curr_cr, u32 new_val)
4757  {
4758  	return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
4759  }
4760  
4761  static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
4762  {
4763  	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4764  	unsigned long value;
4765  
4766  	switch (cr) {
4767  	case 0:
4768  		value = kvm_read_cr0(vcpu);
4769  		break;
4770  	case 2:
4771  		value = vcpu->arch.cr2;
4772  		break;
4773  	case 3:
4774  		value = kvm_read_cr3(vcpu);
4775  		break;
4776  	case 4:
4777  		value = kvm_read_cr4(vcpu);
4778  		break;
4779  	case 8:
4780  		value = kvm_get_cr8(vcpu);
4781  		break;
4782  	default:
4783  		kvm_err("%s: unexpected cr %u\n", __func__, cr);
4784  		return 0;
4785  	}
4786  
4787  	return value;
4788  }
4789  
4790  static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
4791  {
4792  	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4793  	int res = 0;
4794  
4795  	switch (cr) {
4796  	case 0:
4797  		res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
4798  		break;
4799  	case 2:
4800  		vcpu->arch.cr2 = val;
4801  		break;
4802  	case 3:
4803  		res = kvm_set_cr3(vcpu, val);
4804  		break;
4805  	case 4:
4806  		res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
4807  		break;
4808  	case 8:
4809  		res = kvm_set_cr8(vcpu, val);
4810  		break;
4811  	default:
4812  		kvm_err("%s: unexpected cr %u\n", __func__, cr);
4813  		res = -1;
4814  	}
4815  
4816  	return res;
4817  }
4818  
4819  static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
4820  {
4821  	return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
4822  }
4823  
4824  static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4825  {
4826  	kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
4827  }
4828  
4829  static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4830  {
4831  	kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
4832  }
4833  
4834  static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4835  {
4836  	kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
4837  }
4838  
4839  static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4840  {
4841  	kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
4842  }
4843  
4844  static unsigned long emulator_get_cached_segment_base(
4845  	struct x86_emulate_ctxt *ctxt, int seg)
4846  {
4847  	return get_segment_base(emul_to_vcpu(ctxt), seg);
4848  }
4849  
4850  static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
4851  				 struct desc_struct *desc, u32 *base3,
4852  				 int seg)
4853  {
4854  	struct kvm_segment var;
4855  
4856  	kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
4857  	*selector = var.selector;
4858  
4859  	if (var.unusable) {
4860  		memset(desc, 0, sizeof(*desc));
4861  		return false;
4862  	}
4863  
4864  	if (var.g)
4865  		var.limit >>= 12;
4866  	set_desc_limit(desc, var.limit);
4867  	set_desc_base(desc, (unsigned long)var.base);
4868  #ifdef CONFIG_X86_64
4869  	if (base3)
4870  		*base3 = var.base >> 32;
4871  #endif
4872  	desc->type = var.type;
4873  	desc->s = var.s;
4874  	desc->dpl = var.dpl;
4875  	desc->p = var.present;
4876  	desc->avl = var.avl;
4877  	desc->l = var.l;
4878  	desc->d = var.db;
4879  	desc->g = var.g;
4880  
4881  	return true;
4882  }
4883  
4884  static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
4885  				 struct desc_struct *desc, u32 base3,
4886  				 int seg)
4887  {
4888  	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4889  	struct kvm_segment var;
4890  
4891  	var.selector = selector;
4892  	var.base = get_desc_base(desc);
4893  #ifdef CONFIG_X86_64
4894  	var.base |= ((u64)base3) << 32;
4895  #endif
4896  	var.limit = get_desc_limit(desc);
4897  	if (desc->g)
4898  		var.limit = (var.limit << 12) | 0xfff;
4899  	var.type = desc->type;
4900  	var.dpl = desc->dpl;
4901  	var.db = desc->d;
4902  	var.s = desc->s;
4903  	var.l = desc->l;
4904  	var.g = desc->g;
4905  	var.avl = desc->avl;
4906  	var.present = desc->p;
4907  	var.unusable = !var.present;
4908  	var.padding = 0;
4909  
4910  	kvm_set_segment(vcpu, &var, seg);
4911  	return;
4912  }
4913  
4914  static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
4915  			    u32 msr_index, u64 *pdata)
4916  {
4917  	return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
4918  }
4919  
4920  static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
4921  			    u32 msr_index, u64 data)
4922  {
4923  	struct msr_data msr;
4924  
4925  	msr.data = data;
4926  	msr.index = msr_index;
4927  	msr.host_initiated = false;
4928  	return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
4929  }
4930  
4931  static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
4932  			      u32 pmc)
4933  {
4934  	return kvm_pmu_check_pmc(emul_to_vcpu(ctxt), pmc);
4935  }
4936  
4937  static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
4938  			     u32 pmc, u64 *pdata)
4939  {
4940  	return kvm_pmu_read_pmc(emul_to_vcpu(ctxt), pmc, pdata);
4941  }
4942  
4943  static void emulator_halt(struct x86_emulate_ctxt *ctxt)
4944  {
4945  	emul_to_vcpu(ctxt)->arch.halt_request = 1;
4946  }
4947  
4948  static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt)
4949  {
4950  	preempt_disable();
4951  	kvm_load_guest_fpu(emul_to_vcpu(ctxt));
4952  	/*
4953  	 * CR0.TS may reference the host fpu state, not the guest fpu state,
4954  	 * so it may be clear at this point.
4955  	 */
4956  	clts();
4957  }
4958  
4959  static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt)
4960  {
4961  	preempt_enable();
4962  }
4963  
4964  static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
4965  			      struct x86_instruction_info *info,
4966  			      enum x86_intercept_stage stage)
4967  {
4968  	return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
4969  }
4970  
4971  static void emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
4972  			       u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
4973  {
4974  	kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx);
4975  }
4976  
4977  static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
4978  {
4979  	return kvm_register_read(emul_to_vcpu(ctxt), reg);
4980  }
4981  
4982  static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
4983  {
4984  	kvm_register_write(emul_to_vcpu(ctxt), reg, val);
4985  }
4986  
4987  static const struct x86_emulate_ops emulate_ops = {
4988  	.read_gpr            = emulator_read_gpr,
4989  	.write_gpr           = emulator_write_gpr,
4990  	.read_std            = kvm_read_guest_virt_system,
4991  	.write_std           = kvm_write_guest_virt_system,
4992  	.fetch               = kvm_fetch_guest_virt,
4993  	.read_emulated       = emulator_read_emulated,
4994  	.write_emulated      = emulator_write_emulated,
4995  	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
4996  	.invlpg              = emulator_invlpg,
4997  	.pio_in_emulated     = emulator_pio_in_emulated,
4998  	.pio_out_emulated    = emulator_pio_out_emulated,
4999  	.get_segment         = emulator_get_segment,
5000  	.set_segment         = emulator_set_segment,
5001  	.get_cached_segment_base = emulator_get_cached_segment_base,
5002  	.get_gdt             = emulator_get_gdt,
5003  	.get_idt	     = emulator_get_idt,
5004  	.set_gdt             = emulator_set_gdt,
5005  	.set_idt	     = emulator_set_idt,
5006  	.get_cr              = emulator_get_cr,
5007  	.set_cr              = emulator_set_cr,
5008  	.cpl                 = emulator_get_cpl,
5009  	.get_dr              = emulator_get_dr,
5010  	.set_dr              = emulator_set_dr,
5011  	.set_msr             = emulator_set_msr,
5012  	.get_msr             = emulator_get_msr,
5013  	.check_pmc	     = emulator_check_pmc,
5014  	.read_pmc            = emulator_read_pmc,
5015  	.halt                = emulator_halt,
5016  	.wbinvd              = emulator_wbinvd,
5017  	.fix_hypercall       = emulator_fix_hypercall,
5018  	.get_fpu             = emulator_get_fpu,
5019  	.put_fpu             = emulator_put_fpu,
5020  	.intercept           = emulator_intercept,
5021  	.get_cpuid           = emulator_get_cpuid,
5022  };
5023  
5024  static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
5025  {
5026  	u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
5027  	/*
5028  	 * an sti; sti; sequence only disable interrupts for the first
5029  	 * instruction. So, if the last instruction, be it emulated or
5030  	 * not, left the system with the INT_STI flag enabled, it
5031  	 * means that the last instruction is an sti. We should not
5032  	 * leave the flag on in this case. The same goes for mov ss
5033  	 */
5034  	if (int_shadow & mask)
5035  		mask = 0;
5036  	if (unlikely(int_shadow || mask)) {
5037  		kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
5038  		if (!mask)
5039  			kvm_make_request(KVM_REQ_EVENT, vcpu);
5040  	}
5041  }
5042  
5043  static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
5044  {
5045  	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
5046  	if (ctxt->exception.vector == PF_VECTOR)
5047  		return kvm_propagate_fault(vcpu, &ctxt->exception);
5048  
5049  	if (ctxt->exception.error_code_valid)
5050  		kvm_queue_exception_e(vcpu, ctxt->exception.vector,
5051  				      ctxt->exception.error_code);
5052  	else
5053  		kvm_queue_exception(vcpu, ctxt->exception.vector);
5054  	return false;
5055  }
5056  
5057  static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
5058  {
5059  	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
5060  	int cs_db, cs_l;
5061  
5062  	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
5063  
5064  	ctxt->eflags = kvm_get_rflags(vcpu);
5065  	ctxt->eip = kvm_rip_read(vcpu);
5066  	ctxt->mode = (!is_protmode(vcpu))		? X86EMUL_MODE_REAL :
5067  		     (ctxt->eflags & X86_EFLAGS_VM)	? X86EMUL_MODE_VM86 :
5068  		     (cs_l && is_long_mode(vcpu))	? X86EMUL_MODE_PROT64 :
5069  		     cs_db				? X86EMUL_MODE_PROT32 :
5070  							  X86EMUL_MODE_PROT16;
5071  	ctxt->guest_mode = is_guest_mode(vcpu);
5072  
5073  	init_decode_cache(ctxt);
5074  	vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
5075  }
5076  
5077  int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
5078  {
5079  	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
5080  	int ret;
5081  
5082  	init_emulate_ctxt(vcpu);
5083  
5084  	ctxt->op_bytes = 2;
5085  	ctxt->ad_bytes = 2;
5086  	ctxt->_eip = ctxt->eip + inc_eip;
5087  	ret = emulate_int_real(ctxt, irq);
5088  
5089  	if (ret != X86EMUL_CONTINUE)
5090  		return EMULATE_FAIL;
5091  
5092  	ctxt->eip = ctxt->_eip;
5093  	kvm_rip_write(vcpu, ctxt->eip);
5094  	kvm_set_rflags(vcpu, ctxt->eflags);
5095  
5096  	if (irq == NMI_VECTOR)
5097  		vcpu->arch.nmi_pending = 0;
5098  	else
5099  		vcpu->arch.interrupt.pending = false;
5100  
5101  	return EMULATE_DONE;
5102  }
5103  EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
5104  
5105  static int handle_emulation_failure(struct kvm_vcpu *vcpu)
5106  {
5107  	int r = EMULATE_DONE;
5108  
5109  	++vcpu->stat.insn_emulation_fail;
5110  	trace_kvm_emulate_insn_failed(vcpu);
5111  	if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
5112  		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5113  		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
5114  		vcpu->run->internal.ndata = 0;
5115  		r = EMULATE_FAIL;
5116  	}
5117  	kvm_queue_exception(vcpu, UD_VECTOR);
5118  
5119  	return r;
5120  }
5121  
5122  static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
5123  				  bool write_fault_to_shadow_pgtable,
5124  				  int emulation_type)
5125  {
5126  	gpa_t gpa = cr2;
5127  	pfn_t pfn;
5128  
5129  	if (emulation_type & EMULTYPE_NO_REEXECUTE)
5130  		return false;
5131  
5132  	if (!vcpu->arch.mmu.direct_map) {
5133  		/*
5134  		 * Write permission should be allowed since only
5135  		 * write access need to be emulated.
5136  		 */
5137  		gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
5138  
5139  		/*
5140  		 * If the mapping is invalid in guest, let cpu retry
5141  		 * it to generate fault.
5142  		 */
5143  		if (gpa == UNMAPPED_GVA)
5144  			return true;
5145  	}
5146  
5147  	/*
5148  	 * Do not retry the unhandleable instruction if it faults on the
5149  	 * readonly host memory, otherwise it will goto a infinite loop:
5150  	 * retry instruction -> write #PF -> emulation fail -> retry
5151  	 * instruction -> ...
5152  	 */
5153  	pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
5154  
5155  	/*
5156  	 * If the instruction failed on the error pfn, it can not be fixed,
5157  	 * report the error to userspace.
5158  	 */
5159  	if (is_error_noslot_pfn(pfn))
5160  		return false;
5161  
5162  	kvm_release_pfn_clean(pfn);
5163  
5164  	/* The instructions are well-emulated on direct mmu. */
5165  	if (vcpu->arch.mmu.direct_map) {
5166  		unsigned int indirect_shadow_pages;
5167  
5168  		spin_lock(&vcpu->kvm->mmu_lock);
5169  		indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
5170  		spin_unlock(&vcpu->kvm->mmu_lock);
5171  
5172  		if (indirect_shadow_pages)
5173  			kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
5174  
5175  		return true;
5176  	}
5177  
5178  	/*
5179  	 * if emulation was due to access to shadowed page table
5180  	 * and it failed try to unshadow page and re-enter the
5181  	 * guest to let CPU execute the instruction.
5182  	 */
5183  	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
5184  
5185  	/*
5186  	 * If the access faults on its page table, it can not
5187  	 * be fixed by unprotecting shadow page and it should
5188  	 * be reported to userspace.
5189  	 */
5190  	return !write_fault_to_shadow_pgtable;
5191  }
5192  
5193  static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
5194  			      unsigned long cr2,  int emulation_type)
5195  {
5196  	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5197  	unsigned long last_retry_eip, last_retry_addr, gpa = cr2;
5198  
5199  	last_retry_eip = vcpu->arch.last_retry_eip;
5200  	last_retry_addr = vcpu->arch.last_retry_addr;
5201  
5202  	/*
5203  	 * If the emulation is caused by #PF and it is non-page_table
5204  	 * writing instruction, it means the VM-EXIT is caused by shadow
5205  	 * page protected, we can zap the shadow page and retry this
5206  	 * instruction directly.
5207  	 *
5208  	 * Note: if the guest uses a non-page-table modifying instruction
5209  	 * on the PDE that points to the instruction, then we will unmap
5210  	 * the instruction and go to an infinite loop. So, we cache the
5211  	 * last retried eip and the last fault address, if we meet the eip
5212  	 * and the address again, we can break out of the potential infinite
5213  	 * loop.
5214  	 */
5215  	vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
5216  
5217  	if (!(emulation_type & EMULTYPE_RETRY))
5218  		return false;
5219  
5220  	if (x86_page_table_writing_insn(ctxt))
5221  		return false;
5222  
5223  	if (ctxt->eip == last_retry_eip && last_retry_addr == cr2)
5224  		return false;
5225  
5226  	vcpu->arch.last_retry_eip = ctxt->eip;
5227  	vcpu->arch.last_retry_addr = cr2;
5228  
5229  	if (!vcpu->arch.mmu.direct_map)
5230  		gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
5231  
5232  	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
5233  
5234  	return true;
5235  }
5236  
5237  static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
5238  static int complete_emulated_pio(struct kvm_vcpu *vcpu);
5239  
5240  static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
5241  				unsigned long *db)
5242  {
5243  	u32 dr6 = 0;
5244  	int i;
5245  	u32 enable, rwlen;
5246  
5247  	enable = dr7;
5248  	rwlen = dr7 >> 16;
5249  	for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
5250  		if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
5251  			dr6 |= (1 << i);
5252  	return dr6;
5253  }
5254  
5255  static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r)
5256  {
5257  	struct kvm_run *kvm_run = vcpu->run;
5258  
5259  	/*
5260  	 * rflags is the old, "raw" value of the flags.  The new value has
5261  	 * not been saved yet.
5262  	 *
5263  	 * This is correct even for TF set by the guest, because "the
5264  	 * processor will not generate this exception after the instruction
5265  	 * that sets the TF flag".
5266  	 */
5267  	if (unlikely(rflags & X86_EFLAGS_TF)) {
5268  		if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
5269  			kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 |
5270  						  DR6_RTM;
5271  			kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
5272  			kvm_run->debug.arch.exception = DB_VECTOR;
5273  			kvm_run->exit_reason = KVM_EXIT_DEBUG;
5274  			*r = EMULATE_USER_EXIT;
5275  		} else {
5276  			vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF;
5277  			/*
5278  			 * "Certain debug exceptions may clear bit 0-3.  The
5279  			 * remaining contents of the DR6 register are never
5280  			 * cleared by the processor".
5281  			 */
5282  			vcpu->arch.dr6 &= ~15;
5283  			vcpu->arch.dr6 |= DR6_BS | DR6_RTM;
5284  			kvm_queue_exception(vcpu, DB_VECTOR);
5285  		}
5286  	}
5287  }
5288  
5289  static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
5290  {
5291  	if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
5292  	    (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
5293  		struct kvm_run *kvm_run = vcpu->run;
5294  		unsigned long eip = kvm_get_linear_rip(vcpu);
5295  		u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
5296  					   vcpu->arch.guest_debug_dr7,
5297  					   vcpu->arch.eff_db);
5298  
5299  		if (dr6 != 0) {
5300  			kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
5301  			kvm_run->debug.arch.pc = eip;
5302  			kvm_run->debug.arch.exception = DB_VECTOR;
5303  			kvm_run->exit_reason = KVM_EXIT_DEBUG;
5304  			*r = EMULATE_USER_EXIT;
5305  			return true;
5306  		}
5307  	}
5308  
5309  	if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
5310  	    !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) {
5311  		unsigned long eip = kvm_get_linear_rip(vcpu);
5312  		u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
5313  					   vcpu->arch.dr7,
5314  					   vcpu->arch.db);
5315  
5316  		if (dr6 != 0) {
5317  			vcpu->arch.dr6 &= ~15;
5318  			vcpu->arch.dr6 |= dr6 | DR6_RTM;
5319  			kvm_queue_exception(vcpu, DB_VECTOR);
5320  			*r = EMULATE_DONE;
5321  			return true;
5322  		}
5323  	}
5324  
5325  	return false;
5326  }
5327  
5328  int x86_emulate_instruction(struct kvm_vcpu *vcpu,
5329  			    unsigned long cr2,
5330  			    int emulation_type,
5331  			    void *insn,
5332  			    int insn_len)
5333  {
5334  	int r;
5335  	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
5336  	bool writeback = true;
5337  	bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
5338  
5339  	/*
5340  	 * Clear write_fault_to_shadow_pgtable here to ensure it is
5341  	 * never reused.
5342  	 */
5343  	vcpu->arch.write_fault_to_shadow_pgtable = false;
5344  	kvm_clear_exception_queue(vcpu);
5345  
5346  	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
5347  		init_emulate_ctxt(vcpu);
5348  
5349  		/*
5350  		 * We will reenter on the same instruction since
5351  		 * we do not set complete_userspace_io.  This does not
5352  		 * handle watchpoints yet, those would be handled in
5353  		 * the emulate_ops.
5354  		 */
5355  		if (kvm_vcpu_check_breakpoint(vcpu, &r))
5356  			return r;
5357  
5358  		ctxt->interruptibility = 0;
5359  		ctxt->have_exception = false;
5360  		ctxt->exception.vector = -1;
5361  		ctxt->perm_ok = false;
5362  
5363  		ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
5364  
5365  		r = x86_decode_insn(ctxt, insn, insn_len);
5366  
5367  		trace_kvm_emulate_insn_start(vcpu);
5368  		++vcpu->stat.insn_emulation;
5369  		if (r != EMULATION_OK)  {
5370  			if (emulation_type & EMULTYPE_TRAP_UD)
5371  				return EMULATE_FAIL;
5372  			if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
5373  						emulation_type))
5374  				return EMULATE_DONE;
5375  			if (emulation_type & EMULTYPE_SKIP)
5376  				return EMULATE_FAIL;
5377  			return handle_emulation_failure(vcpu);
5378  		}
5379  	}
5380  
5381  	if (emulation_type & EMULTYPE_SKIP) {
5382  		kvm_rip_write(vcpu, ctxt->_eip);
5383  		if (ctxt->eflags & X86_EFLAGS_RF)
5384  			kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
5385  		return EMULATE_DONE;
5386  	}
5387  
5388  	if (retry_instruction(ctxt, cr2, emulation_type))
5389  		return EMULATE_DONE;
5390  
5391  	/* this is needed for vmware backdoor interface to work since it
5392  	   changes registers values  during IO operation */
5393  	if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
5394  		vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
5395  		emulator_invalidate_register_cache(ctxt);
5396  	}
5397  
5398  restart:
5399  	r = x86_emulate_insn(ctxt);
5400  
5401  	if (r == EMULATION_INTERCEPTED)
5402  		return EMULATE_DONE;
5403  
5404  	if (r == EMULATION_FAILED) {
5405  		if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
5406  					emulation_type))
5407  			return EMULATE_DONE;
5408  
5409  		return handle_emulation_failure(vcpu);
5410  	}
5411  
5412  	if (ctxt->have_exception) {
5413  		r = EMULATE_DONE;
5414  		if (inject_emulated_exception(vcpu))
5415  			return r;
5416  	} else if (vcpu->arch.pio.count) {
5417  		if (!vcpu->arch.pio.in) {
5418  			/* FIXME: return into emulator if single-stepping.  */
5419  			vcpu->arch.pio.count = 0;
5420  		} else {
5421  			writeback = false;
5422  			vcpu->arch.complete_userspace_io = complete_emulated_pio;
5423  		}
5424  		r = EMULATE_USER_EXIT;
5425  	} else if (vcpu->mmio_needed) {
5426  		if (!vcpu->mmio_is_write)
5427  			writeback = false;
5428  		r = EMULATE_USER_EXIT;
5429  		vcpu->arch.complete_userspace_io = complete_emulated_mmio;
5430  	} else if (r == EMULATION_RESTART)
5431  		goto restart;
5432  	else
5433  		r = EMULATE_DONE;
5434  
5435  	if (writeback) {
5436  		unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
5437  		toggle_interruptibility(vcpu, ctxt->interruptibility);
5438  		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5439  		kvm_rip_write(vcpu, ctxt->eip);
5440  		if (r == EMULATE_DONE)
5441  			kvm_vcpu_check_singlestep(vcpu, rflags, &r);
5442  		if (!ctxt->have_exception ||
5443  		    exception_type(ctxt->exception.vector) == EXCPT_TRAP)
5444  			__kvm_set_rflags(vcpu, ctxt->eflags);
5445  
5446  		/*
5447  		 * For STI, interrupts are shadowed; so KVM_REQ_EVENT will
5448  		 * do nothing, and it will be requested again as soon as
5449  		 * the shadow expires.  But we still need to check here,
5450  		 * because POPF has no interrupt shadow.
5451  		 */
5452  		if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
5453  			kvm_make_request(KVM_REQ_EVENT, vcpu);
5454  	} else
5455  		vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
5456  
5457  	return r;
5458  }
5459  EXPORT_SYMBOL_GPL(x86_emulate_instruction);
5460  
5461  int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
5462  {
5463  	unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
5464  	int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
5465  					    size, port, &val, 1);
5466  	/* do not return to emulator after return from userspace */
5467  	vcpu->arch.pio.count = 0;
5468  	return ret;
5469  }
5470  EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
5471  
5472  static void tsc_bad(void *info)
5473  {
5474  	__this_cpu_write(cpu_tsc_khz, 0);
5475  }
5476  
5477  static void tsc_khz_changed(void *data)
5478  {
5479  	struct cpufreq_freqs *freq = data;
5480  	unsigned long khz = 0;
5481  
5482  	if (data)
5483  		khz = freq->new;
5484  	else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
5485  		khz = cpufreq_quick_get(raw_smp_processor_id());
5486  	if (!khz)
5487  		khz = tsc_khz;
5488  	__this_cpu_write(cpu_tsc_khz, khz);
5489  }
5490  
5491  static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
5492  				     void *data)
5493  {
5494  	struct cpufreq_freqs *freq = data;
5495  	struct kvm *kvm;
5496  	struct kvm_vcpu *vcpu;
5497  	int i, send_ipi = 0;
5498  
5499  	/*
5500  	 * We allow guests to temporarily run on slowing clocks,
5501  	 * provided we notify them after, or to run on accelerating
5502  	 * clocks, provided we notify them before.  Thus time never
5503  	 * goes backwards.
5504  	 *
5505  	 * However, we have a problem.  We can't atomically update
5506  	 * the frequency of a given CPU from this function; it is
5507  	 * merely a notifier, which can be called from any CPU.
5508  	 * Changing the TSC frequency at arbitrary points in time
5509  	 * requires a recomputation of local variables related to
5510  	 * the TSC for each VCPU.  We must flag these local variables
5511  	 * to be updated and be sure the update takes place with the
5512  	 * new frequency before any guests proceed.
5513  	 *
5514  	 * Unfortunately, the combination of hotplug CPU and frequency
5515  	 * change creates an intractable locking scenario; the order
5516  	 * of when these callouts happen is undefined with respect to
5517  	 * CPU hotplug, and they can race with each other.  As such,
5518  	 * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
5519  	 * undefined; you can actually have a CPU frequency change take
5520  	 * place in between the computation of X and the setting of the
5521  	 * variable.  To protect against this problem, all updates of
5522  	 * the per_cpu tsc_khz variable are done in an interrupt
5523  	 * protected IPI, and all callers wishing to update the value
5524  	 * must wait for a synchronous IPI to complete (which is trivial
5525  	 * if the caller is on the CPU already).  This establishes the
5526  	 * necessary total order on variable updates.
5527  	 *
5528  	 * Note that because a guest time update may take place
5529  	 * anytime after the setting of the VCPU's request bit, the
5530  	 * correct TSC value must be set before the request.  However,
5531  	 * to ensure the update actually makes it to any guest which
5532  	 * starts running in hardware virtualization between the set
5533  	 * and the acquisition of the spinlock, we must also ping the
5534  	 * CPU after setting the request bit.
5535  	 *
5536  	 */
5537  
5538  	if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
5539  		return 0;
5540  	if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
5541  		return 0;
5542  
5543  	smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
5544  
5545  	spin_lock(&kvm_lock);
5546  	list_for_each_entry(kvm, &vm_list, vm_list) {
5547  		kvm_for_each_vcpu(i, vcpu, kvm) {
5548  			if (vcpu->cpu != freq->cpu)
5549  				continue;
5550  			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
5551  			if (vcpu->cpu != smp_processor_id())
5552  				send_ipi = 1;
5553  		}
5554  	}
5555  	spin_unlock(&kvm_lock);
5556  
5557  	if (freq->old < freq->new && send_ipi) {
5558  		/*
5559  		 * We upscale the frequency.  Must make the guest
5560  		 * doesn't see old kvmclock values while running with
5561  		 * the new frequency, otherwise we risk the guest sees
5562  		 * time go backwards.
5563  		 *
5564  		 * In case we update the frequency for another cpu
5565  		 * (which might be in guest context) send an interrupt
5566  		 * to kick the cpu out of guest context.  Next time
5567  		 * guest context is entered kvmclock will be updated,
5568  		 * so the guest will not see stale values.
5569  		 */
5570  		smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
5571  	}
5572  	return 0;
5573  }
5574  
5575  static struct notifier_block kvmclock_cpufreq_notifier_block = {
5576  	.notifier_call  = kvmclock_cpufreq_notifier
5577  };
5578  
5579  static int kvmclock_cpu_notifier(struct notifier_block *nfb,
5580  					unsigned long action, void *hcpu)
5581  {
5582  	unsigned int cpu = (unsigned long)hcpu;
5583  
5584  	switch (action) {
5585  		case CPU_ONLINE:
5586  		case CPU_DOWN_FAILED:
5587  			smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
5588  			break;
5589  		case CPU_DOWN_PREPARE:
5590  			smp_call_function_single(cpu, tsc_bad, NULL, 1);
5591  			break;
5592  	}
5593  	return NOTIFY_OK;
5594  }
5595  
5596  static struct notifier_block kvmclock_cpu_notifier_block = {
5597  	.notifier_call  = kvmclock_cpu_notifier,
5598  	.priority = -INT_MAX
5599  };
5600  
5601  static void kvm_timer_init(void)
5602  {
5603  	int cpu;
5604  
5605  	max_tsc_khz = tsc_khz;
5606  
5607  	cpu_notifier_register_begin();
5608  	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
5609  #ifdef CONFIG_CPU_FREQ
5610  		struct cpufreq_policy policy;
5611  		memset(&policy, 0, sizeof(policy));
5612  		cpu = get_cpu();
5613  		cpufreq_get_policy(&policy, cpu);
5614  		if (policy.cpuinfo.max_freq)
5615  			max_tsc_khz = policy.cpuinfo.max_freq;
5616  		put_cpu();
5617  #endif
5618  		cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
5619  					  CPUFREQ_TRANSITION_NOTIFIER);
5620  	}
5621  	pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
5622  	for_each_online_cpu(cpu)
5623  		smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
5624  
5625  	__register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
5626  	cpu_notifier_register_done();
5627  
5628  }
5629  
5630  static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
5631  
5632  int kvm_is_in_guest(void)
5633  {
5634  	return __this_cpu_read(current_vcpu) != NULL;
5635  }
5636  
5637  static int kvm_is_user_mode(void)
5638  {
5639  	int user_mode = 3;
5640  
5641  	if (__this_cpu_read(current_vcpu))
5642  		user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
5643  
5644  	return user_mode != 0;
5645  }
5646  
5647  static unsigned long kvm_get_guest_ip(void)
5648  {
5649  	unsigned long ip = 0;
5650  
5651  	if (__this_cpu_read(current_vcpu))
5652  		ip = kvm_rip_read(__this_cpu_read(current_vcpu));
5653  
5654  	return ip;
5655  }
5656  
5657  static struct perf_guest_info_callbacks kvm_guest_cbs = {
5658  	.is_in_guest		= kvm_is_in_guest,
5659  	.is_user_mode		= kvm_is_user_mode,
5660  	.get_guest_ip		= kvm_get_guest_ip,
5661  };
5662  
5663  void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
5664  {
5665  	__this_cpu_write(current_vcpu, vcpu);
5666  }
5667  EXPORT_SYMBOL_GPL(kvm_before_handle_nmi);
5668  
5669  void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
5670  {
5671  	__this_cpu_write(current_vcpu, NULL);
5672  }
5673  EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);
5674  
5675  static void kvm_set_mmio_spte_mask(void)
5676  {
5677  	u64 mask;
5678  	int maxphyaddr = boot_cpu_data.x86_phys_bits;
5679  
5680  	/*
5681  	 * Set the reserved bits and the present bit of an paging-structure
5682  	 * entry to generate page fault with PFER.RSV = 1.
5683  	 */
5684  	 /* Mask the reserved physical address bits. */
5685  	mask = rsvd_bits(maxphyaddr, 51);
5686  
5687  	/* Bit 62 is always reserved for 32bit host. */
5688  	mask |= 0x3ull << 62;
5689  
5690  	/* Set the present bit. */
5691  	mask |= 1ull;
5692  
5693  #ifdef CONFIG_X86_64
5694  	/*
5695  	 * If reserved bit is not supported, clear the present bit to disable
5696  	 * mmio page fault.
5697  	 */
5698  	if (maxphyaddr == 52)
5699  		mask &= ~1ull;
5700  #endif
5701  
5702  	kvm_mmu_set_mmio_spte_mask(mask);
5703  }
5704  
5705  #ifdef CONFIG_X86_64
5706  static void pvclock_gtod_update_fn(struct work_struct *work)
5707  {
5708  	struct kvm *kvm;
5709  
5710  	struct kvm_vcpu *vcpu;
5711  	int i;
5712  
5713  	spin_lock(&kvm_lock);
5714  	list_for_each_entry(kvm, &vm_list, vm_list)
5715  		kvm_for_each_vcpu(i, vcpu, kvm)
5716  			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
5717  	atomic_set(&kvm_guest_has_master_clock, 0);
5718  	spin_unlock(&kvm_lock);
5719  }
5720  
5721  static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
5722  
5723  /*
5724   * Notification about pvclock gtod data update.
5725   */
5726  static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
5727  			       void *priv)
5728  {
5729  	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
5730  	struct timekeeper *tk = priv;
5731  
5732  	update_pvclock_gtod(tk);
5733  
5734  	/* disable master clock if host does not trust, or does not
5735  	 * use, TSC clocksource
5736  	 */
5737  	if (gtod->clock.vclock_mode != VCLOCK_TSC &&
5738  	    atomic_read(&kvm_guest_has_master_clock) != 0)
5739  		queue_work(system_long_wq, &pvclock_gtod_work);
5740  
5741  	return 0;
5742  }
5743  
5744  static struct notifier_block pvclock_gtod_notifier = {
5745  	.notifier_call = pvclock_gtod_notify,
5746  };
5747  #endif
5748  
5749  int kvm_arch_init(void *opaque)
5750  {
5751  	int r;
5752  	struct kvm_x86_ops *ops = opaque;
5753  
5754  	if (kvm_x86_ops) {
5755  		printk(KERN_ERR "kvm: already loaded the other module\n");
5756  		r = -EEXIST;
5757  		goto out;
5758  	}
5759  
5760  	if (!ops->cpu_has_kvm_support()) {
5761  		printk(KERN_ERR "kvm: no hardware support\n");
5762  		r = -EOPNOTSUPP;
5763  		goto out;
5764  	}
5765  	if (ops->disabled_by_bios()) {
5766  		printk(KERN_ERR "kvm: disabled by bios\n");
5767  		r = -EOPNOTSUPP;
5768  		goto out;
5769  	}
5770  
5771  	r = -ENOMEM;
5772  	shared_msrs = alloc_percpu(struct kvm_shared_msrs);
5773  	if (!shared_msrs) {
5774  		printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
5775  		goto out;
5776  	}
5777  
5778  	r = kvm_mmu_module_init();
5779  	if (r)
5780  		goto out_free_percpu;
5781  
5782  	kvm_set_mmio_spte_mask();
5783  
5784  	kvm_x86_ops = ops;
5785  	kvm_init_msr_list();
5786  
5787  	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
5788  			PT_DIRTY_MASK, PT64_NX_MASK, 0);
5789  
5790  	kvm_timer_init();
5791  
5792  	perf_register_guest_info_callbacks(&kvm_guest_cbs);
5793  
5794  	if (cpu_has_xsave)
5795  		host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
5796  
5797  	kvm_lapic_init();
5798  #ifdef CONFIG_X86_64
5799  	pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
5800  #endif
5801  
5802  	return 0;
5803  
5804  out_free_percpu:
5805  	free_percpu(shared_msrs);
5806  out:
5807  	return r;
5808  }
5809  
5810  void kvm_arch_exit(void)
5811  {
5812  	perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
5813  
5814  	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
5815  		cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
5816  					    CPUFREQ_TRANSITION_NOTIFIER);
5817  	unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
5818  #ifdef CONFIG_X86_64
5819  	pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
5820  #endif
5821  	kvm_x86_ops = NULL;
5822  	kvm_mmu_module_exit();
5823  	free_percpu(shared_msrs);
5824  }
5825  
5826  int kvm_emulate_halt(struct kvm_vcpu *vcpu)
5827  {
5828  	++vcpu->stat.halt_exits;
5829  	if (irqchip_in_kernel(vcpu->kvm)) {
5830  		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
5831  		return 1;
5832  	} else {
5833  		vcpu->run->exit_reason = KVM_EXIT_HLT;
5834  		return 0;
5835  	}
5836  }
5837  EXPORT_SYMBOL_GPL(kvm_emulate_halt);
5838  
5839  int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
5840  {
5841  	u64 param, ingpa, outgpa, ret;
5842  	uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
5843  	bool fast, longmode;
5844  
5845  	/*
5846  	 * hypercall generates UD from non zero cpl and real mode
5847  	 * per HYPER-V spec
5848  	 */
5849  	if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
5850  		kvm_queue_exception(vcpu, UD_VECTOR);
5851  		return 0;
5852  	}
5853  
5854  	longmode = is_64_bit_mode(vcpu);
5855  
5856  	if (!longmode) {
5857  		param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
5858  			(kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
5859  		ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
5860  			(kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
5861  		outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
5862  			(kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
5863  	}
5864  #ifdef CONFIG_X86_64
5865  	else {
5866  		param = kvm_register_read(vcpu, VCPU_REGS_RCX);
5867  		ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
5868  		outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
5869  	}
5870  #endif
5871  
5872  	code = param & 0xffff;
5873  	fast = (param >> 16) & 0x1;
5874  	rep_cnt = (param >> 32) & 0xfff;
5875  	rep_idx = (param >> 48) & 0xfff;
5876  
5877  	trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
5878  
5879  	switch (code) {
5880  	case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
5881  		kvm_vcpu_on_spin(vcpu);
5882  		break;
5883  	default:
5884  		res = HV_STATUS_INVALID_HYPERCALL_CODE;
5885  		break;
5886  	}
5887  
5888  	ret = res | (((u64)rep_done & 0xfff) << 32);
5889  	if (longmode) {
5890  		kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
5891  	} else {
5892  		kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
5893  		kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
5894  	}
5895  
5896  	return 1;
5897  }
5898  
5899  /*
5900   * kvm_pv_kick_cpu_op:  Kick a vcpu.
5901   *
5902   * @apicid - apicid of vcpu to be kicked.
5903   */
5904  static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
5905  {
5906  	struct kvm_lapic_irq lapic_irq;
5907  
5908  	lapic_irq.shorthand = 0;
5909  	lapic_irq.dest_mode = 0;
5910  	lapic_irq.dest_id = apicid;
5911  
5912  	lapic_irq.delivery_mode = APIC_DM_REMRD;
5913  	kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL);
5914  }
5915  
5916  int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
5917  {
5918  	unsigned long nr, a0, a1, a2, a3, ret;
5919  	int op_64_bit, r = 1;
5920  
5921  	if (kvm_hv_hypercall_enabled(vcpu->kvm))
5922  		return kvm_hv_hypercall(vcpu);
5923  
5924  	nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
5925  	a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
5926  	a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
5927  	a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
5928  	a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
5929  
5930  	trace_kvm_hypercall(nr, a0, a1, a2, a3);
5931  
5932  	op_64_bit = is_64_bit_mode(vcpu);
5933  	if (!op_64_bit) {
5934  		nr &= 0xFFFFFFFF;
5935  		a0 &= 0xFFFFFFFF;
5936  		a1 &= 0xFFFFFFFF;
5937  		a2 &= 0xFFFFFFFF;
5938  		a3 &= 0xFFFFFFFF;
5939  	}
5940  
5941  	if (kvm_x86_ops->get_cpl(vcpu) != 0) {
5942  		ret = -KVM_EPERM;
5943  		goto out;
5944  	}
5945  
5946  	switch (nr) {
5947  	case KVM_HC_VAPIC_POLL_IRQ:
5948  		ret = 0;
5949  		break;
5950  	case KVM_HC_KICK_CPU:
5951  		kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
5952  		ret = 0;
5953  		break;
5954  	default:
5955  		ret = -KVM_ENOSYS;
5956  		break;
5957  	}
5958  out:
5959  	if (!op_64_bit)
5960  		ret = (u32)ret;
5961  	kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
5962  	++vcpu->stat.hypercalls;
5963  	return r;
5964  }
5965  EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
5966  
5967  static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
5968  {
5969  	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
5970  	char instruction[3];
5971  	unsigned long rip = kvm_rip_read(vcpu);
5972  
5973  	kvm_x86_ops->patch_hypercall(vcpu, instruction);
5974  
5975  	return emulator_write_emulated(ctxt, rip, instruction, 3, NULL);
5976  }
5977  
5978  /*
5979   * Check if userspace requested an interrupt window, and that the
5980   * interrupt window is open.
5981   *
5982   * No need to exit to userspace if we already have an interrupt queued.
5983   */
5984  static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
5985  {
5986  	return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
5987  		vcpu->run->request_interrupt_window &&
5988  		kvm_arch_interrupt_allowed(vcpu));
5989  }
5990  
5991  static void post_kvm_run_save(struct kvm_vcpu *vcpu)
5992  {
5993  	struct kvm_run *kvm_run = vcpu->run;
5994  
5995  	kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
5996  	kvm_run->cr8 = kvm_get_cr8(vcpu);
5997  	kvm_run->apic_base = kvm_get_apic_base(vcpu);
5998  	if (irqchip_in_kernel(vcpu->kvm))
5999  		kvm_run->ready_for_interrupt_injection = 1;
6000  	else
6001  		kvm_run->ready_for_interrupt_injection =
6002  			kvm_arch_interrupt_allowed(vcpu) &&
6003  			!kvm_cpu_has_interrupt(vcpu) &&
6004  			!kvm_event_needs_reinjection(vcpu);
6005  }
6006  
6007  static void update_cr8_intercept(struct kvm_vcpu *vcpu)
6008  {
6009  	int max_irr, tpr;
6010  
6011  	if (!kvm_x86_ops->update_cr8_intercept)
6012  		return;
6013  
6014  	if (!vcpu->arch.apic)
6015  		return;
6016  
6017  	if (!vcpu->arch.apic->vapic_addr)
6018  		max_irr = kvm_lapic_find_highest_irr(vcpu);
6019  	else
6020  		max_irr = -1;
6021  
6022  	if (max_irr != -1)
6023  		max_irr >>= 4;
6024  
6025  	tpr = kvm_lapic_get_cr8(vcpu);
6026  
6027  	kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
6028  }
6029  
6030  static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
6031  {
6032  	int r;
6033  
6034  	/* try to reinject previous events if any */
6035  	if (vcpu->arch.exception.pending) {
6036  		trace_kvm_inj_exception(vcpu->arch.exception.nr,
6037  					vcpu->arch.exception.has_error_code,
6038  					vcpu->arch.exception.error_code);
6039  
6040  		if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
6041  			__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
6042  					     X86_EFLAGS_RF);
6043  
6044  		if (vcpu->arch.exception.nr == DB_VECTOR &&
6045  		    (vcpu->arch.dr7 & DR7_GD)) {
6046  			vcpu->arch.dr7 &= ~DR7_GD;
6047  			kvm_update_dr7(vcpu);
6048  		}
6049  
6050  		kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
6051  					  vcpu->arch.exception.has_error_code,
6052  					  vcpu->arch.exception.error_code,
6053  					  vcpu->arch.exception.reinject);
6054  		return 0;
6055  	}
6056  
6057  	if (vcpu->arch.nmi_injected) {
6058  		kvm_x86_ops->set_nmi(vcpu);
6059  		return 0;
6060  	}
6061  
6062  	if (vcpu->arch.interrupt.pending) {
6063  		kvm_x86_ops->set_irq(vcpu);
6064  		return 0;
6065  	}
6066  
6067  	if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
6068  		r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
6069  		if (r != 0)
6070  			return r;
6071  	}
6072  
6073  	/* try to inject new event if pending */
6074  	if (vcpu->arch.nmi_pending) {
6075  		if (kvm_x86_ops->nmi_allowed(vcpu)) {
6076  			--vcpu->arch.nmi_pending;
6077  			vcpu->arch.nmi_injected = true;
6078  			kvm_x86_ops->set_nmi(vcpu);
6079  		}
6080  	} else if (kvm_cpu_has_injectable_intr(vcpu)) {
6081  		/*
6082  		 * Because interrupts can be injected asynchronously, we are
6083  		 * calling check_nested_events again here to avoid a race condition.
6084  		 * See https://lkml.org/lkml/2014/7/2/60 for discussion about this
6085  		 * proposal and current concerns.  Perhaps we should be setting
6086  		 * KVM_REQ_EVENT only on certain events and not unconditionally?
6087  		 */
6088  		if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
6089  			r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
6090  			if (r != 0)
6091  				return r;
6092  		}
6093  		if (kvm_x86_ops->interrupt_allowed(vcpu)) {
6094  			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
6095  					    false);
6096  			kvm_x86_ops->set_irq(vcpu);
6097  		}
6098  	}
6099  	return 0;
6100  }
6101  
6102  static void process_nmi(struct kvm_vcpu *vcpu)
6103  {
6104  	unsigned limit = 2;
6105  
6106  	/*
6107  	 * x86 is limited to one NMI running, and one NMI pending after it.
6108  	 * If an NMI is already in progress, limit further NMIs to just one.
6109  	 * Otherwise, allow two (and we'll inject the first one immediately).
6110  	 */
6111  	if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
6112  		limit = 1;
6113  
6114  	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
6115  	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
6116  	kvm_make_request(KVM_REQ_EVENT, vcpu);
6117  }
6118  
6119  static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
6120  {
6121  	u64 eoi_exit_bitmap[4];
6122  	u32 tmr[8];
6123  
6124  	if (!kvm_apic_hw_enabled(vcpu->arch.apic))
6125  		return;
6126  
6127  	memset(eoi_exit_bitmap, 0, 32);
6128  	memset(tmr, 0, 32);
6129  
6130  	kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr);
6131  	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
6132  	kvm_apic_update_tmr(vcpu, tmr);
6133  }
6134  
6135  static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
6136  {
6137  	++vcpu->stat.tlb_flush;
6138  	kvm_x86_ops->tlb_flush(vcpu);
6139  }
6140  
6141  void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
6142  {
6143  	struct page *page = NULL;
6144  
6145  	if (!irqchip_in_kernel(vcpu->kvm))
6146  		return;
6147  
6148  	if (!kvm_x86_ops->set_apic_access_page_addr)
6149  		return;
6150  
6151  	page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
6152  	kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page));
6153  
6154  	/*
6155  	 * Do not pin apic access page in memory, the MMU notifier
6156  	 * will call us again if it is migrated or swapped out.
6157  	 */
6158  	put_page(page);
6159  }
6160  EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
6161  
6162  void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
6163  					   unsigned long address)
6164  {
6165  	/*
6166  	 * The physical address of apic access page is stored in the VMCS.
6167  	 * Update it when it becomes invalid.
6168  	 */
6169  	if (address == gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT))
6170  		kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
6171  }
6172  
6173  /*
6174   * Returns 1 to let __vcpu_run() continue the guest execution loop without
6175   * exiting to the userspace.  Otherwise, the value will be returned to the
6176   * userspace.
6177   */
6178  static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
6179  {
6180  	int r;
6181  	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
6182  		vcpu->run->request_interrupt_window;
6183  	bool req_immediate_exit = false;
6184  
6185  	if (vcpu->requests) {
6186  		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
6187  			kvm_mmu_unload(vcpu);
6188  		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
6189  			__kvm_migrate_timers(vcpu);
6190  		if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
6191  			kvm_gen_update_masterclock(vcpu->kvm);
6192  		if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
6193  			kvm_gen_kvmclock_update(vcpu);
6194  		if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
6195  			r = kvm_guest_time_update(vcpu);
6196  			if (unlikely(r))
6197  				goto out;
6198  		}
6199  		if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
6200  			kvm_mmu_sync_roots(vcpu);
6201  		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
6202  			kvm_vcpu_flush_tlb(vcpu);
6203  		if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
6204  			vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
6205  			r = 0;
6206  			goto out;
6207  		}
6208  		if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
6209  			vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
6210  			r = 0;
6211  			goto out;
6212  		}
6213  		if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
6214  			vcpu->fpu_active = 0;
6215  			kvm_x86_ops->fpu_deactivate(vcpu);
6216  		}
6217  		if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
6218  			/* Page is swapped out. Do synthetic halt */
6219  			vcpu->arch.apf.halted = true;
6220  			r = 1;
6221  			goto out;
6222  		}
6223  		if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
6224  			record_steal_time(vcpu);
6225  		if (kvm_check_request(KVM_REQ_NMI, vcpu))
6226  			process_nmi(vcpu);
6227  		if (kvm_check_request(KVM_REQ_PMU, vcpu))
6228  			kvm_handle_pmu_event(vcpu);
6229  		if (kvm_check_request(KVM_REQ_PMI, vcpu))
6230  			kvm_deliver_pmi(vcpu);
6231  		if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
6232  			vcpu_scan_ioapic(vcpu);
6233  		if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
6234  			kvm_vcpu_reload_apic_access_page(vcpu);
6235  	}
6236  
6237  	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
6238  		kvm_apic_accept_events(vcpu);
6239  		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
6240  			r = 1;
6241  			goto out;
6242  		}
6243  
6244  		if (inject_pending_event(vcpu, req_int_win) != 0)
6245  			req_immediate_exit = true;
6246  		/* enable NMI/IRQ window open exits if needed */
6247  		else if (vcpu->arch.nmi_pending)
6248  			kvm_x86_ops->enable_nmi_window(vcpu);
6249  		else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
6250  			kvm_x86_ops->enable_irq_window(vcpu);
6251  
6252  		if (kvm_lapic_enabled(vcpu)) {
6253  			/*
6254  			 * Update architecture specific hints for APIC
6255  			 * virtual interrupt delivery.
6256  			 */
6257  			if (kvm_x86_ops->hwapic_irr_update)
6258  				kvm_x86_ops->hwapic_irr_update(vcpu,
6259  					kvm_lapic_find_highest_irr(vcpu));
6260  			update_cr8_intercept(vcpu);
6261  			kvm_lapic_sync_to_vapic(vcpu);
6262  		}
6263  	}
6264  
6265  	r = kvm_mmu_reload(vcpu);
6266  	if (unlikely(r)) {
6267  		goto cancel_injection;
6268  	}
6269  
6270  	preempt_disable();
6271  
6272  	kvm_x86_ops->prepare_guest_switch(vcpu);
6273  	if (vcpu->fpu_active)
6274  		kvm_load_guest_fpu(vcpu);
6275  	kvm_load_guest_xcr0(vcpu);
6276  
6277  	vcpu->mode = IN_GUEST_MODE;
6278  
6279  	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
6280  
6281  	/* We should set ->mode before check ->requests,
6282  	 * see the comment in make_all_cpus_request.
6283  	 */
6284  	smp_mb__after_srcu_read_unlock();
6285  
6286  	local_irq_disable();
6287  
6288  	if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
6289  	    || need_resched() || signal_pending(current)) {
6290  		vcpu->mode = OUTSIDE_GUEST_MODE;
6291  		smp_wmb();
6292  		local_irq_enable();
6293  		preempt_enable();
6294  		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
6295  		r = 1;
6296  		goto cancel_injection;
6297  	}
6298  
6299  	if (req_immediate_exit)
6300  		smp_send_reschedule(vcpu->cpu);
6301  
6302  	kvm_guest_enter();
6303  
6304  	if (unlikely(vcpu->arch.switch_db_regs)) {
6305  		set_debugreg(0, 7);
6306  		set_debugreg(vcpu->arch.eff_db[0], 0);
6307  		set_debugreg(vcpu->arch.eff_db[1], 1);
6308  		set_debugreg(vcpu->arch.eff_db[2], 2);
6309  		set_debugreg(vcpu->arch.eff_db[3], 3);
6310  		set_debugreg(vcpu->arch.dr6, 6);
6311  	}
6312  
6313  	trace_kvm_entry(vcpu->vcpu_id);
6314  	kvm_x86_ops->run(vcpu);
6315  
6316  	/*
6317  	 * Do this here before restoring debug registers on the host.  And
6318  	 * since we do this before handling the vmexit, a DR access vmexit
6319  	 * can (a) read the correct value of the debug registers, (b) set
6320  	 * KVM_DEBUGREG_WONT_EXIT again.
6321  	 */
6322  	if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
6323  		int i;
6324  
6325  		WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
6326  		kvm_x86_ops->sync_dirty_debug_regs(vcpu);
6327  		for (i = 0; i < KVM_NR_DB_REGS; i++)
6328  			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
6329  	}
6330  
6331  	/*
6332  	 * If the guest has used debug registers, at least dr7
6333  	 * will be disabled while returning to the host.
6334  	 * If we don't have active breakpoints in the host, we don't
6335  	 * care about the messed up debug address registers. But if
6336  	 * we have some of them active, restore the old state.
6337  	 */
6338  	if (hw_breakpoint_active())
6339  		hw_breakpoint_restore();
6340  
6341  	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
6342  							   native_read_tsc());
6343  
6344  	vcpu->mode = OUTSIDE_GUEST_MODE;
6345  	smp_wmb();
6346  
6347  	/* Interrupt is enabled by handle_external_intr() */
6348  	kvm_x86_ops->handle_external_intr(vcpu);
6349  
6350  	++vcpu->stat.exits;
6351  
6352  	/*
6353  	 * We must have an instruction between local_irq_enable() and
6354  	 * kvm_guest_exit(), so the timer interrupt isn't delayed by
6355  	 * the interrupt shadow.  The stat.exits increment will do nicely.
6356  	 * But we need to prevent reordering, hence this barrier():
6357  	 */
6358  	barrier();
6359  
6360  	kvm_guest_exit();
6361  
6362  	preempt_enable();
6363  
6364  	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
6365  
6366  	/*
6367  	 * Profile KVM exit RIPs:
6368  	 */
6369  	if (unlikely(prof_on == KVM_PROFILING)) {
6370  		unsigned long rip = kvm_rip_read(vcpu);
6371  		profile_hit(KVM_PROFILING, (void *)rip);
6372  	}
6373  
6374  	if (unlikely(vcpu->arch.tsc_always_catchup))
6375  		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
6376  
6377  	if (vcpu->arch.apic_attention)
6378  		kvm_lapic_sync_from_vapic(vcpu);
6379  
6380  	r = kvm_x86_ops->handle_exit(vcpu);
6381  	return r;
6382  
6383  cancel_injection:
6384  	kvm_x86_ops->cancel_injection(vcpu);
6385  	if (unlikely(vcpu->arch.apic_attention))
6386  		kvm_lapic_sync_from_vapic(vcpu);
6387  out:
6388  	return r;
6389  }
6390  
6391  
6392  static int __vcpu_run(struct kvm_vcpu *vcpu)
6393  {
6394  	int r;
6395  	struct kvm *kvm = vcpu->kvm;
6396  
6397  	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
6398  
6399  	r = 1;
6400  	while (r > 0) {
6401  		if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
6402  		    !vcpu->arch.apf.halted)
6403  			r = vcpu_enter_guest(vcpu);
6404  		else {
6405  			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
6406  			kvm_vcpu_block(vcpu);
6407  			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
6408  			if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
6409  				kvm_apic_accept_events(vcpu);
6410  				switch(vcpu->arch.mp_state) {
6411  				case KVM_MP_STATE_HALTED:
6412  					vcpu->arch.pv.pv_unhalted = false;
6413  					vcpu->arch.mp_state =
6414  						KVM_MP_STATE_RUNNABLE;
6415  				case KVM_MP_STATE_RUNNABLE:
6416  					vcpu->arch.apf.halted = false;
6417  					break;
6418  				case KVM_MP_STATE_INIT_RECEIVED:
6419  					break;
6420  				default:
6421  					r = -EINTR;
6422  					break;
6423  				}
6424  			}
6425  		}
6426  
6427  		if (r <= 0)
6428  			break;
6429  
6430  		clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
6431  		if (kvm_cpu_has_pending_timer(vcpu))
6432  			kvm_inject_pending_timer_irqs(vcpu);
6433  
6434  		if (dm_request_for_irq_injection(vcpu)) {
6435  			r = -EINTR;
6436  			vcpu->run->exit_reason = KVM_EXIT_INTR;
6437  			++vcpu->stat.request_irq_exits;
6438  		}
6439  
6440  		kvm_check_async_pf_completion(vcpu);
6441  
6442  		if (signal_pending(current)) {
6443  			r = -EINTR;
6444  			vcpu->run->exit_reason = KVM_EXIT_INTR;
6445  			++vcpu->stat.signal_exits;
6446  		}
6447  		if (need_resched()) {
6448  			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
6449  			cond_resched();
6450  			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
6451  		}
6452  	}
6453  
6454  	srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
6455  
6456  	return r;
6457  }
6458  
6459  static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
6460  {
6461  	int r;
6462  	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
6463  	r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
6464  	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
6465  	if (r != EMULATE_DONE)
6466  		return 0;
6467  	return 1;
6468  }
6469  
6470  static int complete_emulated_pio(struct kvm_vcpu *vcpu)
6471  {
6472  	BUG_ON(!vcpu->arch.pio.count);
6473  
6474  	return complete_emulated_io(vcpu);
6475  }
6476  
6477  /*
6478   * Implements the following, as a state machine:
6479   *
6480   * read:
6481   *   for each fragment
6482   *     for each mmio piece in the fragment
6483   *       write gpa, len
6484   *       exit
6485   *       copy data
6486   *   execute insn
6487   *
6488   * write:
6489   *   for each fragment
6490   *     for each mmio piece in the fragment
6491   *       write gpa, len
6492   *       copy data
6493   *       exit
6494   */
6495  static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
6496  {
6497  	struct kvm_run *run = vcpu->run;
6498  	struct kvm_mmio_fragment *frag;
6499  	unsigned len;
6500  
6501  	BUG_ON(!vcpu->mmio_needed);
6502  
6503  	/* Complete previous fragment */
6504  	frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
6505  	len = min(8u, frag->len);
6506  	if (!vcpu->mmio_is_write)
6507  		memcpy(frag->data, run->mmio.data, len);
6508  
6509  	if (frag->len <= 8) {
6510  		/* Switch to the next fragment. */
6511  		frag++;
6512  		vcpu->mmio_cur_fragment++;
6513  	} else {
6514  		/* Go forward to the next mmio piece. */
6515  		frag->data += len;
6516  		frag->gpa += len;
6517  		frag->len -= len;
6518  	}
6519  
6520  	if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
6521  		vcpu->mmio_needed = 0;
6522  
6523  		/* FIXME: return into emulator if single-stepping.  */
6524  		if (vcpu->mmio_is_write)
6525  			return 1;
6526  		vcpu->mmio_read_completed = 1;
6527  		return complete_emulated_io(vcpu);
6528  	}
6529  
6530  	run->exit_reason = KVM_EXIT_MMIO;
6531  	run->mmio.phys_addr = frag->gpa;
6532  	if (vcpu->mmio_is_write)
6533  		memcpy(run->mmio.data, frag->data, min(8u, frag->len));
6534  	run->mmio.len = min(8u, frag->len);
6535  	run->mmio.is_write = vcpu->mmio_is_write;
6536  	vcpu->arch.complete_userspace_io = complete_emulated_mmio;
6537  	return 0;
6538  }
6539  
6540  
6541  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
6542  {
6543  	int r;
6544  	sigset_t sigsaved;
6545  
6546  	if (!tsk_used_math(current) && init_fpu(current))
6547  		return -ENOMEM;
6548  
6549  	if (vcpu->sigset_active)
6550  		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
6551  
6552  	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
6553  		kvm_vcpu_block(vcpu);
6554  		kvm_apic_accept_events(vcpu);
6555  		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
6556  		r = -EAGAIN;
6557  		goto out;
6558  	}
6559  
6560  	/* re-sync apic's tpr */
6561  	if (!irqchip_in_kernel(vcpu->kvm)) {
6562  		if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
6563  			r = -EINVAL;
6564  			goto out;
6565  		}
6566  	}
6567  
6568  	if (unlikely(vcpu->arch.complete_userspace_io)) {
6569  		int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
6570  		vcpu->arch.complete_userspace_io = NULL;
6571  		r = cui(vcpu);
6572  		if (r <= 0)
6573  			goto out;
6574  	} else
6575  		WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
6576  
6577  	r = __vcpu_run(vcpu);
6578  
6579  out:
6580  	post_kvm_run_save(vcpu);
6581  	if (vcpu->sigset_active)
6582  		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
6583  
6584  	return r;
6585  }
6586  
6587  int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
6588  {
6589  	if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
6590  		/*
6591  		 * We are here if userspace calls get_regs() in the middle of
6592  		 * instruction emulation. Registers state needs to be copied
6593  		 * back from emulation context to vcpu. Userspace shouldn't do
6594  		 * that usually, but some bad designed PV devices (vmware
6595  		 * backdoor interface) need this to work
6596  		 */
6597  		emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
6598  		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
6599  	}
6600  	regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
6601  	regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
6602  	regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
6603  	regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
6604  	regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
6605  	regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
6606  	regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
6607  	regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
6608  #ifdef CONFIG_X86_64
6609  	regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
6610  	regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
6611  	regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
6612  	regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
6613  	regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
6614  	regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
6615  	regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
6616  	regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
6617  #endif
6618  
6619  	regs->rip = kvm_rip_read(vcpu);
6620  	regs->rflags = kvm_get_rflags(vcpu);
6621  
6622  	return 0;
6623  }
6624  
6625  int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
6626  {
6627  	vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
6628  	vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
6629  
6630  	kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
6631  	kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
6632  	kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
6633  	kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
6634  	kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
6635  	kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
6636  	kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
6637  	kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
6638  #ifdef CONFIG_X86_64
6639  	kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
6640  	kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
6641  	kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
6642  	kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
6643  	kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
6644  	kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
6645  	kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
6646  	kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
6647  #endif
6648  
6649  	kvm_rip_write(vcpu, regs->rip);
6650  	kvm_set_rflags(vcpu, regs->rflags);
6651  
6652  	vcpu->arch.exception.pending = false;
6653  
6654  	kvm_make_request(KVM_REQ_EVENT, vcpu);
6655  
6656  	return 0;
6657  }
6658  
6659  void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
6660  {
6661  	struct kvm_segment cs;
6662  
6663  	kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
6664  	*db = cs.db;
6665  	*l = cs.l;
6666  }
6667  EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
6668  
6669  int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
6670  				  struct kvm_sregs *sregs)
6671  {
6672  	struct desc_ptr dt;
6673  
6674  	kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
6675  	kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
6676  	kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
6677  	kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
6678  	kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
6679  	kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
6680  
6681  	kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
6682  	kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
6683  
6684  	kvm_x86_ops->get_idt(vcpu, &dt);
6685  	sregs->idt.limit = dt.size;
6686  	sregs->idt.base = dt.address;
6687  	kvm_x86_ops->get_gdt(vcpu, &dt);
6688  	sregs->gdt.limit = dt.size;
6689  	sregs->gdt.base = dt.address;
6690  
6691  	sregs->cr0 = kvm_read_cr0(vcpu);
6692  	sregs->cr2 = vcpu->arch.cr2;
6693  	sregs->cr3 = kvm_read_cr3(vcpu);
6694  	sregs->cr4 = kvm_read_cr4(vcpu);
6695  	sregs->cr8 = kvm_get_cr8(vcpu);
6696  	sregs->efer = vcpu->arch.efer;
6697  	sregs->apic_base = kvm_get_apic_base(vcpu);
6698  
6699  	memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
6700  
6701  	if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
6702  		set_bit(vcpu->arch.interrupt.nr,
6703  			(unsigned long *)sregs->interrupt_bitmap);
6704  
6705  	return 0;
6706  }
6707  
6708  int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
6709  				    struct kvm_mp_state *mp_state)
6710  {
6711  	kvm_apic_accept_events(vcpu);
6712  	if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
6713  					vcpu->arch.pv.pv_unhalted)
6714  		mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
6715  	else
6716  		mp_state->mp_state = vcpu->arch.mp_state;
6717  
6718  	return 0;
6719  }
6720  
6721  int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
6722  				    struct kvm_mp_state *mp_state)
6723  {
6724  	if (!kvm_vcpu_has_lapic(vcpu) &&
6725  	    mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
6726  		return -EINVAL;
6727  
6728  	if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
6729  		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
6730  		set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
6731  	} else
6732  		vcpu->arch.mp_state = mp_state->mp_state;
6733  	kvm_make_request(KVM_REQ_EVENT, vcpu);
6734  	return 0;
6735  }
6736  
6737  int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
6738  		    int reason, bool has_error_code, u32 error_code)
6739  {
6740  	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
6741  	int ret;
6742  
6743  	init_emulate_ctxt(vcpu);
6744  
6745  	ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
6746  				   has_error_code, error_code);
6747  
6748  	if (ret)
6749  		return EMULATE_FAIL;
6750  
6751  	kvm_rip_write(vcpu, ctxt->eip);
6752  	kvm_set_rflags(vcpu, ctxt->eflags);
6753  	kvm_make_request(KVM_REQ_EVENT, vcpu);
6754  	return EMULATE_DONE;
6755  }
6756  EXPORT_SYMBOL_GPL(kvm_task_switch);
6757  
6758  int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
6759  				  struct kvm_sregs *sregs)
6760  {
6761  	struct msr_data apic_base_msr;
6762  	int mmu_reset_needed = 0;
6763  	int pending_vec, max_bits, idx;
6764  	struct desc_ptr dt;
6765  
6766  	if (!guest_cpuid_has_xsave(vcpu) && (sregs->cr4 & X86_CR4_OSXSAVE))
6767  		return -EINVAL;
6768  
6769  	dt.size = sregs->idt.limit;
6770  	dt.address = sregs->idt.base;
6771  	kvm_x86_ops->set_idt(vcpu, &dt);
6772  	dt.size = sregs->gdt.limit;
6773  	dt.address = sregs->gdt.base;
6774  	kvm_x86_ops->set_gdt(vcpu, &dt);
6775  
6776  	vcpu->arch.cr2 = sregs->cr2;
6777  	mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
6778  	vcpu->arch.cr3 = sregs->cr3;
6779  	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
6780  
6781  	kvm_set_cr8(vcpu, sregs->cr8);
6782  
6783  	mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
6784  	kvm_x86_ops->set_efer(vcpu, sregs->efer);
6785  	apic_base_msr.data = sregs->apic_base;
6786  	apic_base_msr.host_initiated = true;
6787  	kvm_set_apic_base(vcpu, &apic_base_msr);
6788  
6789  	mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
6790  	kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
6791  	vcpu->arch.cr0 = sregs->cr0;
6792  
6793  	mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
6794  	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
6795  	if (sregs->cr4 & X86_CR4_OSXSAVE)
6796  		kvm_update_cpuid(vcpu);
6797  
6798  	idx = srcu_read_lock(&vcpu->kvm->srcu);
6799  	if (!is_long_mode(vcpu) && is_pae(vcpu)) {
6800  		load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
6801  		mmu_reset_needed = 1;
6802  	}
6803  	srcu_read_unlock(&vcpu->kvm->srcu, idx);
6804  
6805  	if (mmu_reset_needed)
6806  		kvm_mmu_reset_context(vcpu);
6807  
6808  	max_bits = KVM_NR_INTERRUPTS;
6809  	pending_vec = find_first_bit(
6810  		(const unsigned long *)sregs->interrupt_bitmap, max_bits);
6811  	if (pending_vec < max_bits) {
6812  		kvm_queue_interrupt(vcpu, pending_vec, false);
6813  		pr_debug("Set back pending irq %d\n", pending_vec);
6814  	}
6815  
6816  	kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
6817  	kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
6818  	kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
6819  	kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
6820  	kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
6821  	kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
6822  
6823  	kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
6824  	kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
6825  
6826  	update_cr8_intercept(vcpu);
6827  
6828  	/* Older userspace won't unhalt the vcpu on reset. */
6829  	if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
6830  	    sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
6831  	    !is_protmode(vcpu))
6832  		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
6833  
6834  	kvm_make_request(KVM_REQ_EVENT, vcpu);
6835  
6836  	return 0;
6837  }
6838  
6839  int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
6840  					struct kvm_guest_debug *dbg)
6841  {
6842  	unsigned long rflags;
6843  	int i, r;
6844  
6845  	if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
6846  		r = -EBUSY;
6847  		if (vcpu->arch.exception.pending)
6848  			goto out;
6849  		if (dbg->control & KVM_GUESTDBG_INJECT_DB)
6850  			kvm_queue_exception(vcpu, DB_VECTOR);
6851  		else
6852  			kvm_queue_exception(vcpu, BP_VECTOR);
6853  	}
6854  
6855  	/*
6856  	 * Read rflags as long as potentially injected trace flags are still
6857  	 * filtered out.
6858  	 */
6859  	rflags = kvm_get_rflags(vcpu);
6860  
6861  	vcpu->guest_debug = dbg->control;
6862  	if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
6863  		vcpu->guest_debug = 0;
6864  
6865  	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
6866  		for (i = 0; i < KVM_NR_DB_REGS; ++i)
6867  			vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
6868  		vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
6869  	} else {
6870  		for (i = 0; i < KVM_NR_DB_REGS; i++)
6871  			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
6872  	}
6873  	kvm_update_dr7(vcpu);
6874  
6875  	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
6876  		vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
6877  			get_segment_base(vcpu, VCPU_SREG_CS);
6878  
6879  	/*
6880  	 * Trigger an rflags update that will inject or remove the trace
6881  	 * flags.
6882  	 */
6883  	kvm_set_rflags(vcpu, rflags);
6884  
6885  	kvm_x86_ops->update_db_bp_intercept(vcpu);
6886  
6887  	r = 0;
6888  
6889  out:
6890  
6891  	return r;
6892  }
6893  
6894  /*
6895   * Translate a guest virtual address to a guest physical address.
6896   */
6897  int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
6898  				    struct kvm_translation *tr)
6899  {
6900  	unsigned long vaddr = tr->linear_address;
6901  	gpa_t gpa;
6902  	int idx;
6903  
6904  	idx = srcu_read_lock(&vcpu->kvm->srcu);
6905  	gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
6906  	srcu_read_unlock(&vcpu->kvm->srcu, idx);
6907  	tr->physical_address = gpa;
6908  	tr->valid = gpa != UNMAPPED_GVA;
6909  	tr->writeable = 1;
6910  	tr->usermode = 0;
6911  
6912  	return 0;
6913  }
6914  
6915  int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
6916  {
6917  	struct i387_fxsave_struct *fxsave =
6918  			&vcpu->arch.guest_fpu.state->fxsave;
6919  
6920  	memcpy(fpu->fpr, fxsave->st_space, 128);
6921  	fpu->fcw = fxsave->cwd;
6922  	fpu->fsw = fxsave->swd;
6923  	fpu->ftwx = fxsave->twd;
6924  	fpu->last_opcode = fxsave->fop;
6925  	fpu->last_ip = fxsave->rip;
6926  	fpu->last_dp = fxsave->rdp;
6927  	memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
6928  
6929  	return 0;
6930  }
6931  
6932  int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
6933  {
6934  	struct i387_fxsave_struct *fxsave =
6935  			&vcpu->arch.guest_fpu.state->fxsave;
6936  
6937  	memcpy(fxsave->st_space, fpu->fpr, 128);
6938  	fxsave->cwd = fpu->fcw;
6939  	fxsave->swd = fpu->fsw;
6940  	fxsave->twd = fpu->ftwx;
6941  	fxsave->fop = fpu->last_opcode;
6942  	fxsave->rip = fpu->last_ip;
6943  	fxsave->rdp = fpu->last_dp;
6944  	memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
6945  
6946  	return 0;
6947  }
6948  
6949  int fx_init(struct kvm_vcpu *vcpu)
6950  {
6951  	int err;
6952  
6953  	err = fpu_alloc(&vcpu->arch.guest_fpu);
6954  	if (err)
6955  		return err;
6956  
6957  	fpu_finit(&vcpu->arch.guest_fpu);
6958  	if (cpu_has_xsaves)
6959  		vcpu->arch.guest_fpu.state->xsave.xsave_hdr.xcomp_bv =
6960  			host_xcr0 | XSTATE_COMPACTION_ENABLED;
6961  
6962  	/*
6963  	 * Ensure guest xcr0 is valid for loading
6964  	 */
6965  	vcpu->arch.xcr0 = XSTATE_FP;
6966  
6967  	vcpu->arch.cr0 |= X86_CR0_ET;
6968  
6969  	return 0;
6970  }
6971  EXPORT_SYMBOL_GPL(fx_init);
6972  
6973  static void fx_free(struct kvm_vcpu *vcpu)
6974  {
6975  	fpu_free(&vcpu->arch.guest_fpu);
6976  }
6977  
6978  void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
6979  {
6980  	if (vcpu->guest_fpu_loaded)
6981  		return;
6982  
6983  	/*
6984  	 * Restore all possible states in the guest,
6985  	 * and assume host would use all available bits.
6986  	 * Guest xcr0 would be loaded later.
6987  	 */
6988  	kvm_put_guest_xcr0(vcpu);
6989  	vcpu->guest_fpu_loaded = 1;
6990  	__kernel_fpu_begin();
6991  	fpu_restore_checking(&vcpu->arch.guest_fpu);
6992  	trace_kvm_fpu(1);
6993  }
6994  
6995  void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
6996  {
6997  	kvm_put_guest_xcr0(vcpu);
6998  
6999  	if (!vcpu->guest_fpu_loaded)
7000  		return;
7001  
7002  	vcpu->guest_fpu_loaded = 0;
7003  	fpu_save_init(&vcpu->arch.guest_fpu);
7004  	__kernel_fpu_end();
7005  	++vcpu->stat.fpu_reload;
7006  	kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
7007  	trace_kvm_fpu(0);
7008  }
7009  
7010  void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
7011  {
7012  	kvmclock_reset(vcpu);
7013  
7014  	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
7015  	fx_free(vcpu);
7016  	kvm_x86_ops->vcpu_free(vcpu);
7017  }
7018  
7019  struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
7020  						unsigned int id)
7021  {
7022  	if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
7023  		printk_once(KERN_WARNING
7024  		"kvm: SMP vm created on host with unstable TSC; "
7025  		"guest TSC will not be reliable\n");
7026  	return kvm_x86_ops->vcpu_create(kvm, id);
7027  }
7028  
7029  int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
7030  {
7031  	int r;
7032  
7033  	vcpu->arch.mtrr_state.have_fixed = 1;
7034  	r = vcpu_load(vcpu);
7035  	if (r)
7036  		return r;
7037  	kvm_vcpu_reset(vcpu);
7038  	kvm_mmu_setup(vcpu);
7039  	vcpu_put(vcpu);
7040  
7041  	return r;
7042  }
7043  
7044  int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
7045  {
7046  	int r;
7047  	struct msr_data msr;
7048  	struct kvm *kvm = vcpu->kvm;
7049  
7050  	r = vcpu_load(vcpu);
7051  	if (r)
7052  		return r;
7053  	msr.data = 0x0;
7054  	msr.index = MSR_IA32_TSC;
7055  	msr.host_initiated = true;
7056  	kvm_write_tsc(vcpu, &msr);
7057  	vcpu_put(vcpu);
7058  
7059  	schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
7060  					KVMCLOCK_SYNC_PERIOD);
7061  
7062  	return r;
7063  }
7064  
7065  void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
7066  {
7067  	int r;
7068  	vcpu->arch.apf.msr_val = 0;
7069  
7070  	r = vcpu_load(vcpu);
7071  	BUG_ON(r);
7072  	kvm_mmu_unload(vcpu);
7073  	vcpu_put(vcpu);
7074  
7075  	fx_free(vcpu);
7076  	kvm_x86_ops->vcpu_free(vcpu);
7077  }
7078  
7079  void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
7080  {
7081  	atomic_set(&vcpu->arch.nmi_queued, 0);
7082  	vcpu->arch.nmi_pending = 0;
7083  	vcpu->arch.nmi_injected = false;
7084  	kvm_clear_interrupt_queue(vcpu);
7085  	kvm_clear_exception_queue(vcpu);
7086  
7087  	memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
7088  	vcpu->arch.dr6 = DR6_INIT;
7089  	kvm_update_dr6(vcpu);
7090  	vcpu->arch.dr7 = DR7_FIXED_1;
7091  	kvm_update_dr7(vcpu);
7092  
7093  	kvm_make_request(KVM_REQ_EVENT, vcpu);
7094  	vcpu->arch.apf.msr_val = 0;
7095  	vcpu->arch.st.msr_val = 0;
7096  
7097  	kvmclock_reset(vcpu);
7098  
7099  	kvm_clear_async_pf_completion_queue(vcpu);
7100  	kvm_async_pf_hash_reset(vcpu);
7101  	vcpu->arch.apf.halted = false;
7102  
7103  	kvm_pmu_reset(vcpu);
7104  
7105  	memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
7106  	vcpu->arch.regs_avail = ~0;
7107  	vcpu->arch.regs_dirty = ~0;
7108  
7109  	kvm_x86_ops->vcpu_reset(vcpu);
7110  }
7111  
7112  void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
7113  {
7114  	struct kvm_segment cs;
7115  
7116  	kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
7117  	cs.selector = vector << 8;
7118  	cs.base = vector << 12;
7119  	kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
7120  	kvm_rip_write(vcpu, 0);
7121  }
7122  
7123  int kvm_arch_hardware_enable(void)
7124  {
7125  	struct kvm *kvm;
7126  	struct kvm_vcpu *vcpu;
7127  	int i;
7128  	int ret;
7129  	u64 local_tsc;
7130  	u64 max_tsc = 0;
7131  	bool stable, backwards_tsc = false;
7132  
7133  	kvm_shared_msr_cpu_online();
7134  	ret = kvm_x86_ops->hardware_enable();
7135  	if (ret != 0)
7136  		return ret;
7137  
7138  	local_tsc = native_read_tsc();
7139  	stable = !check_tsc_unstable();
7140  	list_for_each_entry(kvm, &vm_list, vm_list) {
7141  		kvm_for_each_vcpu(i, vcpu, kvm) {
7142  			if (!stable && vcpu->cpu == smp_processor_id())
7143  				kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
7144  			if (stable && vcpu->arch.last_host_tsc > local_tsc) {
7145  				backwards_tsc = true;
7146  				if (vcpu->arch.last_host_tsc > max_tsc)
7147  					max_tsc = vcpu->arch.last_host_tsc;
7148  			}
7149  		}
7150  	}
7151  
7152  	/*
7153  	 * Sometimes, even reliable TSCs go backwards.  This happens on
7154  	 * platforms that reset TSC during suspend or hibernate actions, but
7155  	 * maintain synchronization.  We must compensate.  Fortunately, we can
7156  	 * detect that condition here, which happens early in CPU bringup,
7157  	 * before any KVM threads can be running.  Unfortunately, we can't
7158  	 * bring the TSCs fully up to date with real time, as we aren't yet far
7159  	 * enough into CPU bringup that we know how much real time has actually
7160  	 * elapsed; our helper function, get_kernel_ns() will be using boot
7161  	 * variables that haven't been updated yet.
7162  	 *
7163  	 * So we simply find the maximum observed TSC above, then record the
7164  	 * adjustment to TSC in each VCPU.  When the VCPU later gets loaded,
7165  	 * the adjustment will be applied.  Note that we accumulate
7166  	 * adjustments, in case multiple suspend cycles happen before some VCPU
7167  	 * gets a chance to run again.  In the event that no KVM threads get a
7168  	 * chance to run, we will miss the entire elapsed period, as we'll have
7169  	 * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
7170  	 * loose cycle time.  This isn't too big a deal, since the loss will be
7171  	 * uniform across all VCPUs (not to mention the scenario is extremely
7172  	 * unlikely). It is possible that a second hibernate recovery happens
7173  	 * much faster than a first, causing the observed TSC here to be
7174  	 * smaller; this would require additional padding adjustment, which is
7175  	 * why we set last_host_tsc to the local tsc observed here.
7176  	 *
7177  	 * N.B. - this code below runs only on platforms with reliable TSC,
7178  	 * as that is the only way backwards_tsc is set above.  Also note
7179  	 * that this runs for ALL vcpus, which is not a bug; all VCPUs should
7180  	 * have the same delta_cyc adjustment applied if backwards_tsc
7181  	 * is detected.  Note further, this adjustment is only done once,
7182  	 * as we reset last_host_tsc on all VCPUs to stop this from being
7183  	 * called multiple times (one for each physical CPU bringup).
7184  	 *
7185  	 * Platforms with unreliable TSCs don't have to deal with this, they
7186  	 * will be compensated by the logic in vcpu_load, which sets the TSC to
7187  	 * catchup mode.  This will catchup all VCPUs to real time, but cannot
7188  	 * guarantee that they stay in perfect synchronization.
7189  	 */
7190  	if (backwards_tsc) {
7191  		u64 delta_cyc = max_tsc - local_tsc;
7192  		backwards_tsc_observed = true;
7193  		list_for_each_entry(kvm, &vm_list, vm_list) {
7194  			kvm_for_each_vcpu(i, vcpu, kvm) {
7195  				vcpu->arch.tsc_offset_adjustment += delta_cyc;
7196  				vcpu->arch.last_host_tsc = local_tsc;
7197  				kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
7198  			}
7199  
7200  			/*
7201  			 * We have to disable TSC offset matching.. if you were
7202  			 * booting a VM while issuing an S4 host suspend....
7203  			 * you may have some problem.  Solving this issue is
7204  			 * left as an exercise to the reader.
7205  			 */
7206  			kvm->arch.last_tsc_nsec = 0;
7207  			kvm->arch.last_tsc_write = 0;
7208  		}
7209  
7210  	}
7211  	return 0;
7212  }
7213  
7214  void kvm_arch_hardware_disable(void)
7215  {
7216  	kvm_x86_ops->hardware_disable();
7217  	drop_user_return_notifiers();
7218  }
7219  
7220  int kvm_arch_hardware_setup(void)
7221  {
7222  	return kvm_x86_ops->hardware_setup();
7223  }
7224  
7225  void kvm_arch_hardware_unsetup(void)
7226  {
7227  	kvm_x86_ops->hardware_unsetup();
7228  }
7229  
7230  void kvm_arch_check_processor_compat(void *rtn)
7231  {
7232  	kvm_x86_ops->check_processor_compatibility(rtn);
7233  }
7234  
7235  bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
7236  {
7237  	return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
7238  }
7239  
7240  struct static_key kvm_no_apic_vcpu __read_mostly;
7241  
7242  int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
7243  {
7244  	struct page *page;
7245  	struct kvm *kvm;
7246  	int r;
7247  
7248  	BUG_ON(vcpu->kvm == NULL);
7249  	kvm = vcpu->kvm;
7250  
7251  	vcpu->arch.pv.pv_unhalted = false;
7252  	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
7253  	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
7254  		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
7255  	else
7256  		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
7257  
7258  	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
7259  	if (!page) {
7260  		r = -ENOMEM;
7261  		goto fail;
7262  	}
7263  	vcpu->arch.pio_data = page_address(page);
7264  
7265  	kvm_set_tsc_khz(vcpu, max_tsc_khz);
7266  
7267  	r = kvm_mmu_create(vcpu);
7268  	if (r < 0)
7269  		goto fail_free_pio_data;
7270  
7271  	if (irqchip_in_kernel(kvm)) {
7272  		r = kvm_create_lapic(vcpu);
7273  		if (r < 0)
7274  			goto fail_mmu_destroy;
7275  	} else
7276  		static_key_slow_inc(&kvm_no_apic_vcpu);
7277  
7278  	vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
7279  				       GFP_KERNEL);
7280  	if (!vcpu->arch.mce_banks) {
7281  		r = -ENOMEM;
7282  		goto fail_free_lapic;
7283  	}
7284  	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
7285  
7286  	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
7287  		r = -ENOMEM;
7288  		goto fail_free_mce_banks;
7289  	}
7290  
7291  	r = fx_init(vcpu);
7292  	if (r)
7293  		goto fail_free_wbinvd_dirty_mask;
7294  
7295  	vcpu->arch.ia32_tsc_adjust_msr = 0x0;
7296  	vcpu->arch.pv_time_enabled = false;
7297  
7298  	vcpu->arch.guest_supported_xcr0 = 0;
7299  	vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
7300  
7301  	kvm_async_pf_hash_reset(vcpu);
7302  	kvm_pmu_init(vcpu);
7303  
7304  	return 0;
7305  fail_free_wbinvd_dirty_mask:
7306  	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
7307  fail_free_mce_banks:
7308  	kfree(vcpu->arch.mce_banks);
7309  fail_free_lapic:
7310  	kvm_free_lapic(vcpu);
7311  fail_mmu_destroy:
7312  	kvm_mmu_destroy(vcpu);
7313  fail_free_pio_data:
7314  	free_page((unsigned long)vcpu->arch.pio_data);
7315  fail:
7316  	return r;
7317  }
7318  
7319  void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
7320  {
7321  	int idx;
7322  
7323  	kvm_pmu_destroy(vcpu);
7324  	kfree(vcpu->arch.mce_banks);
7325  	kvm_free_lapic(vcpu);
7326  	idx = srcu_read_lock(&vcpu->kvm->srcu);
7327  	kvm_mmu_destroy(vcpu);
7328  	srcu_read_unlock(&vcpu->kvm->srcu, idx);
7329  	free_page((unsigned long)vcpu->arch.pio_data);
7330  	if (!irqchip_in_kernel(vcpu->kvm))
7331  		static_key_slow_dec(&kvm_no_apic_vcpu);
7332  }
7333  
7334  void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
7335  {
7336  	kvm_x86_ops->sched_in(vcpu, cpu);
7337  }
7338  
7339  int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
7340  {
7341  	if (type)
7342  		return -EINVAL;
7343  
7344  	INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
7345  	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
7346  	INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
7347  	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
7348  	atomic_set(&kvm->arch.noncoherent_dma_count, 0);
7349  
7350  	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
7351  	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
7352  	/* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
7353  	set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
7354  		&kvm->arch.irq_sources_bitmap);
7355  
7356  	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
7357  	mutex_init(&kvm->arch.apic_map_lock);
7358  	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
7359  
7360  	pvclock_update_vm_gtod_copy(kvm);
7361  
7362  	INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
7363  	INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
7364  
7365  	return 0;
7366  }
7367  
7368  static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
7369  {
7370  	int r;
7371  	r = vcpu_load(vcpu);
7372  	BUG_ON(r);
7373  	kvm_mmu_unload(vcpu);
7374  	vcpu_put(vcpu);
7375  }
7376  
7377  static void kvm_free_vcpus(struct kvm *kvm)
7378  {
7379  	unsigned int i;
7380  	struct kvm_vcpu *vcpu;
7381  
7382  	/*
7383  	 * Unpin any mmu pages first.
7384  	 */
7385  	kvm_for_each_vcpu(i, vcpu, kvm) {
7386  		kvm_clear_async_pf_completion_queue(vcpu);
7387  		kvm_unload_vcpu_mmu(vcpu);
7388  	}
7389  	kvm_for_each_vcpu(i, vcpu, kvm)
7390  		kvm_arch_vcpu_free(vcpu);
7391  
7392  	mutex_lock(&kvm->lock);
7393  	for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
7394  		kvm->vcpus[i] = NULL;
7395  
7396  	atomic_set(&kvm->online_vcpus, 0);
7397  	mutex_unlock(&kvm->lock);
7398  }
7399  
7400  void kvm_arch_sync_events(struct kvm *kvm)
7401  {
7402  	cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
7403  	cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
7404  	kvm_free_all_assigned_devices(kvm);
7405  	kvm_free_pit(kvm);
7406  }
7407  
7408  void kvm_arch_destroy_vm(struct kvm *kvm)
7409  {
7410  	if (current->mm == kvm->mm) {
7411  		/*
7412  		 * Free memory regions allocated on behalf of userspace,
7413  		 * unless the the memory map has changed due to process exit
7414  		 * or fd copying.
7415  		 */
7416  		struct kvm_userspace_memory_region mem;
7417  		memset(&mem, 0, sizeof(mem));
7418  		mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
7419  		kvm_set_memory_region(kvm, &mem);
7420  
7421  		mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
7422  		kvm_set_memory_region(kvm, &mem);
7423  
7424  		mem.slot = TSS_PRIVATE_MEMSLOT;
7425  		kvm_set_memory_region(kvm, &mem);
7426  	}
7427  	kvm_iommu_unmap_guest(kvm);
7428  	kfree(kvm->arch.vpic);
7429  	kfree(kvm->arch.vioapic);
7430  	kvm_free_vcpus(kvm);
7431  	kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
7432  }
7433  
7434  void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
7435  			   struct kvm_memory_slot *dont)
7436  {
7437  	int i;
7438  
7439  	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
7440  		if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
7441  			kvm_kvfree(free->arch.rmap[i]);
7442  			free->arch.rmap[i] = NULL;
7443  		}
7444  		if (i == 0)
7445  			continue;
7446  
7447  		if (!dont || free->arch.lpage_info[i - 1] !=
7448  			     dont->arch.lpage_info[i - 1]) {
7449  			kvm_kvfree(free->arch.lpage_info[i - 1]);
7450  			free->arch.lpage_info[i - 1] = NULL;
7451  		}
7452  	}
7453  }
7454  
7455  int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
7456  			    unsigned long npages)
7457  {
7458  	int i;
7459  
7460  	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
7461  		unsigned long ugfn;
7462  		int lpages;
7463  		int level = i + 1;
7464  
7465  		lpages = gfn_to_index(slot->base_gfn + npages - 1,
7466  				      slot->base_gfn, level) + 1;
7467  
7468  		slot->arch.rmap[i] =
7469  			kvm_kvzalloc(lpages * sizeof(*slot->arch.rmap[i]));
7470  		if (!slot->arch.rmap[i])
7471  			goto out_free;
7472  		if (i == 0)
7473  			continue;
7474  
7475  		slot->arch.lpage_info[i - 1] = kvm_kvzalloc(lpages *
7476  					sizeof(*slot->arch.lpage_info[i - 1]));
7477  		if (!slot->arch.lpage_info[i - 1])
7478  			goto out_free;
7479  
7480  		if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
7481  			slot->arch.lpage_info[i - 1][0].write_count = 1;
7482  		if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
7483  			slot->arch.lpage_info[i - 1][lpages - 1].write_count = 1;
7484  		ugfn = slot->userspace_addr >> PAGE_SHIFT;
7485  		/*
7486  		 * If the gfn and userspace address are not aligned wrt each
7487  		 * other, or if explicitly asked to, disable large page
7488  		 * support for this slot
7489  		 */
7490  		if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
7491  		    !kvm_largepages_enabled()) {
7492  			unsigned long j;
7493  
7494  			for (j = 0; j < lpages; ++j)
7495  				slot->arch.lpage_info[i - 1][j].write_count = 1;
7496  		}
7497  	}
7498  
7499  	return 0;
7500  
7501  out_free:
7502  	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
7503  		kvm_kvfree(slot->arch.rmap[i]);
7504  		slot->arch.rmap[i] = NULL;
7505  		if (i == 0)
7506  			continue;
7507  
7508  		kvm_kvfree(slot->arch.lpage_info[i - 1]);
7509  		slot->arch.lpage_info[i - 1] = NULL;
7510  	}
7511  	return -ENOMEM;
7512  }
7513  
7514  void kvm_arch_memslots_updated(struct kvm *kvm)
7515  {
7516  	/*
7517  	 * memslots->generation has been incremented.
7518  	 * mmio generation may have reached its maximum value.
7519  	 */
7520  	kvm_mmu_invalidate_mmio_sptes(kvm);
7521  }
7522  
7523  int kvm_arch_prepare_memory_region(struct kvm *kvm,
7524  				struct kvm_memory_slot *memslot,
7525  				struct kvm_userspace_memory_region *mem,
7526  				enum kvm_mr_change change)
7527  {
7528  	/*
7529  	 * Only private memory slots need to be mapped here since
7530  	 * KVM_SET_MEMORY_REGION ioctl is no longer supported.
7531  	 */
7532  	if ((memslot->id >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_CREATE)) {
7533  		unsigned long userspace_addr;
7534  
7535  		/*
7536  		 * MAP_SHARED to prevent internal slot pages from being moved
7537  		 * by fork()/COW.
7538  		 */
7539  		userspace_addr = vm_mmap(NULL, 0, memslot->npages * PAGE_SIZE,
7540  					 PROT_READ | PROT_WRITE,
7541  					 MAP_SHARED | MAP_ANONYMOUS, 0);
7542  
7543  		if (IS_ERR((void *)userspace_addr))
7544  			return PTR_ERR((void *)userspace_addr);
7545  
7546  		memslot->userspace_addr = userspace_addr;
7547  	}
7548  
7549  	return 0;
7550  }
7551  
7552  void kvm_arch_commit_memory_region(struct kvm *kvm,
7553  				struct kvm_userspace_memory_region *mem,
7554  				const struct kvm_memory_slot *old,
7555  				enum kvm_mr_change change)
7556  {
7557  
7558  	int nr_mmu_pages = 0;
7559  
7560  	if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) {
7561  		int ret;
7562  
7563  		ret = vm_munmap(old->userspace_addr,
7564  				old->npages * PAGE_SIZE);
7565  		if (ret < 0)
7566  			printk(KERN_WARNING
7567  			       "kvm_vm_ioctl_set_memory_region: "
7568  			       "failed to munmap memory\n");
7569  	}
7570  
7571  	if (!kvm->arch.n_requested_mmu_pages)
7572  		nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
7573  
7574  	if (nr_mmu_pages)
7575  		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
7576  	/*
7577  	 * Write protect all pages for dirty logging.
7578  	 *
7579  	 * All the sptes including the large sptes which point to this
7580  	 * slot are set to readonly. We can not create any new large
7581  	 * spte on this slot until the end of the logging.
7582  	 *
7583  	 * See the comments in fast_page_fault().
7584  	 */
7585  	if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES))
7586  		kvm_mmu_slot_remove_write_access(kvm, mem->slot);
7587  }
7588  
7589  void kvm_arch_flush_shadow_all(struct kvm *kvm)
7590  {
7591  	kvm_mmu_invalidate_zap_all_pages(kvm);
7592  }
7593  
7594  void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
7595  				   struct kvm_memory_slot *slot)
7596  {
7597  	kvm_mmu_invalidate_zap_all_pages(kvm);
7598  }
7599  
7600  int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
7601  {
7602  	if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
7603  		kvm_x86_ops->check_nested_events(vcpu, false);
7604  
7605  	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
7606  		!vcpu->arch.apf.halted)
7607  		|| !list_empty_careful(&vcpu->async_pf.done)
7608  		|| kvm_apic_has_events(vcpu)
7609  		|| vcpu->arch.pv.pv_unhalted
7610  		|| atomic_read(&vcpu->arch.nmi_queued) ||
7611  		(kvm_arch_interrupt_allowed(vcpu) &&
7612  		 kvm_cpu_has_interrupt(vcpu));
7613  }
7614  
7615  int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
7616  {
7617  	return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
7618  }
7619  
7620  int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
7621  {
7622  	return kvm_x86_ops->interrupt_allowed(vcpu);
7623  }
7624  
7625  unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
7626  {
7627  	if (is_64_bit_mode(vcpu))
7628  		return kvm_rip_read(vcpu);
7629  	return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) +
7630  		     kvm_rip_read(vcpu));
7631  }
7632  EXPORT_SYMBOL_GPL(kvm_get_linear_rip);
7633  
7634  bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
7635  {
7636  	return kvm_get_linear_rip(vcpu) == linear_rip;
7637  }
7638  EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
7639  
7640  unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
7641  {
7642  	unsigned long rflags;
7643  
7644  	rflags = kvm_x86_ops->get_rflags(vcpu);
7645  	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
7646  		rflags &= ~X86_EFLAGS_TF;
7647  	return rflags;
7648  }
7649  EXPORT_SYMBOL_GPL(kvm_get_rflags);
7650  
7651  static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
7652  {
7653  	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
7654  	    kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
7655  		rflags |= X86_EFLAGS_TF;
7656  	kvm_x86_ops->set_rflags(vcpu, rflags);
7657  }
7658  
7659  void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
7660  {
7661  	__kvm_set_rflags(vcpu, rflags);
7662  	kvm_make_request(KVM_REQ_EVENT, vcpu);
7663  }
7664  EXPORT_SYMBOL_GPL(kvm_set_rflags);
7665  
7666  void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
7667  {
7668  	int r;
7669  
7670  	if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
7671  	      work->wakeup_all)
7672  		return;
7673  
7674  	r = kvm_mmu_reload(vcpu);
7675  	if (unlikely(r))
7676  		return;
7677  
7678  	if (!vcpu->arch.mmu.direct_map &&
7679  	      work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
7680  		return;
7681  
7682  	vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
7683  }
7684  
7685  static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
7686  {
7687  	return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
7688  }
7689  
7690  static inline u32 kvm_async_pf_next_probe(u32 key)
7691  {
7692  	return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
7693  }
7694  
7695  static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
7696  {
7697  	u32 key = kvm_async_pf_hash_fn(gfn);
7698  
7699  	while (vcpu->arch.apf.gfns[key] != ~0)
7700  		key = kvm_async_pf_next_probe(key);
7701  
7702  	vcpu->arch.apf.gfns[key] = gfn;
7703  }
7704  
7705  static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
7706  {
7707  	int i;
7708  	u32 key = kvm_async_pf_hash_fn(gfn);
7709  
7710  	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
7711  		     (vcpu->arch.apf.gfns[key] != gfn &&
7712  		      vcpu->arch.apf.gfns[key] != ~0); i++)
7713  		key = kvm_async_pf_next_probe(key);
7714  
7715  	return key;
7716  }
7717  
7718  bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
7719  {
7720  	return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
7721  }
7722  
7723  static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
7724  {
7725  	u32 i, j, k;
7726  
7727  	i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
7728  	while (true) {
7729  		vcpu->arch.apf.gfns[i] = ~0;
7730  		do {
7731  			j = kvm_async_pf_next_probe(j);
7732  			if (vcpu->arch.apf.gfns[j] == ~0)
7733  				return;
7734  			k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
7735  			/*
7736  			 * k lies cyclically in ]i,j]
7737  			 * |    i.k.j |
7738  			 * |....j i.k.| or  |.k..j i...|
7739  			 */
7740  		} while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
7741  		vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
7742  		i = j;
7743  	}
7744  }
7745  
7746  static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
7747  {
7748  
7749  	return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
7750  				      sizeof(val));
7751  }
7752  
7753  void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
7754  				     struct kvm_async_pf *work)
7755  {
7756  	struct x86_exception fault;
7757  
7758  	trace_kvm_async_pf_not_present(work->arch.token, work->gva);
7759  	kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
7760  
7761  	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
7762  	    (vcpu->arch.apf.send_user_only &&
7763  	     kvm_x86_ops->get_cpl(vcpu) == 0))
7764  		kvm_make_request(KVM_REQ_APF_HALT, vcpu);
7765  	else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
7766  		fault.vector = PF_VECTOR;
7767  		fault.error_code_valid = true;
7768  		fault.error_code = 0;
7769  		fault.nested_page_fault = false;
7770  		fault.address = work->arch.token;
7771  		kvm_inject_page_fault(vcpu, &fault);
7772  	}
7773  }
7774  
7775  void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
7776  				 struct kvm_async_pf *work)
7777  {
7778  	struct x86_exception fault;
7779  
7780  	trace_kvm_async_pf_ready(work->arch.token, work->gva);
7781  	if (work->wakeup_all)
7782  		work->arch.token = ~0; /* broadcast wakeup */
7783  	else
7784  		kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
7785  
7786  	if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
7787  	    !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
7788  		fault.vector = PF_VECTOR;
7789  		fault.error_code_valid = true;
7790  		fault.error_code = 0;
7791  		fault.nested_page_fault = false;
7792  		fault.address = work->arch.token;
7793  		kvm_inject_page_fault(vcpu, &fault);
7794  	}
7795  	vcpu->arch.apf.halted = false;
7796  	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
7797  }
7798  
7799  bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
7800  {
7801  	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
7802  		return true;
7803  	else
7804  		return !kvm_event_needs_reinjection(vcpu) &&
7805  			kvm_x86_ops->interrupt_allowed(vcpu);
7806  }
7807  
7808  void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
7809  {
7810  	atomic_inc(&kvm->arch.noncoherent_dma_count);
7811  }
7812  EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
7813  
7814  void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
7815  {
7816  	atomic_dec(&kvm->arch.noncoherent_dma_count);
7817  }
7818  EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
7819  
7820  bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
7821  {
7822  	return atomic_read(&kvm->arch.noncoherent_dma_count);
7823  }
7824  EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
7825  
7826  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
7827  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
7828  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
7829  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
7830  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
7831  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
7832  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
7833  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
7834  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
7835  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
7836  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
7837  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
7838  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
7839  EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
7840