xref: /openbmc/linux/arch/x86/kvm/x86.c (revision 8c19b6f257fa71ed3a7a9df6ce466c6be31ca04c)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Kernel-based Virtual Machine driver for Linux
4  *
5  * derived from drivers/kvm/kvm_main.c
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  * Copyright (C) 2008 Qumranet, Inc.
9  * Copyright IBM Corporation, 2008
10  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11  *
12  * Authors:
13  *   Avi Kivity   <avi@qumranet.com>
14  *   Yaniv Kamay  <yaniv@qumranet.com>
15  *   Amit Shah    <amit.shah@qumranet.com>
16  *   Ben-Ami Yassour <benami@il.ibm.com>
17  */
18 
19 #include <linux/kvm_host.h>
20 #include "irq.h"
21 #include "ioapic.h"
22 #include "mmu.h"
23 #include "i8254.h"
24 #include "tss.h"
25 #include "kvm_cache_regs.h"
26 #include "kvm_emulate.h"
27 #include "x86.h"
28 #include "cpuid.h"
29 #include "pmu.h"
30 #include "hyperv.h"
31 #include "lapic.h"
32 #include "xen.h"
33 #include "smm.h"
34 
35 #include <linux/clocksource.h>
36 #include <linux/interrupt.h>
37 #include <linux/kvm.h>
38 #include <linux/fs.h>
39 #include <linux/vmalloc.h>
40 #include <linux/export.h>
41 #include <linux/moduleparam.h>
42 #include <linux/mman.h>
43 #include <linux/highmem.h>
44 #include <linux/iommu.h>
45 #include <linux/cpufreq.h>
46 #include <linux/user-return-notifier.h>
47 #include <linux/srcu.h>
48 #include <linux/slab.h>
49 #include <linux/perf_event.h>
50 #include <linux/uaccess.h>
51 #include <linux/hash.h>
52 #include <linux/pci.h>
53 #include <linux/timekeeper_internal.h>
54 #include <linux/pvclock_gtod.h>
55 #include <linux/kvm_irqfd.h>
56 #include <linux/irqbypass.h>
57 #include <linux/sched/stat.h>
58 #include <linux/sched/isolation.h>
59 #include <linux/mem_encrypt.h>
60 #include <linux/entry-kvm.h>
61 #include <linux/suspend.h>
62 
63 #include <trace/events/kvm.h>
64 
65 #include <asm/debugreg.h>
66 #include <asm/msr.h>
67 #include <asm/desc.h>
68 #include <asm/mce.h>
69 #include <asm/pkru.h>
70 #include <linux/kernel_stat.h>
71 #include <asm/fpu/api.h>
72 #include <asm/fpu/xcr.h>
73 #include <asm/fpu/xstate.h>
74 #include <asm/pvclock.h>
75 #include <asm/div64.h>
76 #include <asm/irq_remapping.h>
77 #include <asm/mshyperv.h>
78 #include <asm/hypervisor.h>
79 #include <asm/tlbflush.h>
80 #include <asm/intel_pt.h>
81 #include <asm/emulate_prefix.h>
82 #include <asm/sgx.h>
83 #include <clocksource/hyperv_timer.h>
84 
85 #define CREATE_TRACE_POINTS
86 #include "trace.h"
87 
88 #define MAX_IO_MSRS 256
89 #define KVM_MAX_MCE_BANKS 32
90 
91 struct kvm_caps kvm_caps __read_mostly = {
92 	.supported_mce_cap = MCG_CTL_P | MCG_SER_P,
93 };
94 EXPORT_SYMBOL_GPL(kvm_caps);
95 
96 #define  ERR_PTR_USR(e)  ((void __user *)ERR_PTR(e))
97 
98 #define emul_to_vcpu(ctxt) \
99 	((struct kvm_vcpu *)(ctxt)->vcpu)
100 
101 /* EFER defaults:
102  * - enable syscall per default because its emulated by KVM
103  * - enable LME and LMA per default on 64 bit KVM
104  */
105 #ifdef CONFIG_X86_64
106 static
107 u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
108 #else
109 static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
110 #endif
111 
112 static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
113 
114 #define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)
115 
116 #define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE
117 
118 #define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
119                                     KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
120 
121 static void update_cr8_intercept(struct kvm_vcpu *vcpu);
122 static void process_nmi(struct kvm_vcpu *vcpu);
123 static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
124 static void store_regs(struct kvm_vcpu *vcpu);
125 static int sync_regs(struct kvm_vcpu *vcpu);
126 static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu);
127 
128 static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
129 static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
130 
131 struct kvm_x86_ops kvm_x86_ops __read_mostly;
132 
133 #define KVM_X86_OP(func)					     \
134 	DEFINE_STATIC_CALL_NULL(kvm_x86_##func,			     \
135 				*(((struct kvm_x86_ops *)0)->func));
136 #define KVM_X86_OP_OPTIONAL KVM_X86_OP
137 #define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
138 #include <asm/kvm-x86-ops.h>
139 EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
140 EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
141 
142 static bool __read_mostly ignore_msrs = 0;
143 module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
144 
145 bool __read_mostly report_ignored_msrs = true;
146 module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
147 EXPORT_SYMBOL_GPL(report_ignored_msrs);
148 
149 unsigned int min_timer_period_us = 200;
150 module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
151 
152 static bool __read_mostly kvmclock_periodic_sync = true;
153 module_param(kvmclock_periodic_sync, bool, S_IRUGO);
154 
155 /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
156 static u32 __read_mostly tsc_tolerance_ppm = 250;
157 module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
158 
159 /*
160  * lapic timer advance (tscdeadline mode only) in nanoseconds.  '-1' enables
161  * adaptive tuning starting from default advancement of 1000ns.  '0' disables
162  * advancement entirely.  Any other value is used as-is and disables adaptive
163  * tuning, i.e. allows privileged userspace to set an exact advancement time.
164  */
165 static int __read_mostly lapic_timer_advance_ns = -1;
166 module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
167 
168 static bool __read_mostly vector_hashing = true;
169 module_param(vector_hashing, bool, S_IRUGO);
170 
171 bool __read_mostly enable_vmware_backdoor = false;
172 module_param(enable_vmware_backdoor, bool, S_IRUGO);
173 EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
174 
175 /*
176  * Flags to manipulate forced emulation behavior (any non-zero value will
177  * enable forced emulation).
178  */
179 #define KVM_FEP_CLEAR_RFLAGS_RF	BIT(1)
180 static int __read_mostly force_emulation_prefix;
181 module_param(force_emulation_prefix, int, 0644);
182 
183 int __read_mostly pi_inject_timer = -1;
184 module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
185 
186 /* Enable/disable PMU virtualization */
187 bool __read_mostly enable_pmu = true;
188 EXPORT_SYMBOL_GPL(enable_pmu);
189 module_param(enable_pmu, bool, 0444);
190 
191 bool __read_mostly eager_page_split = true;
192 module_param(eager_page_split, bool, 0644);
193 
194 /*
195  * Restoring the host value for MSRs that are only consumed when running in
196  * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
197  * returns to userspace, i.e. the kernel can run with the guest's value.
198  */
199 #define KVM_MAX_NR_USER_RETURN_MSRS 16
200 
201 struct kvm_user_return_msrs {
202 	struct user_return_notifier urn;
203 	bool registered;
204 	struct kvm_user_return_msr_values {
205 		u64 host;
206 		u64 curr;
207 	} values[KVM_MAX_NR_USER_RETURN_MSRS];
208 };
209 
210 u32 __read_mostly kvm_nr_uret_msrs;
211 EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs);
212 static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS];
213 static struct kvm_user_return_msrs __percpu *user_return_msrs;
214 
215 #define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
216 				| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
217 				| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
218 				| XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE)
219 
220 u64 __read_mostly host_efer;
221 EXPORT_SYMBOL_GPL(host_efer);
222 
223 bool __read_mostly allow_smaller_maxphyaddr = 0;
224 EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
225 
226 bool __read_mostly enable_apicv = true;
227 EXPORT_SYMBOL_GPL(enable_apicv);
228 
229 u64 __read_mostly host_xss;
230 EXPORT_SYMBOL_GPL(host_xss);
231 
232 const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
233 	KVM_GENERIC_VM_STATS(),
234 	STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
235 	STATS_DESC_COUNTER(VM, mmu_pte_write),
236 	STATS_DESC_COUNTER(VM, mmu_pde_zapped),
237 	STATS_DESC_COUNTER(VM, mmu_flooded),
238 	STATS_DESC_COUNTER(VM, mmu_recycled),
239 	STATS_DESC_COUNTER(VM, mmu_cache_miss),
240 	STATS_DESC_ICOUNTER(VM, mmu_unsync),
241 	STATS_DESC_ICOUNTER(VM, pages_4k),
242 	STATS_DESC_ICOUNTER(VM, pages_2m),
243 	STATS_DESC_ICOUNTER(VM, pages_1g),
244 	STATS_DESC_ICOUNTER(VM, nx_lpage_splits),
245 	STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size),
246 	STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions)
247 };
248 
249 const struct kvm_stats_header kvm_vm_stats_header = {
250 	.name_size = KVM_STATS_NAME_SIZE,
251 	.num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
252 	.id_offset = sizeof(struct kvm_stats_header),
253 	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
254 	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
255 		       sizeof(kvm_vm_stats_desc),
256 };
257 
258 const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
259 	KVM_GENERIC_VCPU_STATS(),
260 	STATS_DESC_COUNTER(VCPU, pf_taken),
261 	STATS_DESC_COUNTER(VCPU, pf_fixed),
262 	STATS_DESC_COUNTER(VCPU, pf_emulate),
263 	STATS_DESC_COUNTER(VCPU, pf_spurious),
264 	STATS_DESC_COUNTER(VCPU, pf_fast),
265 	STATS_DESC_COUNTER(VCPU, pf_mmio_spte_created),
266 	STATS_DESC_COUNTER(VCPU, pf_guest),
267 	STATS_DESC_COUNTER(VCPU, tlb_flush),
268 	STATS_DESC_COUNTER(VCPU, invlpg),
269 	STATS_DESC_COUNTER(VCPU, exits),
270 	STATS_DESC_COUNTER(VCPU, io_exits),
271 	STATS_DESC_COUNTER(VCPU, mmio_exits),
272 	STATS_DESC_COUNTER(VCPU, signal_exits),
273 	STATS_DESC_COUNTER(VCPU, irq_window_exits),
274 	STATS_DESC_COUNTER(VCPU, nmi_window_exits),
275 	STATS_DESC_COUNTER(VCPU, l1d_flush),
276 	STATS_DESC_COUNTER(VCPU, halt_exits),
277 	STATS_DESC_COUNTER(VCPU, request_irq_exits),
278 	STATS_DESC_COUNTER(VCPU, irq_exits),
279 	STATS_DESC_COUNTER(VCPU, host_state_reload),
280 	STATS_DESC_COUNTER(VCPU, fpu_reload),
281 	STATS_DESC_COUNTER(VCPU, insn_emulation),
282 	STATS_DESC_COUNTER(VCPU, insn_emulation_fail),
283 	STATS_DESC_COUNTER(VCPU, hypercalls),
284 	STATS_DESC_COUNTER(VCPU, irq_injections),
285 	STATS_DESC_COUNTER(VCPU, nmi_injections),
286 	STATS_DESC_COUNTER(VCPU, req_event),
287 	STATS_DESC_COUNTER(VCPU, nested_run),
288 	STATS_DESC_COUNTER(VCPU, directed_yield_attempted),
289 	STATS_DESC_COUNTER(VCPU, directed_yield_successful),
290 	STATS_DESC_COUNTER(VCPU, preemption_reported),
291 	STATS_DESC_COUNTER(VCPU, preemption_other),
292 	STATS_DESC_IBOOLEAN(VCPU, guest_mode),
293 	STATS_DESC_COUNTER(VCPU, notify_window_exits),
294 };
295 
296 const struct kvm_stats_header kvm_vcpu_stats_header = {
297 	.name_size = KVM_STATS_NAME_SIZE,
298 	.num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
299 	.id_offset = sizeof(struct kvm_stats_header),
300 	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
301 	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
302 		       sizeof(kvm_vcpu_stats_desc),
303 };
304 
305 u64 __read_mostly host_xcr0;
306 
307 static struct kmem_cache *x86_emulator_cache;
308 
309 /*
310  * When called, it means the previous get/set msr reached an invalid msr.
311  * Return true if we want to ignore/silent this failed msr access.
312  */
313 static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write)
314 {
315 	const char *op = write ? "wrmsr" : "rdmsr";
316 
317 	if (ignore_msrs) {
318 		if (report_ignored_msrs)
319 			kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
320 				      op, msr, data);
321 		/* Mask the error */
322 		return true;
323 	} else {
324 		kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
325 				      op, msr, data);
326 		return false;
327 	}
328 }
329 
330 static struct kmem_cache *kvm_alloc_emulator_cache(void)
331 {
332 	unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
333 	unsigned int size = sizeof(struct x86_emulate_ctxt);
334 
335 	return kmem_cache_create_usercopy("x86_emulator", size,
336 					  __alignof__(struct x86_emulate_ctxt),
337 					  SLAB_ACCOUNT, useroffset,
338 					  size - useroffset, NULL);
339 }
340 
341 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
342 
343 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
344 {
345 	int i;
346 	for (i = 0; i < ASYNC_PF_PER_VCPU; i++)
347 		vcpu->arch.apf.gfns[i] = ~0;
348 }
349 
350 static void kvm_on_user_return(struct user_return_notifier *urn)
351 {
352 	unsigned slot;
353 	struct kvm_user_return_msrs *msrs
354 		= container_of(urn, struct kvm_user_return_msrs, urn);
355 	struct kvm_user_return_msr_values *values;
356 	unsigned long flags;
357 
358 	/*
359 	 * Disabling irqs at this point since the following code could be
360 	 * interrupted and executed through kvm_arch_hardware_disable()
361 	 */
362 	local_irq_save(flags);
363 	if (msrs->registered) {
364 		msrs->registered = false;
365 		user_return_notifier_unregister(urn);
366 	}
367 	local_irq_restore(flags);
368 	for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
369 		values = &msrs->values[slot];
370 		if (values->host != values->curr) {
371 			wrmsrl(kvm_uret_msrs_list[slot], values->host);
372 			values->curr = values->host;
373 		}
374 	}
375 }
376 
377 static int kvm_probe_user_return_msr(u32 msr)
378 {
379 	u64 val;
380 	int ret;
381 
382 	preempt_disable();
383 	ret = rdmsrl_safe(msr, &val);
384 	if (ret)
385 		goto out;
386 	ret = wrmsrl_safe(msr, val);
387 out:
388 	preempt_enable();
389 	return ret;
390 }
391 
392 int kvm_add_user_return_msr(u32 msr)
393 {
394 	BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS);
395 
396 	if (kvm_probe_user_return_msr(msr))
397 		return -1;
398 
399 	kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr;
400 	return kvm_nr_uret_msrs++;
401 }
402 EXPORT_SYMBOL_GPL(kvm_add_user_return_msr);
403 
404 int kvm_find_user_return_msr(u32 msr)
405 {
406 	int i;
407 
408 	for (i = 0; i < kvm_nr_uret_msrs; ++i) {
409 		if (kvm_uret_msrs_list[i] == msr)
410 			return i;
411 	}
412 	return -1;
413 }
414 EXPORT_SYMBOL_GPL(kvm_find_user_return_msr);
415 
416 static void kvm_user_return_msr_cpu_online(void)
417 {
418 	unsigned int cpu = smp_processor_id();
419 	struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
420 	u64 value;
421 	int i;
422 
423 	for (i = 0; i < kvm_nr_uret_msrs; ++i) {
424 		rdmsrl_safe(kvm_uret_msrs_list[i], &value);
425 		msrs->values[i].host = value;
426 		msrs->values[i].curr = value;
427 	}
428 }
429 
430 int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
431 {
432 	unsigned int cpu = smp_processor_id();
433 	struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
434 	int err;
435 
436 	value = (value & mask) | (msrs->values[slot].host & ~mask);
437 	if (value == msrs->values[slot].curr)
438 		return 0;
439 	err = wrmsrl_safe(kvm_uret_msrs_list[slot], value);
440 	if (err)
441 		return 1;
442 
443 	msrs->values[slot].curr = value;
444 	if (!msrs->registered) {
445 		msrs->urn.on_user_return = kvm_on_user_return;
446 		user_return_notifier_register(&msrs->urn);
447 		msrs->registered = true;
448 	}
449 	return 0;
450 }
451 EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
452 
453 static void drop_user_return_notifiers(void)
454 {
455 	unsigned int cpu = smp_processor_id();
456 	struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
457 
458 	if (msrs->registered)
459 		kvm_on_user_return(&msrs->urn);
460 }
461 
462 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
463 {
464 	return vcpu->arch.apic_base;
465 }
466 
467 enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
468 {
469 	return kvm_apic_mode(kvm_get_apic_base(vcpu));
470 }
471 EXPORT_SYMBOL_GPL(kvm_get_apic_mode);
472 
473 int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
474 {
475 	enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
476 	enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
477 	u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff |
478 		(guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
479 
480 	if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
481 		return 1;
482 	if (!msr_info->host_initiated) {
483 		if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
484 			return 1;
485 		if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
486 			return 1;
487 	}
488 
489 	kvm_lapic_set_base(vcpu, msr_info->data);
490 	kvm_recalculate_apic_map(vcpu->kvm);
491 	return 0;
492 }
493 
494 /*
495  * Handle a fault on a hardware virtualization (VMX or SVM) instruction.
496  *
497  * Hardware virtualization extension instructions may fault if a reboot turns
498  * off virtualization while processes are running.  Usually after catching the
499  * fault we just panic; during reboot instead the instruction is ignored.
500  */
501 noinstr void kvm_spurious_fault(void)
502 {
503 	/* Fault while not rebooting.  We want the trace. */
504 	BUG_ON(!kvm_rebooting);
505 }
506 EXPORT_SYMBOL_GPL(kvm_spurious_fault);
507 
508 #define EXCPT_BENIGN		0
509 #define EXCPT_CONTRIBUTORY	1
510 #define EXCPT_PF		2
511 
512 static int exception_class(int vector)
513 {
514 	switch (vector) {
515 	case PF_VECTOR:
516 		return EXCPT_PF;
517 	case DE_VECTOR:
518 	case TS_VECTOR:
519 	case NP_VECTOR:
520 	case SS_VECTOR:
521 	case GP_VECTOR:
522 		return EXCPT_CONTRIBUTORY;
523 	default:
524 		break;
525 	}
526 	return EXCPT_BENIGN;
527 }
528 
529 #define EXCPT_FAULT		0
530 #define EXCPT_TRAP		1
531 #define EXCPT_ABORT		2
532 #define EXCPT_INTERRUPT		3
533 #define EXCPT_DB		4
534 
535 static int exception_type(int vector)
536 {
537 	unsigned int mask;
538 
539 	if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
540 		return EXCPT_INTERRUPT;
541 
542 	mask = 1 << vector;
543 
544 	/*
545 	 * #DBs can be trap-like or fault-like, the caller must check other CPU
546 	 * state, e.g. DR6, to determine whether a #DB is a trap or fault.
547 	 */
548 	if (mask & (1 << DB_VECTOR))
549 		return EXCPT_DB;
550 
551 	if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR)))
552 		return EXCPT_TRAP;
553 
554 	if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
555 		return EXCPT_ABORT;
556 
557 	/* Reserved exceptions will result in fault */
558 	return EXCPT_FAULT;
559 }
560 
561 void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
562 				   struct kvm_queued_exception *ex)
563 {
564 	if (!ex->has_payload)
565 		return;
566 
567 	switch (ex->vector) {
568 	case DB_VECTOR:
569 		/*
570 		 * "Certain debug exceptions may clear bit 0-3.  The
571 		 * remaining contents of the DR6 register are never
572 		 * cleared by the processor".
573 		 */
574 		vcpu->arch.dr6 &= ~DR_TRAP_BITS;
575 		/*
576 		 * In order to reflect the #DB exception payload in guest
577 		 * dr6, three components need to be considered: active low
578 		 * bit, FIXED_1 bits and active high bits (e.g. DR6_BD,
579 		 * DR6_BS and DR6_BT)
580 		 * DR6_ACTIVE_LOW contains the FIXED_1 and active low bits.
581 		 * In the target guest dr6:
582 		 * FIXED_1 bits should always be set.
583 		 * Active low bits should be cleared if 1-setting in payload.
584 		 * Active high bits should be set if 1-setting in payload.
585 		 *
586 		 * Note, the payload is compatible with the pending debug
587 		 * exceptions/exit qualification under VMX, that active_low bits
588 		 * are active high in payload.
589 		 * So they need to be flipped for DR6.
590 		 */
591 		vcpu->arch.dr6 |= DR6_ACTIVE_LOW;
592 		vcpu->arch.dr6 |= ex->payload;
593 		vcpu->arch.dr6 ^= ex->payload & DR6_ACTIVE_LOW;
594 
595 		/*
596 		 * The #DB payload is defined as compatible with the 'pending
597 		 * debug exceptions' field under VMX, not DR6. While bit 12 is
598 		 * defined in the 'pending debug exceptions' field (enabled
599 		 * breakpoint), it is reserved and must be zero in DR6.
600 		 */
601 		vcpu->arch.dr6 &= ~BIT(12);
602 		break;
603 	case PF_VECTOR:
604 		vcpu->arch.cr2 = ex->payload;
605 		break;
606 	}
607 
608 	ex->has_payload = false;
609 	ex->payload = 0;
610 }
611 EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
612 
613 static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector,
614 				       bool has_error_code, u32 error_code,
615 				       bool has_payload, unsigned long payload)
616 {
617 	struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
618 
619 	ex->vector = vector;
620 	ex->injected = false;
621 	ex->pending = true;
622 	ex->has_error_code = has_error_code;
623 	ex->error_code = error_code;
624 	ex->has_payload = has_payload;
625 	ex->payload = payload;
626 }
627 
628 /* Forcibly leave the nested mode in cases like a vCPU reset */
629 static void kvm_leave_nested(struct kvm_vcpu *vcpu)
630 {
631 	kvm_x86_ops.nested_ops->leave_nested(vcpu);
632 }
633 
634 static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
635 		unsigned nr, bool has_error, u32 error_code,
636 	        bool has_payload, unsigned long payload, bool reinject)
637 {
638 	u32 prev_nr;
639 	int class1, class2;
640 
641 	kvm_make_request(KVM_REQ_EVENT, vcpu);
642 
643 	/*
644 	 * If the exception is destined for L2 and isn't being reinjected,
645 	 * morph it to a VM-Exit if L1 wants to intercept the exception.  A
646 	 * previously injected exception is not checked because it was checked
647 	 * when it was original queued, and re-checking is incorrect if _L1_
648 	 * injected the exception, in which case it's exempt from interception.
649 	 */
650 	if (!reinject && is_guest_mode(vcpu) &&
651 	    kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) {
652 		kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code,
653 					   has_payload, payload);
654 		return;
655 	}
656 
657 	if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
658 	queue:
659 		if (reinject) {
660 			/*
661 			 * On VM-Entry, an exception can be pending if and only
662 			 * if event injection was blocked by nested_run_pending.
663 			 * In that case, however, vcpu_enter_guest() requests an
664 			 * immediate exit, and the guest shouldn't proceed far
665 			 * enough to need reinjection.
666 			 */
667 			WARN_ON_ONCE(kvm_is_exception_pending(vcpu));
668 			vcpu->arch.exception.injected = true;
669 			if (WARN_ON_ONCE(has_payload)) {
670 				/*
671 				 * A reinjected event has already
672 				 * delivered its payload.
673 				 */
674 				has_payload = false;
675 				payload = 0;
676 			}
677 		} else {
678 			vcpu->arch.exception.pending = true;
679 			vcpu->arch.exception.injected = false;
680 		}
681 		vcpu->arch.exception.has_error_code = has_error;
682 		vcpu->arch.exception.vector = nr;
683 		vcpu->arch.exception.error_code = error_code;
684 		vcpu->arch.exception.has_payload = has_payload;
685 		vcpu->arch.exception.payload = payload;
686 		if (!is_guest_mode(vcpu))
687 			kvm_deliver_exception_payload(vcpu,
688 						      &vcpu->arch.exception);
689 		return;
690 	}
691 
692 	/* to check exception */
693 	prev_nr = vcpu->arch.exception.vector;
694 	if (prev_nr == DF_VECTOR) {
695 		/* triple fault -> shutdown */
696 		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
697 		return;
698 	}
699 	class1 = exception_class(prev_nr);
700 	class2 = exception_class(nr);
701 	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) ||
702 	    (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
703 		/*
704 		 * Synthesize #DF.  Clear the previously injected or pending
705 		 * exception so as not to incorrectly trigger shutdown.
706 		 */
707 		vcpu->arch.exception.injected = false;
708 		vcpu->arch.exception.pending = false;
709 
710 		kvm_queue_exception_e(vcpu, DF_VECTOR, 0);
711 	} else {
712 		/* replace previous exception with a new one in a hope
713 		   that instruction re-execution will regenerate lost
714 		   exception */
715 		goto queue;
716 	}
717 }
718 
719 void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
720 {
721 	kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
722 }
723 EXPORT_SYMBOL_GPL(kvm_queue_exception);
724 
725 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
726 {
727 	kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
728 }
729 EXPORT_SYMBOL_GPL(kvm_requeue_exception);
730 
731 void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
732 			   unsigned long payload)
733 {
734 	kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
735 }
736 EXPORT_SYMBOL_GPL(kvm_queue_exception_p);
737 
738 static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
739 				    u32 error_code, unsigned long payload)
740 {
741 	kvm_multiple_exception(vcpu, nr, true, error_code,
742 			       true, payload, false);
743 }
744 
745 int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
746 {
747 	if (err)
748 		kvm_inject_gp(vcpu, 0);
749 	else
750 		return kvm_skip_emulated_instruction(vcpu);
751 
752 	return 1;
753 }
754 EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
755 
756 static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err)
757 {
758 	if (err) {
759 		kvm_inject_gp(vcpu, 0);
760 		return 1;
761 	}
762 
763 	return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE | EMULTYPE_SKIP |
764 				       EMULTYPE_COMPLETE_USER_EXIT);
765 }
766 
767 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
768 {
769 	++vcpu->stat.pf_guest;
770 
771 	/*
772 	 * Async #PF in L2 is always forwarded to L1 as a VM-Exit regardless of
773 	 * whether or not L1 wants to intercept "regular" #PF.
774 	 */
775 	if (is_guest_mode(vcpu) && fault->async_page_fault)
776 		kvm_queue_exception_vmexit(vcpu, PF_VECTOR,
777 					   true, fault->error_code,
778 					   true, fault->address);
779 	else
780 		kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
781 					fault->address);
782 }
783 
784 void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
785 				    struct x86_exception *fault)
786 {
787 	struct kvm_mmu *fault_mmu;
788 	WARN_ON_ONCE(fault->vector != PF_VECTOR);
789 
790 	fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
791 					       vcpu->arch.walk_mmu;
792 
793 	/*
794 	 * Invalidate the TLB entry for the faulting address, if it exists,
795 	 * else the access will fault indefinitely (and to emulate hardware).
796 	 */
797 	if ((fault->error_code & PFERR_PRESENT_MASK) &&
798 	    !(fault->error_code & PFERR_RSVD_MASK))
799 		kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
800 				       fault_mmu->root.hpa);
801 
802 	fault_mmu->inject_page_fault(vcpu, fault);
803 }
804 EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
805 
806 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
807 {
808 	atomic_inc(&vcpu->arch.nmi_queued);
809 	kvm_make_request(KVM_REQ_NMI, vcpu);
810 }
811 
812 void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
813 {
814 	kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
815 }
816 EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
817 
818 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
819 {
820 	kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
821 }
822 EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
823 
824 /*
825  * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
826  * a #GP and return false.
827  */
828 bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
829 {
830 	if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl)
831 		return true;
832 	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
833 	return false;
834 }
835 
836 bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
837 {
838 	if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
839 		return true;
840 
841 	kvm_queue_exception(vcpu, UD_VECTOR);
842 	return false;
843 }
844 EXPORT_SYMBOL_GPL(kvm_require_dr);
845 
846 static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
847 {
848 	return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
849 }
850 
851 /*
852  * Load the pae pdptrs.  Return 1 if they are all valid, 0 otherwise.
853  */
854 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
855 {
856 	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
857 	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
858 	gpa_t real_gpa;
859 	int i;
860 	int ret;
861 	u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
862 
863 	/*
864 	 * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated
865 	 * to an L1 GPA.
866 	 */
867 	real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn),
868 				     PFERR_USER_MASK | PFERR_WRITE_MASK, NULL);
869 	if (real_gpa == INVALID_GPA)
870 		return 0;
871 
872 	/* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */
873 	ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(real_gpa), pdpte,
874 				       cr3 & GENMASK(11, 5), sizeof(pdpte));
875 	if (ret < 0)
876 		return 0;
877 
878 	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
879 		if ((pdpte[i] & PT_PRESENT_MASK) &&
880 		    (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
881 			return 0;
882 		}
883 	}
884 
885 	/*
886 	 * Marking VCPU_EXREG_PDPTR dirty doesn't work for !tdp_enabled.
887 	 * Shadow page roots need to be reconstructed instead.
888 	 */
889 	if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)))
890 		kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT);
891 
892 	memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
893 	kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
894 	kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
895 	vcpu->arch.pdptrs_from_userspace = false;
896 
897 	return 1;
898 }
899 EXPORT_SYMBOL_GPL(load_pdptrs);
900 
901 void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
902 {
903 	if ((cr0 ^ old_cr0) & X86_CR0_PG) {
904 		kvm_clear_async_pf_completion_queue(vcpu);
905 		kvm_async_pf_hash_reset(vcpu);
906 
907 		/*
908 		 * Clearing CR0.PG is defined to flush the TLB from the guest's
909 		 * perspective.
910 		 */
911 		if (!(cr0 & X86_CR0_PG))
912 			kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
913 	}
914 
915 	if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
916 		kvm_mmu_reset_context(vcpu);
917 
918 	if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
919 	    kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
920 	    !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
921 		kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
922 }
923 EXPORT_SYMBOL_GPL(kvm_post_set_cr0);
924 
925 int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
926 {
927 	unsigned long old_cr0 = kvm_read_cr0(vcpu);
928 
929 	cr0 |= X86_CR0_ET;
930 
931 #ifdef CONFIG_X86_64
932 	if (cr0 & 0xffffffff00000000UL)
933 		return 1;
934 #endif
935 
936 	cr0 &= ~CR0_RESERVED_BITS;
937 
938 	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
939 		return 1;
940 
941 	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
942 		return 1;
943 
944 #ifdef CONFIG_X86_64
945 	if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
946 	    (cr0 & X86_CR0_PG)) {
947 		int cs_db, cs_l;
948 
949 		if (!is_pae(vcpu))
950 			return 1;
951 		static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
952 		if (cs_l)
953 			return 1;
954 	}
955 #endif
956 	if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
957 	    is_pae(vcpu) && ((cr0 ^ old_cr0) & X86_CR0_PDPTR_BITS) &&
958 	    !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
959 		return 1;
960 
961 	if (!(cr0 & X86_CR0_PG) &&
962 	    (is_64_bit_mode(vcpu) || kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)))
963 		return 1;
964 
965 	static_call(kvm_x86_set_cr0)(vcpu, cr0);
966 
967 	kvm_post_set_cr0(vcpu, old_cr0, cr0);
968 
969 	return 0;
970 }
971 EXPORT_SYMBOL_GPL(kvm_set_cr0);
972 
973 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
974 {
975 	(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
976 }
977 EXPORT_SYMBOL_GPL(kvm_lmsw);
978 
979 void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
980 {
981 	if (vcpu->arch.guest_state_protected)
982 		return;
983 
984 	if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
985 
986 		if (vcpu->arch.xcr0 != host_xcr0)
987 			xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
988 
989 		if (vcpu->arch.xsaves_enabled &&
990 		    vcpu->arch.ia32_xss != host_xss)
991 			wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
992 	}
993 
994 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
995 	if (static_cpu_has(X86_FEATURE_PKU) &&
996 	    vcpu->arch.pkru != vcpu->arch.host_pkru &&
997 	    ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
998 	     kvm_read_cr4_bits(vcpu, X86_CR4_PKE)))
999 		write_pkru(vcpu->arch.pkru);
1000 #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
1001 }
1002 EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
1003 
1004 void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
1005 {
1006 	if (vcpu->arch.guest_state_protected)
1007 		return;
1008 
1009 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
1010 	if (static_cpu_has(X86_FEATURE_PKU) &&
1011 	    ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
1012 	     kvm_read_cr4_bits(vcpu, X86_CR4_PKE))) {
1013 		vcpu->arch.pkru = rdpkru();
1014 		if (vcpu->arch.pkru != vcpu->arch.host_pkru)
1015 			write_pkru(vcpu->arch.host_pkru);
1016 	}
1017 #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
1018 
1019 	if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
1020 
1021 		if (vcpu->arch.xcr0 != host_xcr0)
1022 			xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
1023 
1024 		if (vcpu->arch.xsaves_enabled &&
1025 		    vcpu->arch.ia32_xss != host_xss)
1026 			wrmsrl(MSR_IA32_XSS, host_xss);
1027 	}
1028 
1029 }
1030 EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
1031 
1032 #ifdef CONFIG_X86_64
1033 static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu)
1034 {
1035 	return vcpu->arch.guest_supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC;
1036 }
1037 #endif
1038 
1039 static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
1040 {
1041 	u64 xcr0 = xcr;
1042 	u64 old_xcr0 = vcpu->arch.xcr0;
1043 	u64 valid_bits;
1044 
1045 	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
1046 	if (index != XCR_XFEATURE_ENABLED_MASK)
1047 		return 1;
1048 	if (!(xcr0 & XFEATURE_MASK_FP))
1049 		return 1;
1050 	if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
1051 		return 1;
1052 
1053 	/*
1054 	 * Do not allow the guest to set bits that we do not support
1055 	 * saving.  However, xcr0 bit 0 is always set, even if the
1056 	 * emulated CPU does not support XSAVE (see kvm_vcpu_reset()).
1057 	 */
1058 	valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
1059 	if (xcr0 & ~valid_bits)
1060 		return 1;
1061 
1062 	if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
1063 	    (!(xcr0 & XFEATURE_MASK_BNDCSR)))
1064 		return 1;
1065 
1066 	if (xcr0 & XFEATURE_MASK_AVX512) {
1067 		if (!(xcr0 & XFEATURE_MASK_YMM))
1068 			return 1;
1069 		if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
1070 			return 1;
1071 	}
1072 
1073 	if ((xcr0 & XFEATURE_MASK_XTILE) &&
1074 	    ((xcr0 & XFEATURE_MASK_XTILE) != XFEATURE_MASK_XTILE))
1075 		return 1;
1076 
1077 	vcpu->arch.xcr0 = xcr0;
1078 
1079 	if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
1080 		kvm_update_cpuid_runtime(vcpu);
1081 	return 0;
1082 }
1083 
1084 int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
1085 {
1086 	/* Note, #UD due to CR4.OSXSAVE=0 has priority over the intercept. */
1087 	if (static_call(kvm_x86_get_cpl)(vcpu) != 0 ||
1088 	    __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) {
1089 		kvm_inject_gp(vcpu, 0);
1090 		return 1;
1091 	}
1092 
1093 	return kvm_skip_emulated_instruction(vcpu);
1094 }
1095 EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv);
1096 
1097 bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1098 {
1099 	if (cr4 & cr4_reserved_bits)
1100 		return false;
1101 
1102 	if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
1103 		return false;
1104 
1105 	return true;
1106 }
1107 EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4);
1108 
1109 static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1110 {
1111 	return __kvm_is_valid_cr4(vcpu, cr4) &&
1112 	       static_call(kvm_x86_is_valid_cr4)(vcpu, cr4);
1113 }
1114 
1115 void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
1116 {
1117 	if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS)
1118 		kvm_mmu_reset_context(vcpu);
1119 
1120 	/*
1121 	 * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB
1122 	 * according to the SDM; however, stale prev_roots could be reused
1123 	 * incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we
1124 	 * free them all.  This is *not* a superset of KVM_REQ_TLB_FLUSH_GUEST
1125 	 * or KVM_REQ_TLB_FLUSH_CURRENT, because the hardware TLB is not flushed,
1126 	 * so fall through.
1127 	 */
1128 	if (!tdp_enabled &&
1129 	    (cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE))
1130 		kvm_mmu_unload(vcpu);
1131 
1132 	/*
1133 	 * The TLB has to be flushed for all PCIDs if any of the following
1134 	 * (architecturally required) changes happen:
1135 	 * - CR4.PCIDE is changed from 1 to 0
1136 	 * - CR4.PGE is toggled
1137 	 *
1138 	 * This is a superset of KVM_REQ_TLB_FLUSH_CURRENT.
1139 	 */
1140 	if (((cr4 ^ old_cr4) & X86_CR4_PGE) ||
1141 	    (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
1142 		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
1143 
1144 	/*
1145 	 * The TLB has to be flushed for the current PCID if any of the
1146 	 * following (architecturally required) changes happen:
1147 	 * - CR4.SMEP is changed from 0 to 1
1148 	 * - CR4.PAE is toggled
1149 	 */
1150 	else if (((cr4 ^ old_cr4) & X86_CR4_PAE) ||
1151 		 ((cr4 & X86_CR4_SMEP) && !(old_cr4 & X86_CR4_SMEP)))
1152 		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
1153 
1154 }
1155 EXPORT_SYMBOL_GPL(kvm_post_set_cr4);
1156 
1157 int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1158 {
1159 	unsigned long old_cr4 = kvm_read_cr4(vcpu);
1160 
1161 	if (!kvm_is_valid_cr4(vcpu, cr4))
1162 		return 1;
1163 
1164 	if (is_long_mode(vcpu)) {
1165 		if (!(cr4 & X86_CR4_PAE))
1166 			return 1;
1167 		if ((cr4 ^ old_cr4) & X86_CR4_LA57)
1168 			return 1;
1169 	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
1170 		   && ((cr4 ^ old_cr4) & X86_CR4_PDPTR_BITS)
1171 		   && !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
1172 		return 1;
1173 
1174 	if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
1175 		if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
1176 			return 1;
1177 
1178 		/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
1179 		if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
1180 			return 1;
1181 	}
1182 
1183 	static_call(kvm_x86_set_cr4)(vcpu, cr4);
1184 
1185 	kvm_post_set_cr4(vcpu, old_cr4, cr4);
1186 
1187 	return 0;
1188 }
1189 EXPORT_SYMBOL_GPL(kvm_set_cr4);
1190 
1191 static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
1192 {
1193 	struct kvm_mmu *mmu = vcpu->arch.mmu;
1194 	unsigned long roots_to_free = 0;
1195 	int i;
1196 
1197 	/*
1198 	 * MOV CR3 and INVPCID are usually not intercepted when using TDP, but
1199 	 * this is reachable when running EPT=1 and unrestricted_guest=0,  and
1200 	 * also via the emulator.  KVM's TDP page tables are not in the scope of
1201 	 * the invalidation, but the guest's TLB entries need to be flushed as
1202 	 * the CPU may have cached entries in its TLB for the target PCID.
1203 	 */
1204 	if (unlikely(tdp_enabled)) {
1205 		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
1206 		return;
1207 	}
1208 
1209 	/*
1210 	 * If neither the current CR3 nor any of the prev_roots use the given
1211 	 * PCID, then nothing needs to be done here because a resync will
1212 	 * happen anyway before switching to any other CR3.
1213 	 */
1214 	if (kvm_get_active_pcid(vcpu) == pcid) {
1215 		kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
1216 		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
1217 	}
1218 
1219 	/*
1220 	 * If PCID is disabled, there is no need to free prev_roots even if the
1221 	 * PCIDs for them are also 0, because MOV to CR3 always flushes the TLB
1222 	 * with PCIDE=0.
1223 	 */
1224 	if (!kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
1225 		return;
1226 
1227 	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
1228 		if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
1229 			roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
1230 
1231 	kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
1232 }
1233 
1234 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1235 {
1236 	bool skip_tlb_flush = false;
1237 	unsigned long pcid = 0;
1238 #ifdef CONFIG_X86_64
1239 	bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
1240 
1241 	if (pcid_enabled) {
1242 		skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
1243 		cr3 &= ~X86_CR3_PCID_NOFLUSH;
1244 		pcid = cr3 & X86_CR3_PCID_MASK;
1245 	}
1246 #endif
1247 
1248 	/* PDPTRs are always reloaded for PAE paging. */
1249 	if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu))
1250 		goto handle_tlb_flush;
1251 
1252 	/*
1253 	 * Do not condition the GPA check on long mode, this helper is used to
1254 	 * stuff CR3, e.g. for RSM emulation, and there is no guarantee that
1255 	 * the current vCPU mode is accurate.
1256 	 */
1257 	if (kvm_vcpu_is_illegal_gpa(vcpu, cr3))
1258 		return 1;
1259 
1260 	if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3))
1261 		return 1;
1262 
1263 	if (cr3 != kvm_read_cr3(vcpu))
1264 		kvm_mmu_new_pgd(vcpu, cr3);
1265 
1266 	vcpu->arch.cr3 = cr3;
1267 	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
1268 	/* Do not call post_set_cr3, we do not get here for confidential guests.  */
1269 
1270 handle_tlb_flush:
1271 	/*
1272 	 * A load of CR3 that flushes the TLB flushes only the current PCID,
1273 	 * even if PCID is disabled, in which case PCID=0 is flushed.  It's a
1274 	 * moot point in the end because _disabling_ PCID will flush all PCIDs,
1275 	 * and it's impossible to use a non-zero PCID when PCID is disabled,
1276 	 * i.e. only PCID=0 can be relevant.
1277 	 */
1278 	if (!skip_tlb_flush)
1279 		kvm_invalidate_pcid(vcpu, pcid);
1280 
1281 	return 0;
1282 }
1283 EXPORT_SYMBOL_GPL(kvm_set_cr3);
1284 
1285 int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
1286 {
1287 	if (cr8 & CR8_RESERVED_BITS)
1288 		return 1;
1289 	if (lapic_in_kernel(vcpu))
1290 		kvm_lapic_set_tpr(vcpu, cr8);
1291 	else
1292 		vcpu->arch.cr8 = cr8;
1293 	return 0;
1294 }
1295 EXPORT_SYMBOL_GPL(kvm_set_cr8);
1296 
1297 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
1298 {
1299 	if (lapic_in_kernel(vcpu))
1300 		return kvm_lapic_get_cr8(vcpu);
1301 	else
1302 		return vcpu->arch.cr8;
1303 }
1304 EXPORT_SYMBOL_GPL(kvm_get_cr8);
1305 
1306 static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
1307 {
1308 	int i;
1309 
1310 	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1311 		for (i = 0; i < KVM_NR_DB_REGS; i++)
1312 			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
1313 	}
1314 }
1315 
1316 void kvm_update_dr7(struct kvm_vcpu *vcpu)
1317 {
1318 	unsigned long dr7;
1319 
1320 	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1321 		dr7 = vcpu->arch.guest_debug_dr7;
1322 	else
1323 		dr7 = vcpu->arch.dr7;
1324 	static_call(kvm_x86_set_dr7)(vcpu, dr7);
1325 	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
1326 	if (dr7 & DR7_BP_EN_MASK)
1327 		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
1328 }
1329 EXPORT_SYMBOL_GPL(kvm_update_dr7);
1330 
1331 static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
1332 {
1333 	u64 fixed = DR6_FIXED_1;
1334 
1335 	if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
1336 		fixed |= DR6_RTM;
1337 
1338 	if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
1339 		fixed |= DR6_BUS_LOCK;
1340 	return fixed;
1341 }
1342 
1343 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
1344 {
1345 	size_t size = ARRAY_SIZE(vcpu->arch.db);
1346 
1347 	switch (dr) {
1348 	case 0 ... 3:
1349 		vcpu->arch.db[array_index_nospec(dr, size)] = val;
1350 		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1351 			vcpu->arch.eff_db[dr] = val;
1352 		break;
1353 	case 4:
1354 	case 6:
1355 		if (!kvm_dr6_valid(val))
1356 			return 1; /* #GP */
1357 		vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
1358 		break;
1359 	case 5:
1360 	default: /* 7 */
1361 		if (!kvm_dr7_valid(val))
1362 			return 1; /* #GP */
1363 		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
1364 		kvm_update_dr7(vcpu);
1365 		break;
1366 	}
1367 
1368 	return 0;
1369 }
1370 EXPORT_SYMBOL_GPL(kvm_set_dr);
1371 
1372 void kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
1373 {
1374 	size_t size = ARRAY_SIZE(vcpu->arch.db);
1375 
1376 	switch (dr) {
1377 	case 0 ... 3:
1378 		*val = vcpu->arch.db[array_index_nospec(dr, size)];
1379 		break;
1380 	case 4:
1381 	case 6:
1382 		*val = vcpu->arch.dr6;
1383 		break;
1384 	case 5:
1385 	default: /* 7 */
1386 		*val = vcpu->arch.dr7;
1387 		break;
1388 	}
1389 }
1390 EXPORT_SYMBOL_GPL(kvm_get_dr);
1391 
1392 int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
1393 {
1394 	u32 ecx = kvm_rcx_read(vcpu);
1395 	u64 data;
1396 
1397 	if (kvm_pmu_rdpmc(vcpu, ecx, &data)) {
1398 		kvm_inject_gp(vcpu, 0);
1399 		return 1;
1400 	}
1401 
1402 	kvm_rax_write(vcpu, (u32)data);
1403 	kvm_rdx_write(vcpu, data >> 32);
1404 	return kvm_skip_emulated_instruction(vcpu);
1405 }
1406 EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);
1407 
1408 /*
1409  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
1410  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
1411  *
1412  * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features)
1413  * extract the supported MSRs from the related const lists.
1414  * msrs_to_save is selected from the msrs_to_save_all to reflect the
1415  * capabilities of the host cpu. This capabilities test skips MSRs that are
1416  * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs
1417  * may depend on host virtualization features rather than host cpu features.
1418  */
1419 
1420 static const u32 msrs_to_save_all[] = {
1421 	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
1422 	MSR_STAR,
1423 #ifdef CONFIG_X86_64
1424 	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
1425 #endif
1426 	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
1427 	MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
1428 	MSR_IA32_SPEC_CTRL,
1429 	MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
1430 	MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
1431 	MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
1432 	MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
1433 	MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
1434 	MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
1435 	MSR_IA32_UMWAIT_CONTROL,
1436 
1437 	MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
1438 	MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
1439 	MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
1440 	MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
1441 	MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,
1442 
1443 	/* This part of MSRs should match KVM_INTEL_PMC_MAX_GENERIC. */
1444 	MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
1445 	MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
1446 	MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
1447 	MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
1448 	MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
1449 	MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
1450 	MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
1451 	MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
1452 
1453 	MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
1454 	MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
1455 
1456 	/* This part of MSRs should match KVM_AMD_PMC_MAX_GENERIC. */
1457 	MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
1458 	MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
1459 	MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
1460 	MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
1461 
1462 	MSR_IA32_XFD, MSR_IA32_XFD_ERR,
1463 };
1464 
1465 static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
1466 static unsigned num_msrs_to_save;
1467 
1468 static const u32 emulated_msrs_all[] = {
1469 	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
1470 	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
1471 	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
1472 	HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
1473 	HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
1474 	HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
1475 	HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
1476 	HV_X64_MSR_RESET,
1477 	HV_X64_MSR_VP_INDEX,
1478 	HV_X64_MSR_VP_RUNTIME,
1479 	HV_X64_MSR_SCONTROL,
1480 	HV_X64_MSR_STIMER0_CONFIG,
1481 	HV_X64_MSR_VP_ASSIST_PAGE,
1482 	HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
1483 	HV_X64_MSR_TSC_EMULATION_STATUS,
1484 	HV_X64_MSR_SYNDBG_OPTIONS,
1485 	HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
1486 	HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
1487 	HV_X64_MSR_SYNDBG_PENDING_BUFFER,
1488 
1489 	MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
1490 	MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
1491 
1492 	MSR_IA32_TSC_ADJUST,
1493 	MSR_IA32_TSC_DEADLINE,
1494 	MSR_IA32_ARCH_CAPABILITIES,
1495 	MSR_IA32_PERF_CAPABILITIES,
1496 	MSR_IA32_MISC_ENABLE,
1497 	MSR_IA32_MCG_STATUS,
1498 	MSR_IA32_MCG_CTL,
1499 	MSR_IA32_MCG_EXT_CTL,
1500 	MSR_IA32_SMBASE,
1501 	MSR_SMI_COUNT,
1502 	MSR_PLATFORM_INFO,
1503 	MSR_MISC_FEATURES_ENABLES,
1504 	MSR_AMD64_VIRT_SPEC_CTRL,
1505 	MSR_AMD64_TSC_RATIO,
1506 	MSR_IA32_POWER_CTL,
1507 	MSR_IA32_UCODE_REV,
1508 
1509 	/*
1510 	 * The following list leaves out MSRs whose values are determined
1511 	 * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
1512 	 * We always support the "true" VMX control MSRs, even if the host
1513 	 * processor does not, so I am putting these registers here rather
1514 	 * than in msrs_to_save_all.
1515 	 */
1516 	MSR_IA32_VMX_BASIC,
1517 	MSR_IA32_VMX_TRUE_PINBASED_CTLS,
1518 	MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
1519 	MSR_IA32_VMX_TRUE_EXIT_CTLS,
1520 	MSR_IA32_VMX_TRUE_ENTRY_CTLS,
1521 	MSR_IA32_VMX_MISC,
1522 	MSR_IA32_VMX_CR0_FIXED0,
1523 	MSR_IA32_VMX_CR4_FIXED0,
1524 	MSR_IA32_VMX_VMCS_ENUM,
1525 	MSR_IA32_VMX_PROCBASED_CTLS2,
1526 	MSR_IA32_VMX_EPT_VPID_CAP,
1527 	MSR_IA32_VMX_VMFUNC,
1528 
1529 	MSR_K7_HWCR,
1530 	MSR_KVM_POLL_CONTROL,
1531 };
1532 
1533 static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
1534 static unsigned num_emulated_msrs;
1535 
1536 /*
1537  * List of msr numbers which are used to expose MSR-based features that
1538  * can be used by a hypervisor to validate requested CPU features.
1539  */
1540 static const u32 msr_based_features_all[] = {
1541 	MSR_IA32_VMX_BASIC,
1542 	MSR_IA32_VMX_TRUE_PINBASED_CTLS,
1543 	MSR_IA32_VMX_PINBASED_CTLS,
1544 	MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
1545 	MSR_IA32_VMX_PROCBASED_CTLS,
1546 	MSR_IA32_VMX_TRUE_EXIT_CTLS,
1547 	MSR_IA32_VMX_EXIT_CTLS,
1548 	MSR_IA32_VMX_TRUE_ENTRY_CTLS,
1549 	MSR_IA32_VMX_ENTRY_CTLS,
1550 	MSR_IA32_VMX_MISC,
1551 	MSR_IA32_VMX_CR0_FIXED0,
1552 	MSR_IA32_VMX_CR0_FIXED1,
1553 	MSR_IA32_VMX_CR4_FIXED0,
1554 	MSR_IA32_VMX_CR4_FIXED1,
1555 	MSR_IA32_VMX_VMCS_ENUM,
1556 	MSR_IA32_VMX_PROCBASED_CTLS2,
1557 	MSR_IA32_VMX_EPT_VPID_CAP,
1558 	MSR_IA32_VMX_VMFUNC,
1559 
1560 	MSR_AMD64_DE_CFG,
1561 	MSR_IA32_UCODE_REV,
1562 	MSR_IA32_ARCH_CAPABILITIES,
1563 	MSR_IA32_PERF_CAPABILITIES,
1564 };
1565 
1566 static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
1567 static unsigned int num_msr_based_features;
1568 
1569 /*
1570  * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
1571  * does not yet virtualize. These include:
1572  *   10 - MISC_PACKAGE_CTRLS
1573  *   11 - ENERGY_FILTERING_CTL
1574  *   12 - DOITM
1575  *   18 - FB_CLEAR_CTRL
1576  *   21 - XAPIC_DISABLE_STATUS
1577  *   23 - OVERCLOCKING_STATUS
1578  */
1579 
1580 #define KVM_SUPPORTED_ARCH_CAP \
1581 	(ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \
1582 	 ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
1583 	 ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
1584 	 ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
1585 	 ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO)
1586 
1587 static u64 kvm_get_arch_capabilities(void)
1588 {
1589 	u64 data = 0;
1590 
1591 	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
1592 		rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
1593 		data &= KVM_SUPPORTED_ARCH_CAP;
1594 	}
1595 
1596 	/*
1597 	 * If nx_huge_pages is enabled, KVM's shadow paging will ensure that
1598 	 * the nested hypervisor runs with NX huge pages.  If it is not,
1599 	 * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other
1600 	 * L1 guests, so it need not worry about its own (L2) guests.
1601 	 */
1602 	data |= ARCH_CAP_PSCHANGE_MC_NO;
1603 
1604 	/*
1605 	 * If we're doing cache flushes (either "always" or "cond")
1606 	 * we will do one whenever the guest does a vmlaunch/vmresume.
1607 	 * If an outer hypervisor is doing the cache flush for us
1608 	 * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
1609 	 * capability to the guest too, and if EPT is disabled we're not
1610 	 * vulnerable.  Overall, only VMENTER_L1D_FLUSH_NEVER will
1611 	 * require a nested hypervisor to do a flush of its own.
1612 	 */
1613 	if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
1614 		data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
1615 
1616 	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
1617 		data |= ARCH_CAP_RDCL_NO;
1618 	if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
1619 		data |= ARCH_CAP_SSB_NO;
1620 	if (!boot_cpu_has_bug(X86_BUG_MDS))
1621 		data |= ARCH_CAP_MDS_NO;
1622 
1623 	if (!boot_cpu_has(X86_FEATURE_RTM)) {
1624 		/*
1625 		 * If RTM=0 because the kernel has disabled TSX, the host might
1626 		 * have TAA_NO or TSX_CTRL.  Clear TAA_NO (the guest sees RTM=0
1627 		 * and therefore knows that there cannot be TAA) but keep
1628 		 * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts,
1629 		 * and we want to allow migrating those guests to tsx=off hosts.
1630 		 */
1631 		data &= ~ARCH_CAP_TAA_NO;
1632 	} else if (!boot_cpu_has_bug(X86_BUG_TAA)) {
1633 		data |= ARCH_CAP_TAA_NO;
1634 	} else {
1635 		/*
1636 		 * Nothing to do here; we emulate TSX_CTRL if present on the
1637 		 * host so the guest can choose between disabling TSX or
1638 		 * using VERW to clear CPU buffers.
1639 		 */
1640 	}
1641 
1642 	return data;
1643 }
1644 
1645 static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
1646 {
1647 	switch (msr->index) {
1648 	case MSR_IA32_ARCH_CAPABILITIES:
1649 		msr->data = kvm_get_arch_capabilities();
1650 		break;
1651 	case MSR_IA32_PERF_CAPABILITIES:
1652 		msr->data = kvm_caps.supported_perf_cap;
1653 		break;
1654 	case MSR_IA32_UCODE_REV:
1655 		rdmsrl_safe(msr->index, &msr->data);
1656 		break;
1657 	default:
1658 		return static_call(kvm_x86_get_msr_feature)(msr);
1659 	}
1660 	return 0;
1661 }
1662 
1663 static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
1664 {
1665 	struct kvm_msr_entry msr;
1666 	int r;
1667 
1668 	msr.index = index;
1669 	r = kvm_get_msr_feature(&msr);
1670 
1671 	if (r == KVM_MSR_RET_INVALID) {
1672 		/* Unconditionally clear the output for simplicity */
1673 		*data = 0;
1674 		if (kvm_msr_ignored_check(index, 0, false))
1675 			r = 0;
1676 	}
1677 
1678 	if (r)
1679 		return r;
1680 
1681 	*data = msr.data;
1682 
1683 	return 0;
1684 }
1685 
1686 static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1687 {
1688 	if (efer & EFER_AUTOIBRS && !guest_cpuid_has(vcpu, X86_FEATURE_AUTOIBRS))
1689 		return false;
1690 
1691 	if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
1692 		return false;
1693 
1694 	if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
1695 		return false;
1696 
1697 	if (efer & (EFER_LME | EFER_LMA) &&
1698 	    !guest_cpuid_has(vcpu, X86_FEATURE_LM))
1699 		return false;
1700 
1701 	if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX))
1702 		return false;
1703 
1704 	return true;
1705 
1706 }
1707 bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1708 {
1709 	if (efer & efer_reserved_bits)
1710 		return false;
1711 
1712 	return __kvm_valid_efer(vcpu, efer);
1713 }
1714 EXPORT_SYMBOL_GPL(kvm_valid_efer);
1715 
1716 static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
1717 {
1718 	u64 old_efer = vcpu->arch.efer;
1719 	u64 efer = msr_info->data;
1720 	int r;
1721 
1722 	if (efer & efer_reserved_bits)
1723 		return 1;
1724 
1725 	if (!msr_info->host_initiated) {
1726 		if (!__kvm_valid_efer(vcpu, efer))
1727 			return 1;
1728 
1729 		if (is_paging(vcpu) &&
1730 		    (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
1731 			return 1;
1732 	}
1733 
1734 	efer &= ~EFER_LMA;
1735 	efer |= vcpu->arch.efer & EFER_LMA;
1736 
1737 	r = static_call(kvm_x86_set_efer)(vcpu, efer);
1738 	if (r) {
1739 		WARN_ON(r > 0);
1740 		return r;
1741 	}
1742 
1743 	if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS)
1744 		kvm_mmu_reset_context(vcpu);
1745 
1746 	return 0;
1747 }
1748 
1749 void kvm_enable_efer_bits(u64 mask)
1750 {
1751        efer_reserved_bits &= ~mask;
1752 }
1753 EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
1754 
1755 bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
1756 {
1757 	struct kvm_x86_msr_filter *msr_filter;
1758 	struct msr_bitmap_range *ranges;
1759 	struct kvm *kvm = vcpu->kvm;
1760 	bool allowed;
1761 	int idx;
1762 	u32 i;
1763 
1764 	/* x2APIC MSRs do not support filtering. */
1765 	if (index >= 0x800 && index <= 0x8ff)
1766 		return true;
1767 
1768 	idx = srcu_read_lock(&kvm->srcu);
1769 
1770 	msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
1771 	if (!msr_filter) {
1772 		allowed = true;
1773 		goto out;
1774 	}
1775 
1776 	allowed = msr_filter->default_allow;
1777 	ranges = msr_filter->ranges;
1778 
1779 	for (i = 0; i < msr_filter->count; i++) {
1780 		u32 start = ranges[i].base;
1781 		u32 end = start + ranges[i].nmsrs;
1782 		u32 flags = ranges[i].flags;
1783 		unsigned long *bitmap = ranges[i].bitmap;
1784 
1785 		if ((index >= start) && (index < end) && (flags & type)) {
1786 			allowed = !!test_bit(index - start, bitmap);
1787 			break;
1788 		}
1789 	}
1790 
1791 out:
1792 	srcu_read_unlock(&kvm->srcu, idx);
1793 
1794 	return allowed;
1795 }
1796 EXPORT_SYMBOL_GPL(kvm_msr_allowed);
1797 
1798 /*
1799  * Write @data into the MSR specified by @index.  Select MSR specific fault
1800  * checks are bypassed if @host_initiated is %true.
1801  * Returns 0 on success, non-0 otherwise.
1802  * Assumes vcpu_load() was already called.
1803  */
1804 static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
1805 			 bool host_initiated)
1806 {
1807 	struct msr_data msr;
1808 
1809 	switch (index) {
1810 	case MSR_FS_BASE:
1811 	case MSR_GS_BASE:
1812 	case MSR_KERNEL_GS_BASE:
1813 	case MSR_CSTAR:
1814 	case MSR_LSTAR:
1815 		if (is_noncanonical_address(data, vcpu))
1816 			return 1;
1817 		break;
1818 	case MSR_IA32_SYSENTER_EIP:
1819 	case MSR_IA32_SYSENTER_ESP:
1820 		/*
1821 		 * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
1822 		 * non-canonical address is written on Intel but not on
1823 		 * AMD (which ignores the top 32-bits, because it does
1824 		 * not implement 64-bit SYSENTER).
1825 		 *
1826 		 * 64-bit code should hence be able to write a non-canonical
1827 		 * value on AMD.  Making the address canonical ensures that
1828 		 * vmentry does not fail on Intel after writing a non-canonical
1829 		 * value, and that something deterministic happens if the guest
1830 		 * invokes 64-bit SYSENTER.
1831 		 */
1832 		data = __canonical_address(data, vcpu_virt_addr_bits(vcpu));
1833 		break;
1834 	case MSR_TSC_AUX:
1835 		if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
1836 			return 1;
1837 
1838 		if (!host_initiated &&
1839 		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
1840 		    !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
1841 			return 1;
1842 
1843 		/*
1844 		 * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has
1845 		 * incomplete and conflicting architectural behavior.  Current
1846 		 * AMD CPUs completely ignore bits 63:32, i.e. they aren't
1847 		 * reserved and always read as zeros.  Enforce Intel's reserved
1848 		 * bits check if and only if the guest CPU is Intel, and clear
1849 		 * the bits in all other cases.  This ensures cross-vendor
1850 		 * migration will provide consistent behavior for the guest.
1851 		 */
1852 		if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0)
1853 			return 1;
1854 
1855 		data = (u32)data;
1856 		break;
1857 	}
1858 
1859 	msr.data = data;
1860 	msr.index = index;
1861 	msr.host_initiated = host_initiated;
1862 
1863 	return static_call(kvm_x86_set_msr)(vcpu, &msr);
1864 }
1865 
1866 static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
1867 				     u32 index, u64 data, bool host_initiated)
1868 {
1869 	int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
1870 
1871 	if (ret == KVM_MSR_RET_INVALID)
1872 		if (kvm_msr_ignored_check(index, data, true))
1873 			ret = 0;
1874 
1875 	return ret;
1876 }
1877 
1878 /*
1879  * Read the MSR specified by @index into @data.  Select MSR specific fault
1880  * checks are bypassed if @host_initiated is %true.
1881  * Returns 0 on success, non-0 otherwise.
1882  * Assumes vcpu_load() was already called.
1883  */
1884 int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
1885 		  bool host_initiated)
1886 {
1887 	struct msr_data msr;
1888 	int ret;
1889 
1890 	switch (index) {
1891 	case MSR_TSC_AUX:
1892 		if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX))
1893 			return 1;
1894 
1895 		if (!host_initiated &&
1896 		    !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) &&
1897 		    !guest_cpuid_has(vcpu, X86_FEATURE_RDPID))
1898 			return 1;
1899 		break;
1900 	}
1901 
1902 	msr.index = index;
1903 	msr.host_initiated = host_initiated;
1904 
1905 	ret = static_call(kvm_x86_get_msr)(vcpu, &msr);
1906 	if (!ret)
1907 		*data = msr.data;
1908 	return ret;
1909 }
1910 
1911 static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
1912 				     u32 index, u64 *data, bool host_initiated)
1913 {
1914 	int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
1915 
1916 	if (ret == KVM_MSR_RET_INVALID) {
1917 		/* Unconditionally clear *data for simplicity */
1918 		*data = 0;
1919 		if (kvm_msr_ignored_check(index, 0, false))
1920 			ret = 0;
1921 	}
1922 
1923 	return ret;
1924 }
1925 
1926 static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data)
1927 {
1928 	if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
1929 		return KVM_MSR_RET_FILTERED;
1930 	return kvm_get_msr_ignored_check(vcpu, index, data, false);
1931 }
1932 
1933 static int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data)
1934 {
1935 	if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
1936 		return KVM_MSR_RET_FILTERED;
1937 	return kvm_set_msr_ignored_check(vcpu, index, data, false);
1938 }
1939 
1940 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
1941 {
1942 	return kvm_get_msr_ignored_check(vcpu, index, data, false);
1943 }
1944 EXPORT_SYMBOL_GPL(kvm_get_msr);
1945 
1946 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
1947 {
1948 	return kvm_set_msr_ignored_check(vcpu, index, data, false);
1949 }
1950 EXPORT_SYMBOL_GPL(kvm_set_msr);
1951 
1952 static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu)
1953 {
1954 	if (!vcpu->run->msr.error) {
1955 		kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
1956 		kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
1957 	}
1958 }
1959 
1960 static int complete_emulated_msr_access(struct kvm_vcpu *vcpu)
1961 {
1962 	return complete_emulated_insn_gp(vcpu, vcpu->run->msr.error);
1963 }
1964 
1965 static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
1966 {
1967 	complete_userspace_rdmsr(vcpu);
1968 	return complete_emulated_msr_access(vcpu);
1969 }
1970 
1971 static int complete_fast_msr_access(struct kvm_vcpu *vcpu)
1972 {
1973 	return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error);
1974 }
1975 
1976 static int complete_fast_rdmsr(struct kvm_vcpu *vcpu)
1977 {
1978 	complete_userspace_rdmsr(vcpu);
1979 	return complete_fast_msr_access(vcpu);
1980 }
1981 
1982 static u64 kvm_msr_reason(int r)
1983 {
1984 	switch (r) {
1985 	case KVM_MSR_RET_INVALID:
1986 		return KVM_MSR_EXIT_REASON_UNKNOWN;
1987 	case KVM_MSR_RET_FILTERED:
1988 		return KVM_MSR_EXIT_REASON_FILTER;
1989 	default:
1990 		return KVM_MSR_EXIT_REASON_INVAL;
1991 	}
1992 }
1993 
1994 static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
1995 			      u32 exit_reason, u64 data,
1996 			      int (*completion)(struct kvm_vcpu *vcpu),
1997 			      int r)
1998 {
1999 	u64 msr_reason = kvm_msr_reason(r);
2000 
2001 	/* Check if the user wanted to know about this MSR fault */
2002 	if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
2003 		return 0;
2004 
2005 	vcpu->run->exit_reason = exit_reason;
2006 	vcpu->run->msr.error = 0;
2007 	memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
2008 	vcpu->run->msr.reason = msr_reason;
2009 	vcpu->run->msr.index = index;
2010 	vcpu->run->msr.data = data;
2011 	vcpu->arch.complete_userspace_io = completion;
2012 
2013 	return 1;
2014 }
2015 
2016 int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
2017 {
2018 	u32 ecx = kvm_rcx_read(vcpu);
2019 	u64 data;
2020 	int r;
2021 
2022 	r = kvm_get_msr_with_filter(vcpu, ecx, &data);
2023 
2024 	if (!r) {
2025 		trace_kvm_msr_read(ecx, data);
2026 
2027 		kvm_rax_write(vcpu, data & -1u);
2028 		kvm_rdx_write(vcpu, (data >> 32) & -1u);
2029 	} else {
2030 		/* MSR read failed? See if we should ask user space */
2031 		if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_RDMSR, 0,
2032 				       complete_fast_rdmsr, r))
2033 			return 0;
2034 		trace_kvm_msr_read_ex(ecx);
2035 	}
2036 
2037 	return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
2038 }
2039 EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
2040 
2041 int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
2042 {
2043 	u32 ecx = kvm_rcx_read(vcpu);
2044 	u64 data = kvm_read_edx_eax(vcpu);
2045 	int r;
2046 
2047 	r = kvm_set_msr_with_filter(vcpu, ecx, data);
2048 
2049 	if (!r) {
2050 		trace_kvm_msr_write(ecx, data);
2051 	} else {
2052 		/* MSR write failed? See if we should ask user space */
2053 		if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_WRMSR, data,
2054 				       complete_fast_msr_access, r))
2055 			return 0;
2056 		/* Signal all other negative errors to userspace */
2057 		if (r < 0)
2058 			return r;
2059 		trace_kvm_msr_write_ex(ecx, data);
2060 	}
2061 
2062 	return static_call(kvm_x86_complete_emulated_msr)(vcpu, r);
2063 }
2064 EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
2065 
2066 int kvm_emulate_as_nop(struct kvm_vcpu *vcpu)
2067 {
2068 	return kvm_skip_emulated_instruction(vcpu);
2069 }
2070 
2071 int kvm_emulate_invd(struct kvm_vcpu *vcpu)
2072 {
2073 	/* Treat an INVD instruction as a NOP and just skip it. */
2074 	return kvm_emulate_as_nop(vcpu);
2075 }
2076 EXPORT_SYMBOL_GPL(kvm_emulate_invd);
2077 
2078 int kvm_handle_invalid_op(struct kvm_vcpu *vcpu)
2079 {
2080 	kvm_queue_exception(vcpu, UD_VECTOR);
2081 	return 1;
2082 }
2083 EXPORT_SYMBOL_GPL(kvm_handle_invalid_op);
2084 
2085 
2086 static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn)
2087 {
2088 	if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) &&
2089 	    !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT))
2090 		return kvm_handle_invalid_op(vcpu);
2091 
2092 	pr_warn_once("kvm: %s instruction emulated as NOP!\n", insn);
2093 	return kvm_emulate_as_nop(vcpu);
2094 }
2095 int kvm_emulate_mwait(struct kvm_vcpu *vcpu)
2096 {
2097 	return kvm_emulate_monitor_mwait(vcpu, "MWAIT");
2098 }
2099 EXPORT_SYMBOL_GPL(kvm_emulate_mwait);
2100 
2101 int kvm_emulate_monitor(struct kvm_vcpu *vcpu)
2102 {
2103 	return kvm_emulate_monitor_mwait(vcpu, "MONITOR");
2104 }
2105 EXPORT_SYMBOL_GPL(kvm_emulate_monitor);
2106 
2107 static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
2108 {
2109 	xfer_to_guest_mode_prepare();
2110 	return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
2111 		xfer_to_guest_mode_work_pending();
2112 }
2113 
2114 /*
2115  * The fast path for frequent and performance sensitive wrmsr emulation,
2116  * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces
2117  * the latency of virtual IPI by avoiding the expensive bits of transitioning
2118  * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the
2119  * other cases which must be called after interrupts are enabled on the host.
2120  */
2121 static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
2122 {
2123 	if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic))
2124 		return 1;
2125 
2126 	if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
2127 	    ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
2128 	    ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
2129 	    ((u32)(data >> 32) != X2APIC_BROADCAST))
2130 		return kvm_x2apic_icr_write(vcpu->arch.apic, data);
2131 
2132 	return 1;
2133 }
2134 
2135 static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
2136 {
2137 	if (!kvm_can_use_hv_timer(vcpu))
2138 		return 1;
2139 
2140 	kvm_set_lapic_tscdeadline_msr(vcpu, data);
2141 	return 0;
2142 }
2143 
2144 fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
2145 {
2146 	u32 msr = kvm_rcx_read(vcpu);
2147 	u64 data;
2148 	fastpath_t ret = EXIT_FASTPATH_NONE;
2149 
2150 	switch (msr) {
2151 	case APIC_BASE_MSR + (APIC_ICR >> 4):
2152 		data = kvm_read_edx_eax(vcpu);
2153 		if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
2154 			kvm_skip_emulated_instruction(vcpu);
2155 			ret = EXIT_FASTPATH_EXIT_HANDLED;
2156 		}
2157 		break;
2158 	case MSR_IA32_TSC_DEADLINE:
2159 		data = kvm_read_edx_eax(vcpu);
2160 		if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
2161 			kvm_skip_emulated_instruction(vcpu);
2162 			ret = EXIT_FASTPATH_REENTER_GUEST;
2163 		}
2164 		break;
2165 	default:
2166 		break;
2167 	}
2168 
2169 	if (ret != EXIT_FASTPATH_NONE)
2170 		trace_kvm_msr_write(msr, data);
2171 
2172 	return ret;
2173 }
2174 EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
2175 
2176 /*
2177  * Adapt set_msr() to msr_io()'s calling convention
2178  */
2179 static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
2180 {
2181 	return kvm_get_msr_ignored_check(vcpu, index, data, true);
2182 }
2183 
2184 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
2185 {
2186 	return kvm_set_msr_ignored_check(vcpu, index, *data, true);
2187 }
2188 
2189 #ifdef CONFIG_X86_64
2190 struct pvclock_clock {
2191 	int vclock_mode;
2192 	u64 cycle_last;
2193 	u64 mask;
2194 	u32 mult;
2195 	u32 shift;
2196 	u64 base_cycles;
2197 	u64 offset;
2198 };
2199 
2200 struct pvclock_gtod_data {
2201 	seqcount_t	seq;
2202 
2203 	struct pvclock_clock clock; /* extract of a clocksource struct */
2204 	struct pvclock_clock raw_clock; /* extract of a clocksource struct */
2205 
2206 	ktime_t		offs_boot;
2207 	u64		wall_time_sec;
2208 };
2209 
2210 static struct pvclock_gtod_data pvclock_gtod_data;
2211 
2212 static void update_pvclock_gtod(struct timekeeper *tk)
2213 {
2214 	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
2215 
2216 	write_seqcount_begin(&vdata->seq);
2217 
2218 	/* copy pvclock gtod data */
2219 	vdata->clock.vclock_mode	= tk->tkr_mono.clock->vdso_clock_mode;
2220 	vdata->clock.cycle_last		= tk->tkr_mono.cycle_last;
2221 	vdata->clock.mask		= tk->tkr_mono.mask;
2222 	vdata->clock.mult		= tk->tkr_mono.mult;
2223 	vdata->clock.shift		= tk->tkr_mono.shift;
2224 	vdata->clock.base_cycles	= tk->tkr_mono.xtime_nsec;
2225 	vdata->clock.offset		= tk->tkr_mono.base;
2226 
2227 	vdata->raw_clock.vclock_mode	= tk->tkr_raw.clock->vdso_clock_mode;
2228 	vdata->raw_clock.cycle_last	= tk->tkr_raw.cycle_last;
2229 	vdata->raw_clock.mask		= tk->tkr_raw.mask;
2230 	vdata->raw_clock.mult		= tk->tkr_raw.mult;
2231 	vdata->raw_clock.shift		= tk->tkr_raw.shift;
2232 	vdata->raw_clock.base_cycles	= tk->tkr_raw.xtime_nsec;
2233 	vdata->raw_clock.offset		= tk->tkr_raw.base;
2234 
2235 	vdata->wall_time_sec            = tk->xtime_sec;
2236 
2237 	vdata->offs_boot		= tk->offs_boot;
2238 
2239 	write_seqcount_end(&vdata->seq);
2240 }
2241 
2242 static s64 get_kvmclock_base_ns(void)
2243 {
2244 	/* Count up from boot time, but with the frequency of the raw clock.  */
2245 	return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
2246 }
2247 #else
2248 static s64 get_kvmclock_base_ns(void)
2249 {
2250 	/* Master clock not used, so we can just use CLOCK_BOOTTIME.  */
2251 	return ktime_get_boottime_ns();
2252 }
2253 #endif
2254 
2255 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs)
2256 {
2257 	int version;
2258 	int r;
2259 	struct pvclock_wall_clock wc;
2260 	u32 wc_sec_hi;
2261 	u64 wall_nsec;
2262 
2263 	if (!wall_clock)
2264 		return;
2265 
2266 	r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
2267 	if (r)
2268 		return;
2269 
2270 	if (version & 1)
2271 		++version;  /* first time write, random junk */
2272 
2273 	++version;
2274 
2275 	if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
2276 		return;
2277 
2278 	/*
2279 	 * The guest calculates current wall clock time by adding
2280 	 * system time (updated by kvm_guest_time_update below) to the
2281 	 * wall clock specified here.  We do the reverse here.
2282 	 */
2283 	wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
2284 
2285 	wc.nsec = do_div(wall_nsec, 1000000000);
2286 	wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
2287 	wc.version = version;
2288 
2289 	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
2290 
2291 	if (sec_hi_ofs) {
2292 		wc_sec_hi = wall_nsec >> 32;
2293 		kvm_write_guest(kvm, wall_clock + sec_hi_ofs,
2294 				&wc_sec_hi, sizeof(wc_sec_hi));
2295 	}
2296 
2297 	version++;
2298 	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
2299 }
2300 
2301 static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
2302 				  bool old_msr, bool host_initiated)
2303 {
2304 	struct kvm_arch *ka = &vcpu->kvm->arch;
2305 
2306 	if (vcpu->vcpu_id == 0 && !host_initiated) {
2307 		if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
2308 			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2309 
2310 		ka->boot_vcpu_runs_old_kvmclock = old_msr;
2311 	}
2312 
2313 	vcpu->arch.time = system_time;
2314 	kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
2315 
2316 	/* we verify if the enable bit is set... */
2317 	if (system_time & 1)
2318 		kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL,
2319 				 sizeof(struct pvclock_vcpu_time_info));
2320 	else
2321 		kvm_gpc_deactivate(&vcpu->arch.pv_time);
2322 
2323 	return;
2324 }
2325 
2326 static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
2327 {
2328 	do_shl32_div32(dividend, divisor);
2329 	return dividend;
2330 }
2331 
2332 static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
2333 			       s8 *pshift, u32 *pmultiplier)
2334 {
2335 	uint64_t scaled64;
2336 	int32_t  shift = 0;
2337 	uint64_t tps64;
2338 	uint32_t tps32;
2339 
2340 	tps64 = base_hz;
2341 	scaled64 = scaled_hz;
2342 	while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
2343 		tps64 >>= 1;
2344 		shift--;
2345 	}
2346 
2347 	tps32 = (uint32_t)tps64;
2348 	while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
2349 		if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
2350 			scaled64 >>= 1;
2351 		else
2352 			tps32 <<= 1;
2353 		shift++;
2354 	}
2355 
2356 	*pshift = shift;
2357 	*pmultiplier = div_frac(scaled64, tps32);
2358 }
2359 
2360 #ifdef CONFIG_X86_64
2361 static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
2362 #endif
2363 
2364 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
2365 static unsigned long max_tsc_khz;
2366 
2367 static u32 adjust_tsc_khz(u32 khz, s32 ppm)
2368 {
2369 	u64 v = (u64)khz * (1000000 + ppm);
2370 	do_div(v, 1000000);
2371 	return v;
2372 }
2373 
2374 static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier);
2375 
2376 static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
2377 {
2378 	u64 ratio;
2379 
2380 	/* Guest TSC same frequency as host TSC? */
2381 	if (!scale) {
2382 		kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio);
2383 		return 0;
2384 	}
2385 
2386 	/* TSC scaling supported? */
2387 	if (!kvm_caps.has_tsc_control) {
2388 		if (user_tsc_khz > tsc_khz) {
2389 			vcpu->arch.tsc_catchup = 1;
2390 			vcpu->arch.tsc_always_catchup = 1;
2391 			return 0;
2392 		} else {
2393 			pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
2394 			return -1;
2395 		}
2396 	}
2397 
2398 	/* TSC scaling required  - calculate ratio */
2399 	ratio = mul_u64_u32_div(1ULL << kvm_caps.tsc_scaling_ratio_frac_bits,
2400 				user_tsc_khz, tsc_khz);
2401 
2402 	if (ratio == 0 || ratio >= kvm_caps.max_tsc_scaling_ratio) {
2403 		pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
2404 			            user_tsc_khz);
2405 		return -1;
2406 	}
2407 
2408 	kvm_vcpu_write_tsc_multiplier(vcpu, ratio);
2409 	return 0;
2410 }
2411 
2412 static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
2413 {
2414 	u32 thresh_lo, thresh_hi;
2415 	int use_scaling = 0;
2416 
2417 	/* tsc_khz can be zero if TSC calibration fails */
2418 	if (user_tsc_khz == 0) {
2419 		/* set tsc_scaling_ratio to a safe value */
2420 		kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio);
2421 		return -1;
2422 	}
2423 
2424 	/* Compute a scale to convert nanoseconds in TSC cycles */
2425 	kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
2426 			   &vcpu->arch.virtual_tsc_shift,
2427 			   &vcpu->arch.virtual_tsc_mult);
2428 	vcpu->arch.virtual_tsc_khz = user_tsc_khz;
2429 
2430 	/*
2431 	 * Compute the variation in TSC rate which is acceptable
2432 	 * within the range of tolerance and decide if the
2433 	 * rate being applied is within that bounds of the hardware
2434 	 * rate.  If so, no scaling or compensation need be done.
2435 	 */
2436 	thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
2437 	thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
2438 	if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) {
2439 		pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
2440 		use_scaling = 1;
2441 	}
2442 	return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
2443 }
2444 
2445 static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
2446 {
2447 	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
2448 				      vcpu->arch.virtual_tsc_mult,
2449 				      vcpu->arch.virtual_tsc_shift);
2450 	tsc += vcpu->arch.this_tsc_write;
2451 	return tsc;
2452 }
2453 
2454 #ifdef CONFIG_X86_64
2455 static inline int gtod_is_based_on_tsc(int mode)
2456 {
2457 	return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
2458 }
2459 #endif
2460 
2461 static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
2462 {
2463 #ifdef CONFIG_X86_64
2464 	bool vcpus_matched;
2465 	struct kvm_arch *ka = &vcpu->kvm->arch;
2466 	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
2467 
2468 	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
2469 			 atomic_read(&vcpu->kvm->online_vcpus));
2470 
2471 	/*
2472 	 * Once the masterclock is enabled, always perform request in
2473 	 * order to update it.
2474 	 *
2475 	 * In order to enable masterclock, the host clocksource must be TSC
2476 	 * and the vcpus need to have matched TSCs.  When that happens,
2477 	 * perform request to enable masterclock.
2478 	 */
2479 	if (ka->use_master_clock ||
2480 	    (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched))
2481 		kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
2482 
2483 	trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
2484 			    atomic_read(&vcpu->kvm->online_vcpus),
2485 		            ka->use_master_clock, gtod->clock.vclock_mode);
2486 #endif
2487 }
2488 
2489 /*
2490  * Multiply tsc by a fixed point number represented by ratio.
2491  *
2492  * The most significant 64-N bits (mult) of ratio represent the
2493  * integral part of the fixed point number; the remaining N bits
2494  * (frac) represent the fractional part, ie. ratio represents a fixed
2495  * point number (mult + frac * 2^(-N)).
2496  *
2497  * N equals to kvm_caps.tsc_scaling_ratio_frac_bits.
2498  */
2499 static inline u64 __scale_tsc(u64 ratio, u64 tsc)
2500 {
2501 	return mul_u64_u64_shr(tsc, ratio, kvm_caps.tsc_scaling_ratio_frac_bits);
2502 }
2503 
2504 u64 kvm_scale_tsc(u64 tsc, u64 ratio)
2505 {
2506 	u64 _tsc = tsc;
2507 
2508 	if (ratio != kvm_caps.default_tsc_scaling_ratio)
2509 		_tsc = __scale_tsc(ratio, tsc);
2510 
2511 	return _tsc;
2512 }
2513 
2514 static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
2515 {
2516 	u64 tsc;
2517 
2518 	tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
2519 
2520 	return target_tsc - tsc;
2521 }
2522 
2523 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
2524 {
2525 	return vcpu->arch.l1_tsc_offset +
2526 		kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
2527 }
2528 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
2529 
2530 u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier)
2531 {
2532 	u64 nested_offset;
2533 
2534 	if (l2_multiplier == kvm_caps.default_tsc_scaling_ratio)
2535 		nested_offset = l1_offset;
2536 	else
2537 		nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier,
2538 						kvm_caps.tsc_scaling_ratio_frac_bits);
2539 
2540 	nested_offset += l2_offset;
2541 	return nested_offset;
2542 }
2543 EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset);
2544 
2545 u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier)
2546 {
2547 	if (l2_multiplier != kvm_caps.default_tsc_scaling_ratio)
2548 		return mul_u64_u64_shr(l1_multiplier, l2_multiplier,
2549 				       kvm_caps.tsc_scaling_ratio_frac_bits);
2550 
2551 	return l1_multiplier;
2552 }
2553 EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier);
2554 
2555 static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset)
2556 {
2557 	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
2558 				   vcpu->arch.l1_tsc_offset,
2559 				   l1_offset);
2560 
2561 	vcpu->arch.l1_tsc_offset = l1_offset;
2562 
2563 	/*
2564 	 * If we are here because L1 chose not to trap WRMSR to TSC then
2565 	 * according to the spec this should set L1's TSC (as opposed to
2566 	 * setting L1's offset for L2).
2567 	 */
2568 	if (is_guest_mode(vcpu))
2569 		vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
2570 			l1_offset,
2571 			static_call(kvm_x86_get_l2_tsc_offset)(vcpu),
2572 			static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
2573 	else
2574 		vcpu->arch.tsc_offset = l1_offset;
2575 
2576 	static_call(kvm_x86_write_tsc_offset)(vcpu, vcpu->arch.tsc_offset);
2577 }
2578 
2579 static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier)
2580 {
2581 	vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier;
2582 
2583 	/* Userspace is changing the multiplier while L2 is active */
2584 	if (is_guest_mode(vcpu))
2585 		vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier(
2586 			l1_multiplier,
2587 			static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu));
2588 	else
2589 		vcpu->arch.tsc_scaling_ratio = l1_multiplier;
2590 
2591 	if (kvm_caps.has_tsc_control)
2592 		static_call(kvm_x86_write_tsc_multiplier)(
2593 			vcpu, vcpu->arch.tsc_scaling_ratio);
2594 }
2595 
2596 static inline bool kvm_check_tsc_unstable(void)
2597 {
2598 #ifdef CONFIG_X86_64
2599 	/*
2600 	 * TSC is marked unstable when we're running on Hyper-V,
2601 	 * 'TSC page' clocksource is good.
2602 	 */
2603 	if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK)
2604 		return false;
2605 #endif
2606 	return check_tsc_unstable();
2607 }
2608 
2609 /*
2610  * Infers attempts to synchronize the guest's tsc from host writes. Sets the
2611  * offset for the vcpu and tracks the TSC matching generation that the vcpu
2612  * participates in.
2613  */
2614 static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc,
2615 				  u64 ns, bool matched)
2616 {
2617 	struct kvm *kvm = vcpu->kvm;
2618 
2619 	lockdep_assert_held(&kvm->arch.tsc_write_lock);
2620 
2621 	/*
2622 	 * We also track th most recent recorded KHZ, write and time to
2623 	 * allow the matching interval to be extended at each write.
2624 	 */
2625 	kvm->arch.last_tsc_nsec = ns;
2626 	kvm->arch.last_tsc_write = tsc;
2627 	kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
2628 	kvm->arch.last_tsc_offset = offset;
2629 
2630 	vcpu->arch.last_guest_tsc = tsc;
2631 
2632 	kvm_vcpu_write_tsc_offset(vcpu, offset);
2633 
2634 	if (!matched) {
2635 		/*
2636 		 * We split periods of matched TSC writes into generations.
2637 		 * For each generation, we track the original measured
2638 		 * nanosecond time, offset, and write, so if TSCs are in
2639 		 * sync, we can match exact offset, and if not, we can match
2640 		 * exact software computation in compute_guest_tsc()
2641 		 *
2642 		 * These values are tracked in kvm->arch.cur_xxx variables.
2643 		 */
2644 		kvm->arch.cur_tsc_generation++;
2645 		kvm->arch.cur_tsc_nsec = ns;
2646 		kvm->arch.cur_tsc_write = tsc;
2647 		kvm->arch.cur_tsc_offset = offset;
2648 		kvm->arch.nr_vcpus_matched_tsc = 0;
2649 	} else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) {
2650 		kvm->arch.nr_vcpus_matched_tsc++;
2651 	}
2652 
2653 	/* Keep track of which generation this VCPU has synchronized to */
2654 	vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
2655 	vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
2656 	vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
2657 
2658 	kvm_track_tsc_matching(vcpu);
2659 }
2660 
2661 static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
2662 {
2663 	struct kvm *kvm = vcpu->kvm;
2664 	u64 offset, ns, elapsed;
2665 	unsigned long flags;
2666 	bool matched = false;
2667 	bool synchronizing = false;
2668 
2669 	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
2670 	offset = kvm_compute_l1_tsc_offset(vcpu, data);
2671 	ns = get_kvmclock_base_ns();
2672 	elapsed = ns - kvm->arch.last_tsc_nsec;
2673 
2674 	if (vcpu->arch.virtual_tsc_khz) {
2675 		if (data == 0) {
2676 			/*
2677 			 * detection of vcpu initialization -- need to sync
2678 			 * with other vCPUs. This particularly helps to keep
2679 			 * kvm_clock stable after CPU hotplug
2680 			 */
2681 			synchronizing = true;
2682 		} else {
2683 			u64 tsc_exp = kvm->arch.last_tsc_write +
2684 						nsec_to_cycles(vcpu, elapsed);
2685 			u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
2686 			/*
2687 			 * Special case: TSC write with a small delta (1 second)
2688 			 * of virtual cycle time against real time is
2689 			 * interpreted as an attempt to synchronize the CPU.
2690 			 */
2691 			synchronizing = data < tsc_exp + tsc_hz &&
2692 					data + tsc_hz > tsc_exp;
2693 		}
2694 	}
2695 
2696 	/*
2697 	 * For a reliable TSC, we can match TSC offsets, and for an unstable
2698 	 * TSC, we add elapsed time in this computation.  We could let the
2699 	 * compensation code attempt to catch up if we fall behind, but
2700 	 * it's better to try to match offsets from the beginning.
2701          */
2702 	if (synchronizing &&
2703 	    vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
2704 		if (!kvm_check_tsc_unstable()) {
2705 			offset = kvm->arch.cur_tsc_offset;
2706 		} else {
2707 			u64 delta = nsec_to_cycles(vcpu, elapsed);
2708 			data += delta;
2709 			offset = kvm_compute_l1_tsc_offset(vcpu, data);
2710 		}
2711 		matched = true;
2712 	}
2713 
2714 	__kvm_synchronize_tsc(vcpu, offset, data, ns, matched);
2715 	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
2716 }
2717 
2718 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
2719 					   s64 adjustment)
2720 {
2721 	u64 tsc_offset = vcpu->arch.l1_tsc_offset;
2722 	kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
2723 }
2724 
2725 static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
2726 {
2727 	if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio)
2728 		WARN_ON(adjustment < 0);
2729 	adjustment = kvm_scale_tsc((u64) adjustment,
2730 				   vcpu->arch.l1_tsc_scaling_ratio);
2731 	adjust_tsc_offset_guest(vcpu, adjustment);
2732 }
2733 
2734 #ifdef CONFIG_X86_64
2735 
2736 static u64 read_tsc(void)
2737 {
2738 	u64 ret = (u64)rdtsc_ordered();
2739 	u64 last = pvclock_gtod_data.clock.cycle_last;
2740 
2741 	if (likely(ret >= last))
2742 		return ret;
2743 
2744 	/*
2745 	 * GCC likes to generate cmov here, but this branch is extremely
2746 	 * predictable (it's just a function of time and the likely is
2747 	 * very likely) and there's a data dependence, so force GCC
2748 	 * to generate a branch instead.  I don't barrier() because
2749 	 * we don't actually need a barrier, and if this function
2750 	 * ever gets inlined it will generate worse code.
2751 	 */
2752 	asm volatile ("");
2753 	return last;
2754 }
2755 
2756 static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
2757 			  int *mode)
2758 {
2759 	long v;
2760 	u64 tsc_pg_val;
2761 
2762 	switch (clock->vclock_mode) {
2763 	case VDSO_CLOCKMODE_HVCLOCK:
2764 		tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
2765 						  tsc_timestamp);
2766 		if (tsc_pg_val != U64_MAX) {
2767 			/* TSC page valid */
2768 			*mode = VDSO_CLOCKMODE_HVCLOCK;
2769 			v = (tsc_pg_val - clock->cycle_last) &
2770 				clock->mask;
2771 		} else {
2772 			/* TSC page invalid */
2773 			*mode = VDSO_CLOCKMODE_NONE;
2774 		}
2775 		break;
2776 	case VDSO_CLOCKMODE_TSC:
2777 		*mode = VDSO_CLOCKMODE_TSC;
2778 		*tsc_timestamp = read_tsc();
2779 		v = (*tsc_timestamp - clock->cycle_last) &
2780 			clock->mask;
2781 		break;
2782 	default:
2783 		*mode = VDSO_CLOCKMODE_NONE;
2784 	}
2785 
2786 	if (*mode == VDSO_CLOCKMODE_NONE)
2787 		*tsc_timestamp = v = 0;
2788 
2789 	return v * clock->mult;
2790 }
2791 
2792 static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
2793 {
2794 	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
2795 	unsigned long seq;
2796 	int mode;
2797 	u64 ns;
2798 
2799 	do {
2800 		seq = read_seqcount_begin(&gtod->seq);
2801 		ns = gtod->raw_clock.base_cycles;
2802 		ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
2803 		ns >>= gtod->raw_clock.shift;
2804 		ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
2805 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
2806 	*t = ns;
2807 
2808 	return mode;
2809 }
2810 
2811 static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
2812 {
2813 	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
2814 	unsigned long seq;
2815 	int mode;
2816 	u64 ns;
2817 
2818 	do {
2819 		seq = read_seqcount_begin(&gtod->seq);
2820 		ts->tv_sec = gtod->wall_time_sec;
2821 		ns = gtod->clock.base_cycles;
2822 		ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
2823 		ns >>= gtod->clock.shift;
2824 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
2825 
2826 	ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
2827 	ts->tv_nsec = ns;
2828 
2829 	return mode;
2830 }
2831 
2832 /* returns true if host is using TSC based clocksource */
2833 static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
2834 {
2835 	/* checked again under seqlock below */
2836 	if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
2837 		return false;
2838 
2839 	return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
2840 						      tsc_timestamp));
2841 }
2842 
2843 /* returns true if host is using TSC based clocksource */
2844 static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
2845 					   u64 *tsc_timestamp)
2846 {
2847 	/* checked again under seqlock below */
2848 	if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
2849 		return false;
2850 
2851 	return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
2852 }
2853 #endif
2854 
2855 /*
2856  *
2857  * Assuming a stable TSC across physical CPUS, and a stable TSC
2858  * across virtual CPUs, the following condition is possible.
2859  * Each numbered line represents an event visible to both
2860  * CPUs at the next numbered event.
2861  *
2862  * "timespecX" represents host monotonic time. "tscX" represents
2863  * RDTSC value.
2864  *
2865  * 		VCPU0 on CPU0		|	VCPU1 on CPU1
2866  *
2867  * 1.  read timespec0,tsc0
2868  * 2.					| timespec1 = timespec0 + N
2869  * 					| tsc1 = tsc0 + M
2870  * 3. transition to guest		| transition to guest
2871  * 4. ret0 = timespec0 + (rdtsc - tsc0) |
2872  * 5.				        | ret1 = timespec1 + (rdtsc - tsc1)
2873  * 				        | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
2874  *
2875  * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
2876  *
2877  * 	- ret0 < ret1
2878  *	- timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
2879  *		...
2880  *	- 0 < N - M => M < N
2881  *
2882  * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
2883  * always the case (the difference between two distinct xtime instances
2884  * might be smaller then the difference between corresponding TSC reads,
2885  * when updating guest vcpus pvclock areas).
2886  *
2887  * To avoid that problem, do not allow visibility of distinct
2888  * system_timestamp/tsc_timestamp values simultaneously: use a master
2889  * copy of host monotonic time values. Update that master copy
2890  * in lockstep.
2891  *
2892  * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
2893  *
2894  */
2895 
2896 static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
2897 {
2898 #ifdef CONFIG_X86_64
2899 	struct kvm_arch *ka = &kvm->arch;
2900 	int vclock_mode;
2901 	bool host_tsc_clocksource, vcpus_matched;
2902 
2903 	lockdep_assert_held(&kvm->arch.tsc_write_lock);
2904 	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
2905 			atomic_read(&kvm->online_vcpus));
2906 
2907 	/*
2908 	 * If the host uses TSC clock, then passthrough TSC as stable
2909 	 * to the guest.
2910 	 */
2911 	host_tsc_clocksource = kvm_get_time_and_clockread(
2912 					&ka->master_kernel_ns,
2913 					&ka->master_cycle_now);
2914 
2915 	ka->use_master_clock = host_tsc_clocksource && vcpus_matched
2916 				&& !ka->backwards_tsc_observed
2917 				&& !ka->boot_vcpu_runs_old_kvmclock;
2918 
2919 	if (ka->use_master_clock)
2920 		atomic_set(&kvm_guest_has_master_clock, 1);
2921 
2922 	vclock_mode = pvclock_gtod_data.clock.vclock_mode;
2923 	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
2924 					vcpus_matched);
2925 #endif
2926 }
2927 
2928 static void kvm_make_mclock_inprogress_request(struct kvm *kvm)
2929 {
2930 	kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
2931 }
2932 
2933 static void __kvm_start_pvclock_update(struct kvm *kvm)
2934 {
2935 	raw_spin_lock_irq(&kvm->arch.tsc_write_lock);
2936 	write_seqcount_begin(&kvm->arch.pvclock_sc);
2937 }
2938 
2939 static void kvm_start_pvclock_update(struct kvm *kvm)
2940 {
2941 	kvm_make_mclock_inprogress_request(kvm);
2942 
2943 	/* no guest entries from this point */
2944 	__kvm_start_pvclock_update(kvm);
2945 }
2946 
2947 static void kvm_end_pvclock_update(struct kvm *kvm)
2948 {
2949 	struct kvm_arch *ka = &kvm->arch;
2950 	struct kvm_vcpu *vcpu;
2951 	unsigned long i;
2952 
2953 	write_seqcount_end(&ka->pvclock_sc);
2954 	raw_spin_unlock_irq(&ka->tsc_write_lock);
2955 	kvm_for_each_vcpu(i, vcpu, kvm)
2956 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2957 
2958 	/* guest entries allowed */
2959 	kvm_for_each_vcpu(i, vcpu, kvm)
2960 		kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
2961 }
2962 
2963 static void kvm_update_masterclock(struct kvm *kvm)
2964 {
2965 	kvm_hv_request_tsc_page_update(kvm);
2966 	kvm_start_pvclock_update(kvm);
2967 	pvclock_update_vm_gtod_copy(kvm);
2968 	kvm_end_pvclock_update(kvm);
2969 }
2970 
2971 /*
2972  * Use the kernel's tsc_khz directly if the TSC is constant, otherwise use KVM's
2973  * per-CPU value (which may be zero if a CPU is going offline).  Note, tsc_khz
2974  * can change during boot even if the TSC is constant, as it's possible for KVM
2975  * to be loaded before TSC calibration completes.  Ideally, KVM would get a
2976  * notification when calibration completes, but practically speaking calibration
2977  * will complete before userspace is alive enough to create VMs.
2978  */
2979 static unsigned long get_cpu_tsc_khz(void)
2980 {
2981 	if (static_cpu_has(X86_FEATURE_CONSTANT_TSC))
2982 		return tsc_khz;
2983 	else
2984 		return __this_cpu_read(cpu_tsc_khz);
2985 }
2986 
2987 /* Called within read_seqcount_begin/retry for kvm->pvclock_sc.  */
2988 static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
2989 {
2990 	struct kvm_arch *ka = &kvm->arch;
2991 	struct pvclock_vcpu_time_info hv_clock;
2992 
2993 	/* both __this_cpu_read() and rdtsc() should be on the same cpu */
2994 	get_cpu();
2995 
2996 	data->flags = 0;
2997 	if (ka->use_master_clock &&
2998 	    (static_cpu_has(X86_FEATURE_CONSTANT_TSC) || __this_cpu_read(cpu_tsc_khz))) {
2999 #ifdef CONFIG_X86_64
3000 		struct timespec64 ts;
3001 
3002 		if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
3003 			data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
3004 			data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
3005 		} else
3006 #endif
3007 		data->host_tsc = rdtsc();
3008 
3009 		data->flags |= KVM_CLOCK_TSC_STABLE;
3010 		hv_clock.tsc_timestamp = ka->master_cycle_now;
3011 		hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
3012 		kvm_get_time_scale(NSEC_PER_SEC, get_cpu_tsc_khz() * 1000LL,
3013 				   &hv_clock.tsc_shift,
3014 				   &hv_clock.tsc_to_system_mul);
3015 		data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
3016 	} else {
3017 		data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
3018 	}
3019 
3020 	put_cpu();
3021 }
3022 
3023 static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
3024 {
3025 	struct kvm_arch *ka = &kvm->arch;
3026 	unsigned seq;
3027 
3028 	do {
3029 		seq = read_seqcount_begin(&ka->pvclock_sc);
3030 		__get_kvmclock(kvm, data);
3031 	} while (read_seqcount_retry(&ka->pvclock_sc, seq));
3032 }
3033 
3034 u64 get_kvmclock_ns(struct kvm *kvm)
3035 {
3036 	struct kvm_clock_data data;
3037 
3038 	get_kvmclock(kvm, &data);
3039 	return data.clock;
3040 }
3041 
3042 static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
3043 				    struct gfn_to_pfn_cache *gpc,
3044 				    unsigned int offset)
3045 {
3046 	struct kvm_vcpu_arch *vcpu = &v->arch;
3047 	struct pvclock_vcpu_time_info *guest_hv_clock;
3048 	unsigned long flags;
3049 
3050 	read_lock_irqsave(&gpc->lock, flags);
3051 	while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) {
3052 		read_unlock_irqrestore(&gpc->lock, flags);
3053 
3054 		if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock)))
3055 			return;
3056 
3057 		read_lock_irqsave(&gpc->lock, flags);
3058 	}
3059 
3060 	guest_hv_clock = (void *)(gpc->khva + offset);
3061 
3062 	/*
3063 	 * This VCPU is paused, but it's legal for a guest to read another
3064 	 * VCPU's kvmclock, so we really have to follow the specification where
3065 	 * it says that version is odd if data is being modified, and even after
3066 	 * it is consistent.
3067 	 */
3068 
3069 	guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1;
3070 	smp_wmb();
3071 
3072 	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
3073 	vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
3074 
3075 	if (vcpu->pvclock_set_guest_stopped_request) {
3076 		vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
3077 		vcpu->pvclock_set_guest_stopped_request = false;
3078 	}
3079 
3080 	memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock));
3081 	smp_wmb();
3082 
3083 	guest_hv_clock->version = ++vcpu->hv_clock.version;
3084 
3085 	mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
3086 	read_unlock_irqrestore(&gpc->lock, flags);
3087 
3088 	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
3089 }
3090 
3091 static int kvm_guest_time_update(struct kvm_vcpu *v)
3092 {
3093 	unsigned long flags, tgt_tsc_khz;
3094 	unsigned seq;
3095 	struct kvm_vcpu_arch *vcpu = &v->arch;
3096 	struct kvm_arch *ka = &v->kvm->arch;
3097 	s64 kernel_ns;
3098 	u64 tsc_timestamp, host_tsc;
3099 	u8 pvclock_flags;
3100 	bool use_master_clock;
3101 
3102 	kernel_ns = 0;
3103 	host_tsc = 0;
3104 
3105 	/*
3106 	 * If the host uses TSC clock, then passthrough TSC as stable
3107 	 * to the guest.
3108 	 */
3109 	do {
3110 		seq = read_seqcount_begin(&ka->pvclock_sc);
3111 		use_master_clock = ka->use_master_clock;
3112 		if (use_master_clock) {
3113 			host_tsc = ka->master_cycle_now;
3114 			kernel_ns = ka->master_kernel_ns;
3115 		}
3116 	} while (read_seqcount_retry(&ka->pvclock_sc, seq));
3117 
3118 	/* Keep irq disabled to prevent changes to the clock */
3119 	local_irq_save(flags);
3120 	tgt_tsc_khz = get_cpu_tsc_khz();
3121 	if (unlikely(tgt_tsc_khz == 0)) {
3122 		local_irq_restore(flags);
3123 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
3124 		return 1;
3125 	}
3126 	if (!use_master_clock) {
3127 		host_tsc = rdtsc();
3128 		kernel_ns = get_kvmclock_base_ns();
3129 	}
3130 
3131 	tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
3132 
3133 	/*
3134 	 * We may have to catch up the TSC to match elapsed wall clock
3135 	 * time for two reasons, even if kvmclock is used.
3136 	 *   1) CPU could have been running below the maximum TSC rate
3137 	 *   2) Broken TSC compensation resets the base at each VCPU
3138 	 *      entry to avoid unknown leaps of TSC even when running
3139 	 *      again on the same CPU.  This may cause apparent elapsed
3140 	 *      time to disappear, and the guest to stand still or run
3141 	 *	very slowly.
3142 	 */
3143 	if (vcpu->tsc_catchup) {
3144 		u64 tsc = compute_guest_tsc(v, kernel_ns);
3145 		if (tsc > tsc_timestamp) {
3146 			adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
3147 			tsc_timestamp = tsc;
3148 		}
3149 	}
3150 
3151 	local_irq_restore(flags);
3152 
3153 	/* With all the info we got, fill in the values */
3154 
3155 	if (kvm_caps.has_tsc_control)
3156 		tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz,
3157 					    v->arch.l1_tsc_scaling_ratio);
3158 
3159 	if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
3160 		kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
3161 				   &vcpu->hv_clock.tsc_shift,
3162 				   &vcpu->hv_clock.tsc_to_system_mul);
3163 		vcpu->hw_tsc_khz = tgt_tsc_khz;
3164 	}
3165 
3166 	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
3167 	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
3168 	vcpu->last_guest_tsc = tsc_timestamp;
3169 
3170 	/* If the host uses TSC clocksource, then it is stable */
3171 	pvclock_flags = 0;
3172 	if (use_master_clock)
3173 		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
3174 
3175 	vcpu->hv_clock.flags = pvclock_flags;
3176 
3177 	if (vcpu->pv_time.active)
3178 		kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
3179 	if (vcpu->xen.vcpu_info_cache.active)
3180 		kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
3181 					offsetof(struct compat_vcpu_info, time));
3182 	if (vcpu->xen.vcpu_time_info_cache.active)
3183 		kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0);
3184 	kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
3185 	return 0;
3186 }
3187 
3188 /*
3189  * kvmclock updates which are isolated to a given vcpu, such as
3190  * vcpu->cpu migration, should not allow system_timestamp from
3191  * the rest of the vcpus to remain static. Otherwise ntp frequency
3192  * correction applies to one vcpu's system_timestamp but not
3193  * the others.
3194  *
3195  * So in those cases, request a kvmclock update for all vcpus.
3196  * We need to rate-limit these requests though, as they can
3197  * considerably slow guests that have a large number of vcpus.
3198  * The time for a remote vcpu to update its kvmclock is bound
3199  * by the delay we use to rate-limit the updates.
3200  */
3201 
3202 #define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
3203 
3204 static void kvmclock_update_fn(struct work_struct *work)
3205 {
3206 	unsigned long i;
3207 	struct delayed_work *dwork = to_delayed_work(work);
3208 	struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
3209 					   kvmclock_update_work);
3210 	struct kvm *kvm = container_of(ka, struct kvm, arch);
3211 	struct kvm_vcpu *vcpu;
3212 
3213 	kvm_for_each_vcpu(i, vcpu, kvm) {
3214 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
3215 		kvm_vcpu_kick(vcpu);
3216 	}
3217 }
3218 
3219 static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
3220 {
3221 	struct kvm *kvm = v->kvm;
3222 
3223 	kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
3224 	schedule_delayed_work(&kvm->arch.kvmclock_update_work,
3225 					KVMCLOCK_UPDATE_DELAY);
3226 }
3227 
3228 #define KVMCLOCK_SYNC_PERIOD (300 * HZ)
3229 
3230 static void kvmclock_sync_fn(struct work_struct *work)
3231 {
3232 	struct delayed_work *dwork = to_delayed_work(work);
3233 	struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
3234 					   kvmclock_sync_work);
3235 	struct kvm *kvm = container_of(ka, struct kvm, arch);
3236 
3237 	if (!kvmclock_periodic_sync)
3238 		return;
3239 
3240 	schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
3241 	schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
3242 					KVMCLOCK_SYNC_PERIOD);
3243 }
3244 
3245 /* These helpers are safe iff @msr is known to be an MCx bank MSR. */
3246 static bool is_mci_control_msr(u32 msr)
3247 {
3248 	return (msr & 3) == 0;
3249 }
3250 static bool is_mci_status_msr(u32 msr)
3251 {
3252 	return (msr & 3) == 1;
3253 }
3254 
3255 /*
3256  * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
3257  */
3258 static bool can_set_mci_status(struct kvm_vcpu *vcpu)
3259 {
3260 	/* McStatusWrEn enabled? */
3261 	if (guest_cpuid_is_amd_or_hygon(vcpu))
3262 		return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
3263 
3264 	return false;
3265 }
3266 
3267 static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3268 {
3269 	u64 mcg_cap = vcpu->arch.mcg_cap;
3270 	unsigned bank_num = mcg_cap & 0xff;
3271 	u32 msr = msr_info->index;
3272 	u64 data = msr_info->data;
3273 	u32 offset, last_msr;
3274 
3275 	switch (msr) {
3276 	case MSR_IA32_MCG_STATUS:
3277 		vcpu->arch.mcg_status = data;
3278 		break;
3279 	case MSR_IA32_MCG_CTL:
3280 		if (!(mcg_cap & MCG_CTL_P) &&
3281 		    (data || !msr_info->host_initiated))
3282 			return 1;
3283 		if (data != 0 && data != ~(u64)0)
3284 			return 1;
3285 		vcpu->arch.mcg_ctl = data;
3286 		break;
3287 	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
3288 		last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
3289 		if (msr > last_msr)
3290 			return 1;
3291 
3292 		if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated))
3293 			return 1;
3294 		/* An attempt to write a 1 to a reserved bit raises #GP */
3295 		if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK))
3296 			return 1;
3297 		offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
3298 					    last_msr + 1 - MSR_IA32_MC0_CTL2);
3299 		vcpu->arch.mci_ctl2_banks[offset] = data;
3300 		break;
3301 	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
3302 		last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
3303 		if (msr > last_msr)
3304 			return 1;
3305 
3306 		/*
3307 		 * Only 0 or all 1s can be written to IA32_MCi_CTL, all other
3308 		 * values are architecturally undefined.  But, some Linux
3309 		 * kernels clear bit 10 in bank 4 to workaround a BIOS/GART TLB
3310 		 * issue on AMD K8s, allow bit 10 to be clear when setting all
3311 		 * other bits in order to avoid an uncaught #GP in the guest.
3312 		 *
3313 		 * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable,
3314 		 * single-bit ECC data errors.
3315 		 */
3316 		if (is_mci_control_msr(msr) &&
3317 		    data != 0 && (data | (1 << 10) | 1) != ~(u64)0)
3318 			return 1;
3319 
3320 		/*
3321 		 * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR.
3322 		 * AMD-based CPUs allow non-zero values, but if and only if
3323 		 * HWCR[McStatusWrEn] is set.
3324 		 */
3325 		if (!msr_info->host_initiated && is_mci_status_msr(msr) &&
3326 		    data != 0 && !can_set_mci_status(vcpu))
3327 			return 1;
3328 
3329 		offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
3330 					    last_msr + 1 - MSR_IA32_MC0_CTL);
3331 		vcpu->arch.mce_banks[offset] = data;
3332 		break;
3333 	default:
3334 		return 1;
3335 	}
3336 	return 0;
3337 }
3338 
3339 static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
3340 {
3341 	u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
3342 
3343 	return (vcpu->arch.apf.msr_en_val & mask) == mask;
3344 }
3345 
3346 static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
3347 {
3348 	gpa_t gpa = data & ~0x3f;
3349 
3350 	/* Bits 4:5 are reserved, Should be zero */
3351 	if (data & 0x30)
3352 		return 1;
3353 
3354 	if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) &&
3355 	    (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT))
3356 		return 1;
3357 
3358 	if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) &&
3359 	    (data & KVM_ASYNC_PF_DELIVERY_AS_INT))
3360 		return 1;
3361 
3362 	if (!lapic_in_kernel(vcpu))
3363 		return data ? 1 : 0;
3364 
3365 	vcpu->arch.apf.msr_en_val = data;
3366 
3367 	if (!kvm_pv_async_pf_enabled(vcpu)) {
3368 		kvm_clear_async_pf_completion_queue(vcpu);
3369 		kvm_async_pf_hash_reset(vcpu);
3370 		return 0;
3371 	}
3372 
3373 	if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
3374 					sizeof(u64)))
3375 		return 1;
3376 
3377 	vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
3378 	vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
3379 
3380 	kvm_async_pf_wakeup_all(vcpu);
3381 
3382 	return 0;
3383 }
3384 
3385 static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
3386 {
3387 	/* Bits 8-63 are reserved */
3388 	if (data >> 8)
3389 		return 1;
3390 
3391 	if (!lapic_in_kernel(vcpu))
3392 		return 1;
3393 
3394 	vcpu->arch.apf.msr_int_val = data;
3395 
3396 	vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK;
3397 
3398 	return 0;
3399 }
3400 
3401 static void kvmclock_reset(struct kvm_vcpu *vcpu)
3402 {
3403 	kvm_gpc_deactivate(&vcpu->arch.pv_time);
3404 	vcpu->arch.time = 0;
3405 }
3406 
3407 static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
3408 {
3409 	++vcpu->stat.tlb_flush;
3410 	static_call(kvm_x86_flush_tlb_all)(vcpu);
3411 
3412 	/* Flushing all ASIDs flushes the current ASID... */
3413 	kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
3414 }
3415 
3416 static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
3417 {
3418 	++vcpu->stat.tlb_flush;
3419 
3420 	if (!tdp_enabled) {
3421 		/*
3422 		 * A TLB flush on behalf of the guest is equivalent to
3423 		 * INVPCID(all), toggling CR4.PGE, etc., which requires
3424 		 * a forced sync of the shadow page tables.  Ensure all the
3425 		 * roots are synced and the guest TLB in hardware is clean.
3426 		 */
3427 		kvm_mmu_sync_roots(vcpu);
3428 		kvm_mmu_sync_prev_roots(vcpu);
3429 	}
3430 
3431 	static_call(kvm_x86_flush_tlb_guest)(vcpu);
3432 
3433 	/*
3434 	 * Flushing all "guest" TLB is always a superset of Hyper-V's fine
3435 	 * grained flushing.
3436 	 */
3437 	kvm_hv_vcpu_purge_flush_tlb(vcpu);
3438 }
3439 
3440 
3441 static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
3442 {
3443 	++vcpu->stat.tlb_flush;
3444 	static_call(kvm_x86_flush_tlb_current)(vcpu);
3445 }
3446 
3447 /*
3448  * Service "local" TLB flush requests, which are specific to the current MMU
3449  * context.  In addition to the generic event handling in vcpu_enter_guest(),
3450  * TLB flushes that are targeted at an MMU context also need to be serviced
3451  * prior before nested VM-Enter/VM-Exit.
3452  */
3453 void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu)
3454 {
3455 	if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
3456 		kvm_vcpu_flush_tlb_current(vcpu);
3457 
3458 	if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
3459 		kvm_vcpu_flush_tlb_guest(vcpu);
3460 }
3461 EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests);
3462 
3463 static void record_steal_time(struct kvm_vcpu *vcpu)
3464 {
3465 	struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
3466 	struct kvm_steal_time __user *st;
3467 	struct kvm_memslots *slots;
3468 	gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
3469 	u64 steal;
3470 	u32 version;
3471 
3472 	if (kvm_xen_msr_enabled(vcpu->kvm)) {
3473 		kvm_xen_runstate_set_running(vcpu);
3474 		return;
3475 	}
3476 
3477 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
3478 		return;
3479 
3480 	if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm))
3481 		return;
3482 
3483 	slots = kvm_memslots(vcpu->kvm);
3484 
3485 	if (unlikely(slots->generation != ghc->generation ||
3486 		     gpa != ghc->gpa ||
3487 		     kvm_is_error_hva(ghc->hva) || !ghc->memslot)) {
3488 		/* We rely on the fact that it fits in a single page. */
3489 		BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS);
3490 
3491 		if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) ||
3492 		    kvm_is_error_hva(ghc->hva) || !ghc->memslot)
3493 			return;
3494 	}
3495 
3496 	st = (struct kvm_steal_time __user *)ghc->hva;
3497 	/*
3498 	 * Doing a TLB flush here, on the guest's behalf, can avoid
3499 	 * expensive IPIs.
3500 	 */
3501 	if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
3502 		u8 st_preempted = 0;
3503 		int err = -EFAULT;
3504 
3505 		if (!user_access_begin(st, sizeof(*st)))
3506 			return;
3507 
3508 		asm volatile("1: xchgb %0, %2\n"
3509 			     "xor %1, %1\n"
3510 			     "2:\n"
3511 			     _ASM_EXTABLE_UA(1b, 2b)
3512 			     : "+q" (st_preempted),
3513 			       "+&r" (err),
3514 			       "+m" (st->preempted));
3515 		if (err)
3516 			goto out;
3517 
3518 		user_access_end();
3519 
3520 		vcpu->arch.st.preempted = 0;
3521 
3522 		trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
3523 				       st_preempted & KVM_VCPU_FLUSH_TLB);
3524 		if (st_preempted & KVM_VCPU_FLUSH_TLB)
3525 			kvm_vcpu_flush_tlb_guest(vcpu);
3526 
3527 		if (!user_access_begin(st, sizeof(*st)))
3528 			goto dirty;
3529 	} else {
3530 		if (!user_access_begin(st, sizeof(*st)))
3531 			return;
3532 
3533 		unsafe_put_user(0, &st->preempted, out);
3534 		vcpu->arch.st.preempted = 0;
3535 	}
3536 
3537 	unsafe_get_user(version, &st->version, out);
3538 	if (version & 1)
3539 		version += 1;  /* first time write, random junk */
3540 
3541 	version += 1;
3542 	unsafe_put_user(version, &st->version, out);
3543 
3544 	smp_wmb();
3545 
3546 	unsafe_get_user(steal, &st->steal, out);
3547 	steal += current->sched_info.run_delay -
3548 		vcpu->arch.st.last_steal;
3549 	vcpu->arch.st.last_steal = current->sched_info.run_delay;
3550 	unsafe_put_user(steal, &st->steal, out);
3551 
3552 	version += 1;
3553 	unsafe_put_user(version, &st->version, out);
3554 
3555  out:
3556 	user_access_end();
3557  dirty:
3558 	mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
3559 }
3560 
3561 int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3562 {
3563 	bool pr = false;
3564 	u32 msr = msr_info->index;
3565 	u64 data = msr_info->data;
3566 
3567 	if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr)
3568 		return kvm_xen_write_hypercall_page(vcpu, data);
3569 
3570 	switch (msr) {
3571 	case MSR_AMD64_NB_CFG:
3572 	case MSR_IA32_UCODE_WRITE:
3573 	case MSR_VM_HSAVE_PA:
3574 	case MSR_AMD64_PATCH_LOADER:
3575 	case MSR_AMD64_BU_CFG2:
3576 	case MSR_AMD64_DC_CFG:
3577 	case MSR_F15H_EX_CFG:
3578 		break;
3579 
3580 	case MSR_IA32_UCODE_REV:
3581 		if (msr_info->host_initiated)
3582 			vcpu->arch.microcode_version = data;
3583 		break;
3584 	case MSR_IA32_ARCH_CAPABILITIES:
3585 		if (!msr_info->host_initiated)
3586 			return 1;
3587 		vcpu->arch.arch_capabilities = data;
3588 		break;
3589 	case MSR_IA32_PERF_CAPABILITIES:
3590 		if (!msr_info->host_initiated)
3591 			return 1;
3592 		if (data & ~kvm_caps.supported_perf_cap)
3593 			return 1;
3594 
3595 		vcpu->arch.perf_capabilities = data;
3596 		kvm_pmu_refresh(vcpu);
3597 		return 0;
3598 	case MSR_EFER:
3599 		return set_efer(vcpu, msr_info);
3600 	case MSR_K7_HWCR:
3601 		data &= ~(u64)0x40;	/* ignore flush filter disable */
3602 		data &= ~(u64)0x100;	/* ignore ignne emulation enable */
3603 		data &= ~(u64)0x8;	/* ignore TLB cache disable */
3604 
3605 		/* Handle McStatusWrEn */
3606 		if (data == BIT_ULL(18)) {
3607 			vcpu->arch.msr_hwcr = data;
3608 		} else if (data != 0) {
3609 			vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
3610 				    data);
3611 			return 1;
3612 		}
3613 		break;
3614 	case MSR_FAM10H_MMIO_CONF_BASE:
3615 		if (data != 0) {
3616 			vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
3617 				    "0x%llx\n", data);
3618 			return 1;
3619 		}
3620 		break;
3621 	case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
3622 	case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
3623 		return kvm_mtrr_set_msr(vcpu, msr, data);
3624 	case MSR_IA32_APICBASE:
3625 		return kvm_set_apic_base(vcpu, msr_info);
3626 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
3627 		return kvm_x2apic_msr_write(vcpu, msr, data);
3628 	case MSR_IA32_TSC_DEADLINE:
3629 		kvm_set_lapic_tscdeadline_msr(vcpu, data);
3630 		break;
3631 	case MSR_IA32_TSC_ADJUST:
3632 		if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
3633 			if (!msr_info->host_initiated) {
3634 				s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
3635 				adjust_tsc_offset_guest(vcpu, adj);
3636 				/* Before back to guest, tsc_timestamp must be adjusted
3637 				 * as well, otherwise guest's percpu pvclock time could jump.
3638 				 */
3639 				kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
3640 			}
3641 			vcpu->arch.ia32_tsc_adjust_msr = data;
3642 		}
3643 		break;
3644 	case MSR_IA32_MISC_ENABLE: {
3645 		u64 old_val = vcpu->arch.ia32_misc_enable_msr;
3646 
3647 		if (!msr_info->host_initiated) {
3648 			/* RO bits */
3649 			if ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK)
3650 				return 1;
3651 
3652 			/* R bits, i.e. writes are ignored, but don't fault. */
3653 			data = data & ~MSR_IA32_MISC_ENABLE_EMON;
3654 			data |= old_val & MSR_IA32_MISC_ENABLE_EMON;
3655 		}
3656 
3657 		if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
3658 		    ((old_val ^ data)  & MSR_IA32_MISC_ENABLE_MWAIT)) {
3659 			if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
3660 				return 1;
3661 			vcpu->arch.ia32_misc_enable_msr = data;
3662 			kvm_update_cpuid_runtime(vcpu);
3663 		} else {
3664 			vcpu->arch.ia32_misc_enable_msr = data;
3665 		}
3666 		break;
3667 	}
3668 	case MSR_IA32_SMBASE:
3669 		if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
3670 			return 1;
3671 		vcpu->arch.smbase = data;
3672 		break;
3673 	case MSR_IA32_POWER_CTL:
3674 		vcpu->arch.msr_ia32_power_ctl = data;
3675 		break;
3676 	case MSR_IA32_TSC:
3677 		if (msr_info->host_initiated) {
3678 			kvm_synchronize_tsc(vcpu, data);
3679 		} else {
3680 			u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
3681 			adjust_tsc_offset_guest(vcpu, adj);
3682 			vcpu->arch.ia32_tsc_adjust_msr += adj;
3683 		}
3684 		break;
3685 	case MSR_IA32_XSS:
3686 		if (!msr_info->host_initiated &&
3687 		    !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
3688 			return 1;
3689 		/*
3690 		 * KVM supports exposing PT to the guest, but does not support
3691 		 * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
3692 		 * XSAVES/XRSTORS to save/restore PT MSRs.
3693 		 */
3694 		if (data & ~kvm_caps.supported_xss)
3695 			return 1;
3696 		vcpu->arch.ia32_xss = data;
3697 		kvm_update_cpuid_runtime(vcpu);
3698 		break;
3699 	case MSR_SMI_COUNT:
3700 		if (!msr_info->host_initiated)
3701 			return 1;
3702 		vcpu->arch.smi_count = data;
3703 		break;
3704 	case MSR_KVM_WALL_CLOCK_NEW:
3705 		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3706 			return 1;
3707 
3708 		vcpu->kvm->arch.wall_clock = data;
3709 		kvm_write_wall_clock(vcpu->kvm, data, 0);
3710 		break;
3711 	case MSR_KVM_WALL_CLOCK:
3712 		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3713 			return 1;
3714 
3715 		vcpu->kvm->arch.wall_clock = data;
3716 		kvm_write_wall_clock(vcpu->kvm, data, 0);
3717 		break;
3718 	case MSR_KVM_SYSTEM_TIME_NEW:
3719 		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
3720 			return 1;
3721 
3722 		kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
3723 		break;
3724 	case MSR_KVM_SYSTEM_TIME:
3725 		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
3726 			return 1;
3727 
3728 		kvm_write_system_time(vcpu, data, true,  msr_info->host_initiated);
3729 		break;
3730 	case MSR_KVM_ASYNC_PF_EN:
3731 		if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
3732 			return 1;
3733 
3734 		if (kvm_pv_enable_async_pf(vcpu, data))
3735 			return 1;
3736 		break;
3737 	case MSR_KVM_ASYNC_PF_INT:
3738 		if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3739 			return 1;
3740 
3741 		if (kvm_pv_enable_async_pf_int(vcpu, data))
3742 			return 1;
3743 		break;
3744 	case MSR_KVM_ASYNC_PF_ACK:
3745 		if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
3746 			return 1;
3747 		if (data & 0x1) {
3748 			vcpu->arch.apf.pageready_pending = false;
3749 			kvm_check_async_pf_completion(vcpu);
3750 		}
3751 		break;
3752 	case MSR_KVM_STEAL_TIME:
3753 		if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
3754 			return 1;
3755 
3756 		if (unlikely(!sched_info_on()))
3757 			return 1;
3758 
3759 		if (data & KVM_STEAL_RESERVED_MASK)
3760 			return 1;
3761 
3762 		vcpu->arch.st.msr_val = data;
3763 
3764 		if (!(data & KVM_MSR_ENABLED))
3765 			break;
3766 
3767 		kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
3768 
3769 		break;
3770 	case MSR_KVM_PV_EOI_EN:
3771 		if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
3772 			return 1;
3773 
3774 		if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8)))
3775 			return 1;
3776 		break;
3777 
3778 	case MSR_KVM_POLL_CONTROL:
3779 		if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
3780 			return 1;
3781 
3782 		/* only enable bit supported */
3783 		if (data & (-1ULL << 1))
3784 			return 1;
3785 
3786 		vcpu->arch.msr_kvm_poll_control = data;
3787 		break;
3788 
3789 	case MSR_IA32_MCG_CTL:
3790 	case MSR_IA32_MCG_STATUS:
3791 	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
3792 	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
3793 		return set_msr_mce(vcpu, msr_info);
3794 
3795 	case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
3796 	case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
3797 		pr = true;
3798 		fallthrough;
3799 	case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
3800 	case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
3801 		if (kvm_pmu_is_valid_msr(vcpu, msr))
3802 			return kvm_pmu_set_msr(vcpu, msr_info);
3803 
3804 		if (pr || data != 0)
3805 			vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
3806 				    "0x%x data 0x%llx\n", msr, data);
3807 		break;
3808 	case MSR_K7_CLK_CTL:
3809 		/*
3810 		 * Ignore all writes to this no longer documented MSR.
3811 		 * Writes are only relevant for old K7 processors,
3812 		 * all pre-dating SVM, but a recommended workaround from
3813 		 * AMD for these chips. It is possible to specify the
3814 		 * affected processor models on the command line, hence
3815 		 * the need to ignore the workaround.
3816 		 */
3817 		break;
3818 	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
3819 	case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
3820 	case HV_X64_MSR_SYNDBG_OPTIONS:
3821 	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
3822 	case HV_X64_MSR_CRASH_CTL:
3823 	case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
3824 	case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
3825 	case HV_X64_MSR_TSC_EMULATION_CONTROL:
3826 	case HV_X64_MSR_TSC_EMULATION_STATUS:
3827 		return kvm_hv_set_msr_common(vcpu, msr, data,
3828 					     msr_info->host_initiated);
3829 	case MSR_IA32_BBL_CR_CTL3:
3830 		/* Drop writes to this legacy MSR -- see rdmsr
3831 		 * counterpart for further detail.
3832 		 */
3833 		if (report_ignored_msrs)
3834 			vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
3835 				msr, data);
3836 		break;
3837 	case MSR_AMD64_OSVW_ID_LENGTH:
3838 		if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
3839 			return 1;
3840 		vcpu->arch.osvw.length = data;
3841 		break;
3842 	case MSR_AMD64_OSVW_STATUS:
3843 		if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
3844 			return 1;
3845 		vcpu->arch.osvw.status = data;
3846 		break;
3847 	case MSR_PLATFORM_INFO:
3848 		if (!msr_info->host_initiated ||
3849 		    (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
3850 		     cpuid_fault_enabled(vcpu)))
3851 			return 1;
3852 		vcpu->arch.msr_platform_info = data;
3853 		break;
3854 	case MSR_MISC_FEATURES_ENABLES:
3855 		if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT ||
3856 		    (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
3857 		     !supports_cpuid_fault(vcpu)))
3858 			return 1;
3859 		vcpu->arch.msr_misc_features_enables = data;
3860 		break;
3861 #ifdef CONFIG_X86_64
3862 	case MSR_IA32_XFD:
3863 		if (!msr_info->host_initiated &&
3864 		    !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
3865 			return 1;
3866 
3867 		if (data & ~kvm_guest_supported_xfd(vcpu))
3868 			return 1;
3869 
3870 		fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data);
3871 		break;
3872 	case MSR_IA32_XFD_ERR:
3873 		if (!msr_info->host_initiated &&
3874 		    !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
3875 			return 1;
3876 
3877 		if (data & ~kvm_guest_supported_xfd(vcpu))
3878 			return 1;
3879 
3880 		vcpu->arch.guest_fpu.xfd_err = data;
3881 		break;
3882 #endif
3883 	case MSR_IA32_PEBS_ENABLE:
3884 	case MSR_IA32_DS_AREA:
3885 	case MSR_PEBS_DATA_CFG:
3886 	case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
3887 		if (kvm_pmu_is_valid_msr(vcpu, msr))
3888 			return kvm_pmu_set_msr(vcpu, msr_info);
3889 		/*
3890 		 * Userspace is allowed to write '0' to MSRs that KVM reports
3891 		 * as to-be-saved, even if an MSRs isn't fully supported.
3892 		 */
3893 		return !msr_info->host_initiated || data;
3894 	default:
3895 		if (kvm_pmu_is_valid_msr(vcpu, msr))
3896 			return kvm_pmu_set_msr(vcpu, msr_info);
3897 		return KVM_MSR_RET_INVALID;
3898 	}
3899 	return 0;
3900 }
3901 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
3902 
3903 static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
3904 {
3905 	u64 data;
3906 	u64 mcg_cap = vcpu->arch.mcg_cap;
3907 	unsigned bank_num = mcg_cap & 0xff;
3908 	u32 offset, last_msr;
3909 
3910 	switch (msr) {
3911 	case MSR_IA32_P5_MC_ADDR:
3912 	case MSR_IA32_P5_MC_TYPE:
3913 		data = 0;
3914 		break;
3915 	case MSR_IA32_MCG_CAP:
3916 		data = vcpu->arch.mcg_cap;
3917 		break;
3918 	case MSR_IA32_MCG_CTL:
3919 		if (!(mcg_cap & MCG_CTL_P) && !host)
3920 			return 1;
3921 		data = vcpu->arch.mcg_ctl;
3922 		break;
3923 	case MSR_IA32_MCG_STATUS:
3924 		data = vcpu->arch.mcg_status;
3925 		break;
3926 	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
3927 		last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1;
3928 		if (msr > last_msr)
3929 			return 1;
3930 
3931 		if (!(mcg_cap & MCG_CMCI_P) && !host)
3932 			return 1;
3933 		offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2,
3934 					    last_msr + 1 - MSR_IA32_MC0_CTL2);
3935 		data = vcpu->arch.mci_ctl2_banks[offset];
3936 		break;
3937 	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
3938 		last_msr = MSR_IA32_MCx_CTL(bank_num) - 1;
3939 		if (msr > last_msr)
3940 			return 1;
3941 
3942 		offset = array_index_nospec(msr - MSR_IA32_MC0_CTL,
3943 					    last_msr + 1 - MSR_IA32_MC0_CTL);
3944 		data = vcpu->arch.mce_banks[offset];
3945 		break;
3946 	default:
3947 		return 1;
3948 	}
3949 	*pdata = data;
3950 	return 0;
3951 }
3952 
3953 int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
3954 {
3955 	switch (msr_info->index) {
3956 	case MSR_IA32_PLATFORM_ID:
3957 	case MSR_IA32_EBL_CR_POWERON:
3958 	case MSR_IA32_LASTBRANCHFROMIP:
3959 	case MSR_IA32_LASTBRANCHTOIP:
3960 	case MSR_IA32_LASTINTFROMIP:
3961 	case MSR_IA32_LASTINTTOIP:
3962 	case MSR_AMD64_SYSCFG:
3963 	case MSR_K8_TSEG_ADDR:
3964 	case MSR_K8_TSEG_MASK:
3965 	case MSR_VM_HSAVE_PA:
3966 	case MSR_K8_INT_PENDING_MSG:
3967 	case MSR_AMD64_NB_CFG:
3968 	case MSR_FAM10H_MMIO_CONF_BASE:
3969 	case MSR_AMD64_BU_CFG2:
3970 	case MSR_IA32_PERF_CTL:
3971 	case MSR_AMD64_DC_CFG:
3972 	case MSR_F15H_EX_CFG:
3973 	/*
3974 	 * Intel Sandy Bridge CPUs must support the RAPL (running average power
3975 	 * limit) MSRs. Just return 0, as we do not want to expose the host
3976 	 * data here. Do not conditionalize this on CPUID, as KVM does not do
3977 	 * so for existing CPU-specific MSRs.
3978 	 */
3979 	case MSR_RAPL_POWER_UNIT:
3980 	case MSR_PP0_ENERGY_STATUS:	/* Power plane 0 (core) */
3981 	case MSR_PP1_ENERGY_STATUS:	/* Power plane 1 (graphics uncore) */
3982 	case MSR_PKG_ENERGY_STATUS:	/* Total package */
3983 	case MSR_DRAM_ENERGY_STATUS:	/* DRAM controller */
3984 		msr_info->data = 0;
3985 		break;
3986 	case MSR_IA32_PEBS_ENABLE:
3987 	case MSR_IA32_DS_AREA:
3988 	case MSR_PEBS_DATA_CFG:
3989 	case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
3990 		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
3991 			return kvm_pmu_get_msr(vcpu, msr_info);
3992 		/*
3993 		 * Userspace is allowed to read MSRs that KVM reports as
3994 		 * to-be-saved, even if an MSR isn't fully supported.
3995 		 */
3996 		if (!msr_info->host_initiated)
3997 			return 1;
3998 		msr_info->data = 0;
3999 		break;
4000 	case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
4001 	case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
4002 	case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
4003 	case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
4004 		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
4005 			return kvm_pmu_get_msr(vcpu, msr_info);
4006 		msr_info->data = 0;
4007 		break;
4008 	case MSR_IA32_UCODE_REV:
4009 		msr_info->data = vcpu->arch.microcode_version;
4010 		break;
4011 	case MSR_IA32_ARCH_CAPABILITIES:
4012 		if (!msr_info->host_initiated &&
4013 		    !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
4014 			return 1;
4015 		msr_info->data = vcpu->arch.arch_capabilities;
4016 		break;
4017 	case MSR_IA32_PERF_CAPABILITIES:
4018 		if (!msr_info->host_initiated &&
4019 		    !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
4020 			return 1;
4021 		msr_info->data = vcpu->arch.perf_capabilities;
4022 		break;
4023 	case MSR_IA32_POWER_CTL:
4024 		msr_info->data = vcpu->arch.msr_ia32_power_ctl;
4025 		break;
4026 	case MSR_IA32_TSC: {
4027 		/*
4028 		 * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
4029 		 * even when not intercepted. AMD manual doesn't explicitly
4030 		 * state this but appears to behave the same.
4031 		 *
4032 		 * On userspace reads and writes, however, we unconditionally
4033 		 * return L1's TSC value to ensure backwards-compatible
4034 		 * behavior for migration.
4035 		 */
4036 		u64 offset, ratio;
4037 
4038 		if (msr_info->host_initiated) {
4039 			offset = vcpu->arch.l1_tsc_offset;
4040 			ratio = vcpu->arch.l1_tsc_scaling_ratio;
4041 		} else {
4042 			offset = vcpu->arch.tsc_offset;
4043 			ratio = vcpu->arch.tsc_scaling_ratio;
4044 		}
4045 
4046 		msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset;
4047 		break;
4048 	}
4049 	case MSR_MTRRcap:
4050 	case 0x200 ... MSR_IA32_MC0_CTL2 - 1:
4051 	case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff:
4052 		return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
4053 	case 0xcd: /* fsb frequency */
4054 		msr_info->data = 3;
4055 		break;
4056 		/*
4057 		 * MSR_EBC_FREQUENCY_ID
4058 		 * Conservative value valid for even the basic CPU models.
4059 		 * Models 0,1: 000 in bits 23:21 indicating a bus speed of
4060 		 * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
4061 		 * and 266MHz for model 3, or 4. Set Core Clock
4062 		 * Frequency to System Bus Frequency Ratio to 1 (bits
4063 		 * 31:24) even though these are only valid for CPU
4064 		 * models > 2, however guests may end up dividing or
4065 		 * multiplying by zero otherwise.
4066 		 */
4067 	case MSR_EBC_FREQUENCY_ID:
4068 		msr_info->data = 1 << 24;
4069 		break;
4070 	case MSR_IA32_APICBASE:
4071 		msr_info->data = kvm_get_apic_base(vcpu);
4072 		break;
4073 	case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
4074 		return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
4075 	case MSR_IA32_TSC_DEADLINE:
4076 		msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
4077 		break;
4078 	case MSR_IA32_TSC_ADJUST:
4079 		msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
4080 		break;
4081 	case MSR_IA32_MISC_ENABLE:
4082 		msr_info->data = vcpu->arch.ia32_misc_enable_msr;
4083 		break;
4084 	case MSR_IA32_SMBASE:
4085 		if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated)
4086 			return 1;
4087 		msr_info->data = vcpu->arch.smbase;
4088 		break;
4089 	case MSR_SMI_COUNT:
4090 		msr_info->data = vcpu->arch.smi_count;
4091 		break;
4092 	case MSR_IA32_PERF_STATUS:
4093 		/* TSC increment by tick */
4094 		msr_info->data = 1000ULL;
4095 		/* CPU multiplier */
4096 		msr_info->data |= (((uint64_t)4ULL) << 40);
4097 		break;
4098 	case MSR_EFER:
4099 		msr_info->data = vcpu->arch.efer;
4100 		break;
4101 	case MSR_KVM_WALL_CLOCK:
4102 		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
4103 			return 1;
4104 
4105 		msr_info->data = vcpu->kvm->arch.wall_clock;
4106 		break;
4107 	case MSR_KVM_WALL_CLOCK_NEW:
4108 		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
4109 			return 1;
4110 
4111 		msr_info->data = vcpu->kvm->arch.wall_clock;
4112 		break;
4113 	case MSR_KVM_SYSTEM_TIME:
4114 		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
4115 			return 1;
4116 
4117 		msr_info->data = vcpu->arch.time;
4118 		break;
4119 	case MSR_KVM_SYSTEM_TIME_NEW:
4120 		if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
4121 			return 1;
4122 
4123 		msr_info->data = vcpu->arch.time;
4124 		break;
4125 	case MSR_KVM_ASYNC_PF_EN:
4126 		if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
4127 			return 1;
4128 
4129 		msr_info->data = vcpu->arch.apf.msr_en_val;
4130 		break;
4131 	case MSR_KVM_ASYNC_PF_INT:
4132 		if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
4133 			return 1;
4134 
4135 		msr_info->data = vcpu->arch.apf.msr_int_val;
4136 		break;
4137 	case MSR_KVM_ASYNC_PF_ACK:
4138 		if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
4139 			return 1;
4140 
4141 		msr_info->data = 0;
4142 		break;
4143 	case MSR_KVM_STEAL_TIME:
4144 		if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
4145 			return 1;
4146 
4147 		msr_info->data = vcpu->arch.st.msr_val;
4148 		break;
4149 	case MSR_KVM_PV_EOI_EN:
4150 		if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
4151 			return 1;
4152 
4153 		msr_info->data = vcpu->arch.pv_eoi.msr_val;
4154 		break;
4155 	case MSR_KVM_POLL_CONTROL:
4156 		if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
4157 			return 1;
4158 
4159 		msr_info->data = vcpu->arch.msr_kvm_poll_control;
4160 		break;
4161 	case MSR_IA32_P5_MC_ADDR:
4162 	case MSR_IA32_P5_MC_TYPE:
4163 	case MSR_IA32_MCG_CAP:
4164 	case MSR_IA32_MCG_CTL:
4165 	case MSR_IA32_MCG_STATUS:
4166 	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
4167 	case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1:
4168 		return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
4169 				   msr_info->host_initiated);
4170 	case MSR_IA32_XSS:
4171 		if (!msr_info->host_initiated &&
4172 		    !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
4173 			return 1;
4174 		msr_info->data = vcpu->arch.ia32_xss;
4175 		break;
4176 	case MSR_K7_CLK_CTL:
4177 		/*
4178 		 * Provide expected ramp-up count for K7. All other
4179 		 * are set to zero, indicating minimum divisors for
4180 		 * every field.
4181 		 *
4182 		 * This prevents guest kernels on AMD host with CPU
4183 		 * type 6, model 8 and higher from exploding due to
4184 		 * the rdmsr failing.
4185 		 */
4186 		msr_info->data = 0x20000000;
4187 		break;
4188 	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
4189 	case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
4190 	case HV_X64_MSR_SYNDBG_OPTIONS:
4191 	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
4192 	case HV_X64_MSR_CRASH_CTL:
4193 	case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
4194 	case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
4195 	case HV_X64_MSR_TSC_EMULATION_CONTROL:
4196 	case HV_X64_MSR_TSC_EMULATION_STATUS:
4197 		return kvm_hv_get_msr_common(vcpu,
4198 					     msr_info->index, &msr_info->data,
4199 					     msr_info->host_initiated);
4200 	case MSR_IA32_BBL_CR_CTL3:
4201 		/* This legacy MSR exists but isn't fully documented in current
4202 		 * silicon.  It is however accessed by winxp in very narrow
4203 		 * scenarios where it sets bit #19, itself documented as
4204 		 * a "reserved" bit.  Best effort attempt to source coherent
4205 		 * read data here should the balance of the register be
4206 		 * interpreted by the guest:
4207 		 *
4208 		 * L2 cache control register 3: 64GB range, 256KB size,
4209 		 * enabled, latency 0x1, configured
4210 		 */
4211 		msr_info->data = 0xbe702111;
4212 		break;
4213 	case MSR_AMD64_OSVW_ID_LENGTH:
4214 		if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
4215 			return 1;
4216 		msr_info->data = vcpu->arch.osvw.length;
4217 		break;
4218 	case MSR_AMD64_OSVW_STATUS:
4219 		if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
4220 			return 1;
4221 		msr_info->data = vcpu->arch.osvw.status;
4222 		break;
4223 	case MSR_PLATFORM_INFO:
4224 		if (!msr_info->host_initiated &&
4225 		    !vcpu->kvm->arch.guest_can_read_msr_platform_info)
4226 			return 1;
4227 		msr_info->data = vcpu->arch.msr_platform_info;
4228 		break;
4229 	case MSR_MISC_FEATURES_ENABLES:
4230 		msr_info->data = vcpu->arch.msr_misc_features_enables;
4231 		break;
4232 	case MSR_K7_HWCR:
4233 		msr_info->data = vcpu->arch.msr_hwcr;
4234 		break;
4235 #ifdef CONFIG_X86_64
4236 	case MSR_IA32_XFD:
4237 		if (!msr_info->host_initiated &&
4238 		    !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
4239 			return 1;
4240 
4241 		msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd;
4242 		break;
4243 	case MSR_IA32_XFD_ERR:
4244 		if (!msr_info->host_initiated &&
4245 		    !guest_cpuid_has(vcpu, X86_FEATURE_XFD))
4246 			return 1;
4247 
4248 		msr_info->data = vcpu->arch.guest_fpu.xfd_err;
4249 		break;
4250 #endif
4251 	default:
4252 		if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
4253 			return kvm_pmu_get_msr(vcpu, msr_info);
4254 		return KVM_MSR_RET_INVALID;
4255 	}
4256 	return 0;
4257 }
4258 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
4259 
4260 /*
4261  * Read or write a bunch of msrs. All parameters are kernel addresses.
4262  *
4263  * @return number of msrs set successfully.
4264  */
4265 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
4266 		    struct kvm_msr_entry *entries,
4267 		    int (*do_msr)(struct kvm_vcpu *vcpu,
4268 				  unsigned index, u64 *data))
4269 {
4270 	int i;
4271 
4272 	for (i = 0; i < msrs->nmsrs; ++i)
4273 		if (do_msr(vcpu, entries[i].index, &entries[i].data))
4274 			break;
4275 
4276 	return i;
4277 }
4278 
4279 /*
4280  * Read or write a bunch of msrs. Parameters are user addresses.
4281  *
4282  * @return number of msrs set successfully.
4283  */
4284 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
4285 		  int (*do_msr)(struct kvm_vcpu *vcpu,
4286 				unsigned index, u64 *data),
4287 		  int writeback)
4288 {
4289 	struct kvm_msrs msrs;
4290 	struct kvm_msr_entry *entries;
4291 	int r, n;
4292 	unsigned size;
4293 
4294 	r = -EFAULT;
4295 	if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
4296 		goto out;
4297 
4298 	r = -E2BIG;
4299 	if (msrs.nmsrs >= MAX_IO_MSRS)
4300 		goto out;
4301 
4302 	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
4303 	entries = memdup_user(user_msrs->entries, size);
4304 	if (IS_ERR(entries)) {
4305 		r = PTR_ERR(entries);
4306 		goto out;
4307 	}
4308 
4309 	r = n = __msr_io(vcpu, &msrs, entries, do_msr);
4310 	if (r < 0)
4311 		goto out_free;
4312 
4313 	r = -EFAULT;
4314 	if (writeback && copy_to_user(user_msrs->entries, entries, size))
4315 		goto out_free;
4316 
4317 	r = n;
4318 
4319 out_free:
4320 	kfree(entries);
4321 out:
4322 	return r;
4323 }
4324 
4325 static inline bool kvm_can_mwait_in_guest(void)
4326 {
4327 	return boot_cpu_has(X86_FEATURE_MWAIT) &&
4328 		!boot_cpu_has_bug(X86_BUG_MONITOR) &&
4329 		boot_cpu_has(X86_FEATURE_ARAT);
4330 }
4331 
4332 static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu,
4333 					    struct kvm_cpuid2 __user *cpuid_arg)
4334 {
4335 	struct kvm_cpuid2 cpuid;
4336 	int r;
4337 
4338 	r = -EFAULT;
4339 	if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
4340 		return r;
4341 
4342 	r = kvm_get_hv_cpuid(vcpu, &cpuid, cpuid_arg->entries);
4343 	if (r)
4344 		return r;
4345 
4346 	r = -EFAULT;
4347 	if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
4348 		return r;
4349 
4350 	return 0;
4351 }
4352 
4353 int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
4354 {
4355 	int r = 0;
4356 
4357 	switch (ext) {
4358 	case KVM_CAP_IRQCHIP:
4359 	case KVM_CAP_HLT:
4360 	case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
4361 	case KVM_CAP_SET_TSS_ADDR:
4362 	case KVM_CAP_EXT_CPUID:
4363 	case KVM_CAP_EXT_EMUL_CPUID:
4364 	case KVM_CAP_CLOCKSOURCE:
4365 	case KVM_CAP_PIT:
4366 	case KVM_CAP_NOP_IO_DELAY:
4367 	case KVM_CAP_MP_STATE:
4368 	case KVM_CAP_SYNC_MMU:
4369 	case KVM_CAP_USER_NMI:
4370 	case KVM_CAP_REINJECT_CONTROL:
4371 	case KVM_CAP_IRQ_INJECT_STATUS:
4372 	case KVM_CAP_IOEVENTFD:
4373 	case KVM_CAP_IOEVENTFD_NO_LENGTH:
4374 	case KVM_CAP_PIT2:
4375 	case KVM_CAP_PIT_STATE2:
4376 	case KVM_CAP_SET_IDENTITY_MAP_ADDR:
4377 	case KVM_CAP_VCPU_EVENTS:
4378 	case KVM_CAP_HYPERV:
4379 	case KVM_CAP_HYPERV_VAPIC:
4380 	case KVM_CAP_HYPERV_SPIN:
4381 	case KVM_CAP_HYPERV_SYNIC:
4382 	case KVM_CAP_HYPERV_SYNIC2:
4383 	case KVM_CAP_HYPERV_VP_INDEX:
4384 	case KVM_CAP_HYPERV_EVENTFD:
4385 	case KVM_CAP_HYPERV_TLBFLUSH:
4386 	case KVM_CAP_HYPERV_SEND_IPI:
4387 	case KVM_CAP_HYPERV_CPUID:
4388 	case KVM_CAP_HYPERV_ENFORCE_CPUID:
4389 	case KVM_CAP_SYS_HYPERV_CPUID:
4390 	case KVM_CAP_PCI_SEGMENT:
4391 	case KVM_CAP_DEBUGREGS:
4392 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
4393 	case KVM_CAP_XSAVE:
4394 	case KVM_CAP_ASYNC_PF:
4395 	case KVM_CAP_ASYNC_PF_INT:
4396 	case KVM_CAP_GET_TSC_KHZ:
4397 	case KVM_CAP_KVMCLOCK_CTRL:
4398 	case KVM_CAP_READONLY_MEM:
4399 	case KVM_CAP_HYPERV_TIME:
4400 	case KVM_CAP_IOAPIC_POLARITY_IGNORED:
4401 	case KVM_CAP_TSC_DEADLINE_TIMER:
4402 	case KVM_CAP_DISABLE_QUIRKS:
4403 	case KVM_CAP_SET_BOOT_CPU_ID:
4404  	case KVM_CAP_SPLIT_IRQCHIP:
4405 	case KVM_CAP_IMMEDIATE_EXIT:
4406 	case KVM_CAP_PMU_EVENT_FILTER:
4407 	case KVM_CAP_GET_MSR_FEATURES:
4408 	case KVM_CAP_MSR_PLATFORM_INFO:
4409 	case KVM_CAP_EXCEPTION_PAYLOAD:
4410 	case KVM_CAP_X86_TRIPLE_FAULT_EVENT:
4411 	case KVM_CAP_SET_GUEST_DEBUG:
4412 	case KVM_CAP_LAST_CPU:
4413 	case KVM_CAP_X86_USER_SPACE_MSR:
4414 	case KVM_CAP_X86_MSR_FILTER:
4415 	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
4416 #ifdef CONFIG_X86_SGX_KVM
4417 	case KVM_CAP_SGX_ATTRIBUTE:
4418 #endif
4419 	case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
4420 	case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
4421 	case KVM_CAP_SREGS2:
4422 	case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
4423 	case KVM_CAP_VCPU_ATTRIBUTES:
4424 	case KVM_CAP_SYS_ATTRIBUTES:
4425 	case KVM_CAP_VAPIC:
4426 	case KVM_CAP_ENABLE_CAP:
4427 	case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
4428 		r = 1;
4429 		break;
4430 	case KVM_CAP_EXIT_HYPERCALL:
4431 		r = KVM_EXIT_HYPERCALL_VALID_MASK;
4432 		break;
4433 	case KVM_CAP_SET_GUEST_DEBUG2:
4434 		return KVM_GUESTDBG_VALID_MASK;
4435 #ifdef CONFIG_KVM_XEN
4436 	case KVM_CAP_XEN_HVM:
4437 		r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
4438 		    KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
4439 		    KVM_XEN_HVM_CONFIG_SHARED_INFO |
4440 		    KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
4441 		    KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
4442 		if (sched_info_on())
4443 			r |= KVM_XEN_HVM_CONFIG_RUNSTATE |
4444 			     KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG;
4445 		break;
4446 #endif
4447 	case KVM_CAP_SYNC_REGS:
4448 		r = KVM_SYNC_X86_VALID_FIELDS;
4449 		break;
4450 	case KVM_CAP_ADJUST_CLOCK:
4451 		r = KVM_CLOCK_VALID_FLAGS;
4452 		break;
4453 	case KVM_CAP_X86_DISABLE_EXITS:
4454 		r |=  KVM_X86_DISABLE_EXITS_HLT | KVM_X86_DISABLE_EXITS_PAUSE |
4455 		      KVM_X86_DISABLE_EXITS_CSTATE;
4456 		if(kvm_can_mwait_in_guest())
4457 			r |= KVM_X86_DISABLE_EXITS_MWAIT;
4458 		break;
4459 	case KVM_CAP_X86_SMM:
4460 		if (!IS_ENABLED(CONFIG_KVM_SMM))
4461 			break;
4462 
4463 		/* SMBASE is usually relocated above 1M on modern chipsets,
4464 		 * and SMM handlers might indeed rely on 4G segment limits,
4465 		 * so do not report SMM to be available if real mode is
4466 		 * emulated via vm86 mode.  Still, do not go to great lengths
4467 		 * to avoid userspace's usage of the feature, because it is a
4468 		 * fringe case that is not enabled except via specific settings
4469 		 * of the module parameters.
4470 		 */
4471 		r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE);
4472 		break;
4473 	case KVM_CAP_NR_VCPUS:
4474 		r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS);
4475 		break;
4476 	case KVM_CAP_MAX_VCPUS:
4477 		r = KVM_MAX_VCPUS;
4478 		break;
4479 	case KVM_CAP_MAX_VCPU_ID:
4480 		r = KVM_MAX_VCPU_IDS;
4481 		break;
4482 	case KVM_CAP_PV_MMU:	/* obsolete */
4483 		r = 0;
4484 		break;
4485 	case KVM_CAP_MCE:
4486 		r = KVM_MAX_MCE_BANKS;
4487 		break;
4488 	case KVM_CAP_XCRS:
4489 		r = boot_cpu_has(X86_FEATURE_XSAVE);
4490 		break;
4491 	case KVM_CAP_TSC_CONTROL:
4492 	case KVM_CAP_VM_TSC_CONTROL:
4493 		r = kvm_caps.has_tsc_control;
4494 		break;
4495 	case KVM_CAP_X2APIC_API:
4496 		r = KVM_X2APIC_API_VALID_FLAGS;
4497 		break;
4498 	case KVM_CAP_NESTED_STATE:
4499 		r = kvm_x86_ops.nested_ops->get_state ?
4500 			kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
4501 		break;
4502 	case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
4503 		r = kvm_x86_ops.enable_l2_tlb_flush != NULL;
4504 		break;
4505 	case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
4506 		r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
4507 		break;
4508 	case KVM_CAP_SMALLER_MAXPHYADDR:
4509 		r = (int) allow_smaller_maxphyaddr;
4510 		break;
4511 	case KVM_CAP_STEAL_TIME:
4512 		r = sched_info_on();
4513 		break;
4514 	case KVM_CAP_X86_BUS_LOCK_EXIT:
4515 		if (kvm_caps.has_bus_lock_exit)
4516 			r = KVM_BUS_LOCK_DETECTION_OFF |
4517 			    KVM_BUS_LOCK_DETECTION_EXIT;
4518 		else
4519 			r = 0;
4520 		break;
4521 	case KVM_CAP_XSAVE2: {
4522 		u64 guest_perm = xstate_get_guest_group_perm();
4523 
4524 		r = xstate_required_size(kvm_caps.supported_xcr0 & guest_perm, false);
4525 		if (r < sizeof(struct kvm_xsave))
4526 			r = sizeof(struct kvm_xsave);
4527 		break;
4528 	}
4529 	case KVM_CAP_PMU_CAPABILITY:
4530 		r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0;
4531 		break;
4532 	case KVM_CAP_DISABLE_QUIRKS2:
4533 		r = KVM_X86_VALID_QUIRKS;
4534 		break;
4535 	case KVM_CAP_X86_NOTIFY_VMEXIT:
4536 		r = kvm_caps.has_notify_vmexit;
4537 		break;
4538 	default:
4539 		break;
4540 	}
4541 	return r;
4542 }
4543 
4544 static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr)
4545 {
4546 	void __user *uaddr = (void __user*)(unsigned long)attr->addr;
4547 
4548 	if ((u64)(unsigned long)uaddr != attr->addr)
4549 		return ERR_PTR_USR(-EFAULT);
4550 	return uaddr;
4551 }
4552 
4553 static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
4554 {
4555 	u64 __user *uaddr = kvm_get_attr_addr(attr);
4556 
4557 	if (attr->group)
4558 		return -ENXIO;
4559 
4560 	if (IS_ERR(uaddr))
4561 		return PTR_ERR(uaddr);
4562 
4563 	switch (attr->attr) {
4564 	case KVM_X86_XCOMP_GUEST_SUPP:
4565 		if (put_user(kvm_caps.supported_xcr0, uaddr))
4566 			return -EFAULT;
4567 		return 0;
4568 	default:
4569 		return -ENXIO;
4570 		break;
4571 	}
4572 }
4573 
4574 static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr)
4575 {
4576 	if (attr->group)
4577 		return -ENXIO;
4578 
4579 	switch (attr->attr) {
4580 	case KVM_X86_XCOMP_GUEST_SUPP:
4581 		return 0;
4582 	default:
4583 		return -ENXIO;
4584 	}
4585 }
4586 
4587 long kvm_arch_dev_ioctl(struct file *filp,
4588 			unsigned int ioctl, unsigned long arg)
4589 {
4590 	void __user *argp = (void __user *)arg;
4591 	long r;
4592 
4593 	switch (ioctl) {
4594 	case KVM_GET_MSR_INDEX_LIST: {
4595 		struct kvm_msr_list __user *user_msr_list = argp;
4596 		struct kvm_msr_list msr_list;
4597 		unsigned n;
4598 
4599 		r = -EFAULT;
4600 		if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
4601 			goto out;
4602 		n = msr_list.nmsrs;
4603 		msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
4604 		if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
4605 			goto out;
4606 		r = -E2BIG;
4607 		if (n < msr_list.nmsrs)
4608 			goto out;
4609 		r = -EFAULT;
4610 		if (copy_to_user(user_msr_list->indices, &msrs_to_save,
4611 				 num_msrs_to_save * sizeof(u32)))
4612 			goto out;
4613 		if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
4614 				 &emulated_msrs,
4615 				 num_emulated_msrs * sizeof(u32)))
4616 			goto out;
4617 		r = 0;
4618 		break;
4619 	}
4620 	case KVM_GET_SUPPORTED_CPUID:
4621 	case KVM_GET_EMULATED_CPUID: {
4622 		struct kvm_cpuid2 __user *cpuid_arg = argp;
4623 		struct kvm_cpuid2 cpuid;
4624 
4625 		r = -EFAULT;
4626 		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
4627 			goto out;
4628 
4629 		r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
4630 					    ioctl);
4631 		if (r)
4632 			goto out;
4633 
4634 		r = -EFAULT;
4635 		if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
4636 			goto out;
4637 		r = 0;
4638 		break;
4639 	}
4640 	case KVM_X86_GET_MCE_CAP_SUPPORTED:
4641 		r = -EFAULT;
4642 		if (copy_to_user(argp, &kvm_caps.supported_mce_cap,
4643 				 sizeof(kvm_caps.supported_mce_cap)))
4644 			goto out;
4645 		r = 0;
4646 		break;
4647 	case KVM_GET_MSR_FEATURE_INDEX_LIST: {
4648 		struct kvm_msr_list __user *user_msr_list = argp;
4649 		struct kvm_msr_list msr_list;
4650 		unsigned int n;
4651 
4652 		r = -EFAULT;
4653 		if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
4654 			goto out;
4655 		n = msr_list.nmsrs;
4656 		msr_list.nmsrs = num_msr_based_features;
4657 		if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
4658 			goto out;
4659 		r = -E2BIG;
4660 		if (n < msr_list.nmsrs)
4661 			goto out;
4662 		r = -EFAULT;
4663 		if (copy_to_user(user_msr_list->indices, &msr_based_features,
4664 				 num_msr_based_features * sizeof(u32)))
4665 			goto out;
4666 		r = 0;
4667 		break;
4668 	}
4669 	case KVM_GET_MSRS:
4670 		r = msr_io(NULL, argp, do_get_msr_feature, 1);
4671 		break;
4672 	case KVM_GET_SUPPORTED_HV_CPUID:
4673 		r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp);
4674 		break;
4675 	case KVM_GET_DEVICE_ATTR: {
4676 		struct kvm_device_attr attr;
4677 		r = -EFAULT;
4678 		if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4679 			break;
4680 		r = kvm_x86_dev_get_attr(&attr);
4681 		break;
4682 	}
4683 	case KVM_HAS_DEVICE_ATTR: {
4684 		struct kvm_device_attr attr;
4685 		r = -EFAULT;
4686 		if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
4687 			break;
4688 		r = kvm_x86_dev_has_attr(&attr);
4689 		break;
4690 	}
4691 	default:
4692 		r = -EINVAL;
4693 		break;
4694 	}
4695 out:
4696 	return r;
4697 }
4698 
4699 static void wbinvd_ipi(void *garbage)
4700 {
4701 	wbinvd();
4702 }
4703 
4704 static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
4705 {
4706 	return kvm_arch_has_noncoherent_dma(vcpu->kvm);
4707 }
4708 
4709 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
4710 {
4711 	/* Address WBINVD may be executed by guest */
4712 	if (need_emulate_wbinvd(vcpu)) {
4713 		if (static_call(kvm_x86_has_wbinvd_exit)())
4714 			cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
4715 		else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
4716 			smp_call_function_single(vcpu->cpu,
4717 					wbinvd_ipi, NULL, 1);
4718 	}
4719 
4720 	static_call(kvm_x86_vcpu_load)(vcpu, cpu);
4721 
4722 	/* Save host pkru register if supported */
4723 	vcpu->arch.host_pkru = read_pkru();
4724 
4725 	/* Apply any externally detected TSC adjustments (due to suspend) */
4726 	if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
4727 		adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
4728 		vcpu->arch.tsc_offset_adjustment = 0;
4729 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
4730 	}
4731 
4732 	if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) {
4733 		s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
4734 				rdtsc() - vcpu->arch.last_host_tsc;
4735 		if (tsc_delta < 0)
4736 			mark_tsc_unstable("KVM discovered backwards TSC");
4737 
4738 		if (kvm_check_tsc_unstable()) {
4739 			u64 offset = kvm_compute_l1_tsc_offset(vcpu,
4740 						vcpu->arch.last_guest_tsc);
4741 			kvm_vcpu_write_tsc_offset(vcpu, offset);
4742 			vcpu->arch.tsc_catchup = 1;
4743 		}
4744 
4745 		if (kvm_lapic_hv_timer_in_use(vcpu))
4746 			kvm_lapic_restart_hv_timer(vcpu);
4747 
4748 		/*
4749 		 * On a host with synchronized TSC, there is no need to update
4750 		 * kvmclock on vcpu->cpu migration
4751 		 */
4752 		if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
4753 			kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
4754 		if (vcpu->cpu != cpu)
4755 			kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
4756 		vcpu->cpu = cpu;
4757 	}
4758 
4759 	kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
4760 }
4761 
4762 static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
4763 {
4764 	struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
4765 	struct kvm_steal_time __user *st;
4766 	struct kvm_memslots *slots;
4767 	static const u8 preempted = KVM_VCPU_PREEMPTED;
4768 	gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS;
4769 
4770 	/*
4771 	 * The vCPU can be marked preempted if and only if the VM-Exit was on
4772 	 * an instruction boundary and will not trigger guest emulation of any
4773 	 * kind (see vcpu_run).  Vendor specific code controls (conservatively)
4774 	 * when this is true, for example allowing the vCPU to be marked
4775 	 * preempted if and only if the VM-Exit was due to a host interrupt.
4776 	 */
4777 	if (!vcpu->arch.at_instruction_boundary) {
4778 		vcpu->stat.preemption_other++;
4779 		return;
4780 	}
4781 
4782 	vcpu->stat.preemption_reported++;
4783 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
4784 		return;
4785 
4786 	if (vcpu->arch.st.preempted)
4787 		return;
4788 
4789 	/* This happens on process exit */
4790 	if (unlikely(current->mm != vcpu->kvm->mm))
4791 		return;
4792 
4793 	slots = kvm_memslots(vcpu->kvm);
4794 
4795 	if (unlikely(slots->generation != ghc->generation ||
4796 		     gpa != ghc->gpa ||
4797 		     kvm_is_error_hva(ghc->hva) || !ghc->memslot))
4798 		return;
4799 
4800 	st = (struct kvm_steal_time __user *)ghc->hva;
4801 	BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted));
4802 
4803 	if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
4804 		vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
4805 
4806 	mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
4807 }
4808 
4809 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
4810 {
4811 	int idx;
4812 
4813 	if (vcpu->preempted) {
4814 		if (!vcpu->arch.guest_state_protected)
4815 			vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu);
4816 
4817 		/*
4818 		 * Take the srcu lock as memslots will be accessed to check the gfn
4819 		 * cache generation against the memslots generation.
4820 		 */
4821 		idx = srcu_read_lock(&vcpu->kvm->srcu);
4822 		if (kvm_xen_msr_enabled(vcpu->kvm))
4823 			kvm_xen_runstate_set_preempted(vcpu);
4824 		else
4825 			kvm_steal_time_set_preempted(vcpu);
4826 		srcu_read_unlock(&vcpu->kvm->srcu, idx);
4827 	}
4828 
4829 	static_call(kvm_x86_vcpu_put)(vcpu);
4830 	vcpu->arch.last_host_tsc = rdtsc();
4831 }
4832 
4833 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
4834 				    struct kvm_lapic_state *s)
4835 {
4836 	static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
4837 
4838 	return kvm_apic_get_state(vcpu, s);
4839 }
4840 
4841 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
4842 				    struct kvm_lapic_state *s)
4843 {
4844 	int r;
4845 
4846 	r = kvm_apic_set_state(vcpu, s);
4847 	if (r)
4848 		return r;
4849 	update_cr8_intercept(vcpu);
4850 
4851 	return 0;
4852 }
4853 
4854 static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
4855 {
4856 	/*
4857 	 * We can accept userspace's request for interrupt injection
4858 	 * as long as we have a place to store the interrupt number.
4859 	 * The actual injection will happen when the CPU is able to
4860 	 * deliver the interrupt.
4861 	 */
4862 	if (kvm_cpu_has_extint(vcpu))
4863 		return false;
4864 
4865 	/* Acknowledging ExtINT does not happen if LINT0 is masked.  */
4866 	return (!lapic_in_kernel(vcpu) ||
4867 		kvm_apic_accept_pic_intr(vcpu));
4868 }
4869 
4870 static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
4871 {
4872 	/*
4873 	 * Do not cause an interrupt window exit if an exception
4874 	 * is pending or an event needs reinjection; userspace
4875 	 * might want to inject the interrupt manually using KVM_SET_REGS
4876 	 * or KVM_SET_SREGS.  For that to work, we must be at an
4877 	 * instruction boundary and with no events half-injected.
4878 	 */
4879 	return (kvm_arch_interrupt_allowed(vcpu) &&
4880 		kvm_cpu_accept_dm_intr(vcpu) &&
4881 		!kvm_event_needs_reinjection(vcpu) &&
4882 		!kvm_is_exception_pending(vcpu));
4883 }
4884 
4885 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
4886 				    struct kvm_interrupt *irq)
4887 {
4888 	if (irq->irq >= KVM_NR_INTERRUPTS)
4889 		return -EINVAL;
4890 
4891 	if (!irqchip_in_kernel(vcpu->kvm)) {
4892 		kvm_queue_interrupt(vcpu, irq->irq, false);
4893 		kvm_make_request(KVM_REQ_EVENT, vcpu);
4894 		return 0;
4895 	}
4896 
4897 	/*
4898 	 * With in-kernel LAPIC, we only use this to inject EXTINT, so
4899 	 * fail for in-kernel 8259.
4900 	 */
4901 	if (pic_in_kernel(vcpu->kvm))
4902 		return -ENXIO;
4903 
4904 	if (vcpu->arch.pending_external_vector != -1)
4905 		return -EEXIST;
4906 
4907 	vcpu->arch.pending_external_vector = irq->irq;
4908 	kvm_make_request(KVM_REQ_EVENT, vcpu);
4909 	return 0;
4910 }
4911 
4912 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
4913 {
4914 	kvm_inject_nmi(vcpu);
4915 
4916 	return 0;
4917 }
4918 
4919 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
4920 					   struct kvm_tpr_access_ctl *tac)
4921 {
4922 	if (tac->flags)
4923 		return -EINVAL;
4924 	vcpu->arch.tpr_access_reporting = !!tac->enabled;
4925 	return 0;
4926 }
4927 
4928 static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
4929 					u64 mcg_cap)
4930 {
4931 	int r;
4932 	unsigned bank_num = mcg_cap & 0xff, bank;
4933 
4934 	r = -EINVAL;
4935 	if (!bank_num || bank_num > KVM_MAX_MCE_BANKS)
4936 		goto out;
4937 	if (mcg_cap & ~(kvm_caps.supported_mce_cap | 0xff | 0xff0000))
4938 		goto out;
4939 	r = 0;
4940 	vcpu->arch.mcg_cap = mcg_cap;
4941 	/* Init IA32_MCG_CTL to all 1s */
4942 	if (mcg_cap & MCG_CTL_P)
4943 		vcpu->arch.mcg_ctl = ~(u64)0;
4944 	/* Init IA32_MCi_CTL to all 1s, IA32_MCi_CTL2 to all 0s */
4945 	for (bank = 0; bank < bank_num; bank++) {
4946 		vcpu->arch.mce_banks[bank*4] = ~(u64)0;
4947 		if (mcg_cap & MCG_CMCI_P)
4948 			vcpu->arch.mci_ctl2_banks[bank] = 0;
4949 	}
4950 
4951 	kvm_apic_after_set_mcg_cap(vcpu);
4952 
4953 	static_call(kvm_x86_setup_mce)(vcpu);
4954 out:
4955 	return r;
4956 }
4957 
4958 /*
4959  * Validate this is an UCNA (uncorrectable no action) error by checking the
4960  * MCG_STATUS and MCi_STATUS registers:
4961  * - none of the bits for Machine Check Exceptions are set
4962  * - both the VAL (valid) and UC (uncorrectable) bits are set
4963  * MCI_STATUS_PCC - Processor Context Corrupted
4964  * MCI_STATUS_S - Signaled as a Machine Check Exception
4965  * MCI_STATUS_AR - Software recoverable Action Required
4966  */
4967 static bool is_ucna(struct kvm_x86_mce *mce)
4968 {
4969 	return	!mce->mcg_status &&
4970 		!(mce->status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)) &&
4971 		(mce->status & MCI_STATUS_VAL) &&
4972 		(mce->status & MCI_STATUS_UC);
4973 }
4974 
4975 static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce, u64* banks)
4976 {
4977 	u64 mcg_cap = vcpu->arch.mcg_cap;
4978 
4979 	banks[1] = mce->status;
4980 	banks[2] = mce->addr;
4981 	banks[3] = mce->misc;
4982 	vcpu->arch.mcg_status = mce->mcg_status;
4983 
4984 	if (!(mcg_cap & MCG_CMCI_P) ||
4985 	    !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN))
4986 		return 0;
4987 
4988 	if (lapic_in_kernel(vcpu))
4989 		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI);
4990 
4991 	return 0;
4992 }
4993 
4994 static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
4995 				      struct kvm_x86_mce *mce)
4996 {
4997 	u64 mcg_cap = vcpu->arch.mcg_cap;
4998 	unsigned bank_num = mcg_cap & 0xff;
4999 	u64 *banks = vcpu->arch.mce_banks;
5000 
5001 	if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL))
5002 		return -EINVAL;
5003 
5004 	banks += array_index_nospec(4 * mce->bank, 4 * bank_num);
5005 
5006 	if (is_ucna(mce))
5007 		return kvm_vcpu_x86_set_ucna(vcpu, mce, banks);
5008 
5009 	/*
5010 	 * if IA32_MCG_CTL is not all 1s, the uncorrected error
5011 	 * reporting is disabled
5012 	 */
5013 	if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
5014 	    vcpu->arch.mcg_ctl != ~(u64)0)
5015 		return 0;
5016 	/*
5017 	 * if IA32_MCi_CTL is not all 1s, the uncorrected error
5018 	 * reporting is disabled for the bank
5019 	 */
5020 	if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
5021 		return 0;
5022 	if (mce->status & MCI_STATUS_UC) {
5023 		if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
5024 		    !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
5025 			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5026 			return 0;
5027 		}
5028 		if (banks[1] & MCI_STATUS_VAL)
5029 			mce->status |= MCI_STATUS_OVER;
5030 		banks[2] = mce->addr;
5031 		banks[3] = mce->misc;
5032 		vcpu->arch.mcg_status = mce->mcg_status;
5033 		banks[1] = mce->status;
5034 		kvm_queue_exception(vcpu, MC_VECTOR);
5035 	} else if (!(banks[1] & MCI_STATUS_VAL)
5036 		   || !(banks[1] & MCI_STATUS_UC)) {
5037 		if (banks[1] & MCI_STATUS_VAL)
5038 			mce->status |= MCI_STATUS_OVER;
5039 		banks[2] = mce->addr;
5040 		banks[3] = mce->misc;
5041 		banks[1] = mce->status;
5042 	} else
5043 		banks[1] |= MCI_STATUS_OVER;
5044 	return 0;
5045 }
5046 
5047 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
5048 					       struct kvm_vcpu_events *events)
5049 {
5050 	struct kvm_queued_exception *ex;
5051 
5052 	process_nmi(vcpu);
5053 
5054 #ifdef CONFIG_KVM_SMM
5055 	if (kvm_check_request(KVM_REQ_SMI, vcpu))
5056 		process_smi(vcpu);
5057 #endif
5058 
5059 	/*
5060 	 * KVM's ABI only allows for one exception to be migrated.  Luckily,
5061 	 * the only time there can be two queued exceptions is if there's a
5062 	 * non-exiting _injected_ exception, and a pending exiting exception.
5063 	 * In that case, ignore the VM-Exiting exception as it's an extension
5064 	 * of the injected exception.
5065 	 */
5066 	if (vcpu->arch.exception_vmexit.pending &&
5067 	    !vcpu->arch.exception.pending &&
5068 	    !vcpu->arch.exception.injected)
5069 		ex = &vcpu->arch.exception_vmexit;
5070 	else
5071 		ex = &vcpu->arch.exception;
5072 
5073 	/*
5074 	 * In guest mode, payload delivery should be deferred if the exception
5075 	 * will be intercepted by L1, e.g. KVM should not modifying CR2 if L1
5076 	 * intercepts #PF, ditto for DR6 and #DBs.  If the per-VM capability,
5077 	 * KVM_CAP_EXCEPTION_PAYLOAD, is not set, userspace may or may not
5078 	 * propagate the payload and so it cannot be safely deferred.  Deliver
5079 	 * the payload if the capability hasn't been requested.
5080 	 */
5081 	if (!vcpu->kvm->arch.exception_payload_enabled &&
5082 	    ex->pending && ex->has_payload)
5083 		kvm_deliver_exception_payload(vcpu, ex);
5084 
5085 	memset(events, 0, sizeof(*events));
5086 
5087 	/*
5088 	 * The API doesn't provide the instruction length for software
5089 	 * exceptions, so don't report them. As long as the guest RIP
5090 	 * isn't advanced, we should expect to encounter the exception
5091 	 * again.
5092 	 */
5093 	if (!kvm_exception_is_soft(ex->vector)) {
5094 		events->exception.injected = ex->injected;
5095 		events->exception.pending = ex->pending;
5096 		/*
5097 		 * For ABI compatibility, deliberately conflate
5098 		 * pending and injected exceptions when
5099 		 * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
5100 		 */
5101 		if (!vcpu->kvm->arch.exception_payload_enabled)
5102 			events->exception.injected |= ex->pending;
5103 	}
5104 	events->exception.nr = ex->vector;
5105 	events->exception.has_error_code = ex->has_error_code;
5106 	events->exception.error_code = ex->error_code;
5107 	events->exception_has_payload = ex->has_payload;
5108 	events->exception_payload = ex->payload;
5109 
5110 	events->interrupt.injected =
5111 		vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
5112 	events->interrupt.nr = vcpu->arch.interrupt.nr;
5113 	events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
5114 
5115 	events->nmi.injected = vcpu->arch.nmi_injected;
5116 	events->nmi.pending = vcpu->arch.nmi_pending != 0;
5117 	events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu);
5118 
5119 	/* events->sipi_vector is never valid when reporting to user space */
5120 
5121 #ifdef CONFIG_KVM_SMM
5122 	events->smi.smm = is_smm(vcpu);
5123 	events->smi.pending = vcpu->arch.smi_pending;
5124 	events->smi.smm_inside_nmi =
5125 		!!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
5126 #endif
5127 	events->smi.latched_init = kvm_lapic_latched_init(vcpu);
5128 
5129 	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
5130 			 | KVM_VCPUEVENT_VALID_SHADOW
5131 			 | KVM_VCPUEVENT_VALID_SMM);
5132 	if (vcpu->kvm->arch.exception_payload_enabled)
5133 		events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD;
5134 	if (vcpu->kvm->arch.triple_fault_event) {
5135 		events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5136 		events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT;
5137 	}
5138 }
5139 
5140 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
5141 					      struct kvm_vcpu_events *events)
5142 {
5143 	if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
5144 			      | KVM_VCPUEVENT_VALID_SIPI_VECTOR
5145 			      | KVM_VCPUEVENT_VALID_SHADOW
5146 			      | KVM_VCPUEVENT_VALID_SMM
5147 			      | KVM_VCPUEVENT_VALID_PAYLOAD
5148 			      | KVM_VCPUEVENT_VALID_TRIPLE_FAULT))
5149 		return -EINVAL;
5150 
5151 	if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
5152 		if (!vcpu->kvm->arch.exception_payload_enabled)
5153 			return -EINVAL;
5154 		if (events->exception.pending)
5155 			events->exception.injected = 0;
5156 		else
5157 			events->exception_has_payload = 0;
5158 	} else {
5159 		events->exception.pending = 0;
5160 		events->exception_has_payload = 0;
5161 	}
5162 
5163 	if ((events->exception.injected || events->exception.pending) &&
5164 	    (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR))
5165 		return -EINVAL;
5166 
5167 	/* INITs are latched while in SMM */
5168 	if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
5169 	    (events->smi.smm || events->smi.pending) &&
5170 	    vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
5171 		return -EINVAL;
5172 
5173 	process_nmi(vcpu);
5174 
5175 	/*
5176 	 * Flag that userspace is stuffing an exception, the next KVM_RUN will
5177 	 * morph the exception to a VM-Exit if appropriate.  Do this only for
5178 	 * pending exceptions, already-injected exceptions are not subject to
5179 	 * intercpetion.  Note, userspace that conflates pending and injected
5180 	 * is hosed, and will incorrectly convert an injected exception into a
5181 	 * pending exception, which in turn may cause a spurious VM-Exit.
5182 	 */
5183 	vcpu->arch.exception_from_userspace = events->exception.pending;
5184 
5185 	vcpu->arch.exception_vmexit.pending = false;
5186 
5187 	vcpu->arch.exception.injected = events->exception.injected;
5188 	vcpu->arch.exception.pending = events->exception.pending;
5189 	vcpu->arch.exception.vector = events->exception.nr;
5190 	vcpu->arch.exception.has_error_code = events->exception.has_error_code;
5191 	vcpu->arch.exception.error_code = events->exception.error_code;
5192 	vcpu->arch.exception.has_payload = events->exception_has_payload;
5193 	vcpu->arch.exception.payload = events->exception_payload;
5194 
5195 	vcpu->arch.interrupt.injected = events->interrupt.injected;
5196 	vcpu->arch.interrupt.nr = events->interrupt.nr;
5197 	vcpu->arch.interrupt.soft = events->interrupt.soft;
5198 	if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
5199 		static_call(kvm_x86_set_interrupt_shadow)(vcpu,
5200 						events->interrupt.shadow);
5201 
5202 	vcpu->arch.nmi_injected = events->nmi.injected;
5203 	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
5204 		vcpu->arch.nmi_pending = events->nmi.pending;
5205 	static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked);
5206 
5207 	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
5208 	    lapic_in_kernel(vcpu))
5209 		vcpu->arch.apic->sipi_vector = events->sipi_vector;
5210 
5211 	if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
5212 #ifdef CONFIG_KVM_SMM
5213 		if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
5214 			kvm_leave_nested(vcpu);
5215 			kvm_smm_changed(vcpu, events->smi.smm);
5216 		}
5217 
5218 		vcpu->arch.smi_pending = events->smi.pending;
5219 
5220 		if (events->smi.smm) {
5221 			if (events->smi.smm_inside_nmi)
5222 				vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
5223 			else
5224 				vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
5225 		}
5226 
5227 #else
5228 		if (events->smi.smm || events->smi.pending ||
5229 		    events->smi.smm_inside_nmi)
5230 			return -EINVAL;
5231 #endif
5232 
5233 		if (lapic_in_kernel(vcpu)) {
5234 			if (events->smi.latched_init)
5235 				set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
5236 			else
5237 				clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
5238 		}
5239 	}
5240 
5241 	if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) {
5242 		if (!vcpu->kvm->arch.triple_fault_event)
5243 			return -EINVAL;
5244 		if (events->triple_fault.pending)
5245 			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5246 		else
5247 			kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu);
5248 	}
5249 
5250 	kvm_make_request(KVM_REQ_EVENT, vcpu);
5251 
5252 	return 0;
5253 }
5254 
5255 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
5256 					     struct kvm_debugregs *dbgregs)
5257 {
5258 	unsigned long val;
5259 
5260 	memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
5261 	kvm_get_dr(vcpu, 6, &val);
5262 	dbgregs->dr6 = val;
5263 	dbgregs->dr7 = vcpu->arch.dr7;
5264 	dbgregs->flags = 0;
5265 	memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
5266 }
5267 
5268 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
5269 					    struct kvm_debugregs *dbgregs)
5270 {
5271 	if (dbgregs->flags)
5272 		return -EINVAL;
5273 
5274 	if (!kvm_dr6_valid(dbgregs->dr6))
5275 		return -EINVAL;
5276 	if (!kvm_dr7_valid(dbgregs->dr7))
5277 		return -EINVAL;
5278 
5279 	memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
5280 	kvm_update_dr0123(vcpu);
5281 	vcpu->arch.dr6 = dbgregs->dr6;
5282 	vcpu->arch.dr7 = dbgregs->dr7;
5283 	kvm_update_dr7(vcpu);
5284 
5285 	return 0;
5286 }
5287 
5288 static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
5289 					 struct kvm_xsave *guest_xsave)
5290 {
5291 	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
5292 		return;
5293 
5294 	fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu,
5295 				       guest_xsave->region,
5296 				       sizeof(guest_xsave->region),
5297 				       vcpu->arch.pkru);
5298 }
5299 
5300 static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu,
5301 					  u8 *state, unsigned int size)
5302 {
5303 	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
5304 		return;
5305 
5306 	fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu,
5307 				       state, size, vcpu->arch.pkru);
5308 }
5309 
5310 static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
5311 					struct kvm_xsave *guest_xsave)
5312 {
5313 	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
5314 		return 0;
5315 
5316 	return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
5317 					      guest_xsave->region,
5318 					      kvm_caps.supported_xcr0,
5319 					      &vcpu->arch.pkru);
5320 }
5321 
5322 static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
5323 					struct kvm_xcrs *guest_xcrs)
5324 {
5325 	if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
5326 		guest_xcrs->nr_xcrs = 0;
5327 		return;
5328 	}
5329 
5330 	guest_xcrs->nr_xcrs = 1;
5331 	guest_xcrs->flags = 0;
5332 	guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
5333 	guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
5334 }
5335 
5336 static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
5337 				       struct kvm_xcrs *guest_xcrs)
5338 {
5339 	int i, r = 0;
5340 
5341 	if (!boot_cpu_has(X86_FEATURE_XSAVE))
5342 		return -EINVAL;
5343 
5344 	if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
5345 		return -EINVAL;
5346 
5347 	for (i = 0; i < guest_xcrs->nr_xcrs; i++)
5348 		/* Only support XCR0 currently */
5349 		if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
5350 			r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
5351 				guest_xcrs->xcrs[i].value);
5352 			break;
5353 		}
5354 	if (r)
5355 		r = -EINVAL;
5356 	return r;
5357 }
5358 
5359 /*
5360  * kvm_set_guest_paused() indicates to the guest kernel that it has been
5361  * stopped by the hypervisor.  This function will be called from the host only.
5362  * EINVAL is returned when the host attempts to set the flag for a guest that
5363  * does not support pv clocks.
5364  */
5365 static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
5366 {
5367 	if (!vcpu->arch.pv_time.active)
5368 		return -EINVAL;
5369 	vcpu->arch.pvclock_set_guest_stopped_request = true;
5370 	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
5371 	return 0;
5372 }
5373 
5374 static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu,
5375 				 struct kvm_device_attr *attr)
5376 {
5377 	int r;
5378 
5379 	switch (attr->attr) {
5380 	case KVM_VCPU_TSC_OFFSET:
5381 		r = 0;
5382 		break;
5383 	default:
5384 		r = -ENXIO;
5385 	}
5386 
5387 	return r;
5388 }
5389 
5390 static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
5391 				 struct kvm_device_attr *attr)
5392 {
5393 	u64 __user *uaddr = kvm_get_attr_addr(attr);
5394 	int r;
5395 
5396 	if (IS_ERR(uaddr))
5397 		return PTR_ERR(uaddr);
5398 
5399 	switch (attr->attr) {
5400 	case KVM_VCPU_TSC_OFFSET:
5401 		r = -EFAULT;
5402 		if (put_user(vcpu->arch.l1_tsc_offset, uaddr))
5403 			break;
5404 		r = 0;
5405 		break;
5406 	default:
5407 		r = -ENXIO;
5408 	}
5409 
5410 	return r;
5411 }
5412 
5413 static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
5414 				 struct kvm_device_attr *attr)
5415 {
5416 	u64 __user *uaddr = kvm_get_attr_addr(attr);
5417 	struct kvm *kvm = vcpu->kvm;
5418 	int r;
5419 
5420 	if (IS_ERR(uaddr))
5421 		return PTR_ERR(uaddr);
5422 
5423 	switch (attr->attr) {
5424 	case KVM_VCPU_TSC_OFFSET: {
5425 		u64 offset, tsc, ns;
5426 		unsigned long flags;
5427 		bool matched;
5428 
5429 		r = -EFAULT;
5430 		if (get_user(offset, uaddr))
5431 			break;
5432 
5433 		raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
5434 
5435 		matched = (vcpu->arch.virtual_tsc_khz &&
5436 			   kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz &&
5437 			   kvm->arch.last_tsc_offset == offset);
5438 
5439 		tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
5440 		ns = get_kvmclock_base_ns();
5441 
5442 		__kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched);
5443 		raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
5444 
5445 		r = 0;
5446 		break;
5447 	}
5448 	default:
5449 		r = -ENXIO;
5450 	}
5451 
5452 	return r;
5453 }
5454 
5455 static int kvm_vcpu_ioctl_device_attr(struct kvm_vcpu *vcpu,
5456 				      unsigned int ioctl,
5457 				      void __user *argp)
5458 {
5459 	struct kvm_device_attr attr;
5460 	int r;
5461 
5462 	if (copy_from_user(&attr, argp, sizeof(attr)))
5463 		return -EFAULT;
5464 
5465 	if (attr.group != KVM_VCPU_TSC_CTRL)
5466 		return -ENXIO;
5467 
5468 	switch (ioctl) {
5469 	case KVM_HAS_DEVICE_ATTR:
5470 		r = kvm_arch_tsc_has_attr(vcpu, &attr);
5471 		break;
5472 	case KVM_GET_DEVICE_ATTR:
5473 		r = kvm_arch_tsc_get_attr(vcpu, &attr);
5474 		break;
5475 	case KVM_SET_DEVICE_ATTR:
5476 		r = kvm_arch_tsc_set_attr(vcpu, &attr);
5477 		break;
5478 	}
5479 
5480 	return r;
5481 }
5482 
5483 static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
5484 				     struct kvm_enable_cap *cap)
5485 {
5486 	int r;
5487 	uint16_t vmcs_version;
5488 	void __user *user_ptr;
5489 
5490 	if (cap->flags)
5491 		return -EINVAL;
5492 
5493 	switch (cap->cap) {
5494 	case KVM_CAP_HYPERV_SYNIC2:
5495 		if (cap->args[0])
5496 			return -EINVAL;
5497 		fallthrough;
5498 
5499 	case KVM_CAP_HYPERV_SYNIC:
5500 		if (!irqchip_in_kernel(vcpu->kvm))
5501 			return -EINVAL;
5502 		return kvm_hv_activate_synic(vcpu, cap->cap ==
5503 					     KVM_CAP_HYPERV_SYNIC2);
5504 	case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
5505 		if (!kvm_x86_ops.nested_ops->enable_evmcs)
5506 			return -ENOTTY;
5507 		r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
5508 		if (!r) {
5509 			user_ptr = (void __user *)(uintptr_t)cap->args[0];
5510 			if (copy_to_user(user_ptr, &vmcs_version,
5511 					 sizeof(vmcs_version)))
5512 				r = -EFAULT;
5513 		}
5514 		return r;
5515 	case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
5516 		if (!kvm_x86_ops.enable_l2_tlb_flush)
5517 			return -ENOTTY;
5518 
5519 		return static_call(kvm_x86_enable_l2_tlb_flush)(vcpu);
5520 
5521 	case KVM_CAP_HYPERV_ENFORCE_CPUID:
5522 		return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
5523 
5524 	case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
5525 		vcpu->arch.pv_cpuid.enforce = cap->args[0];
5526 		if (vcpu->arch.pv_cpuid.enforce)
5527 			kvm_update_pv_runtime(vcpu);
5528 
5529 		return 0;
5530 	default:
5531 		return -EINVAL;
5532 	}
5533 }
5534 
5535 long kvm_arch_vcpu_ioctl(struct file *filp,
5536 			 unsigned int ioctl, unsigned long arg)
5537 {
5538 	struct kvm_vcpu *vcpu = filp->private_data;
5539 	void __user *argp = (void __user *)arg;
5540 	int r;
5541 	union {
5542 		struct kvm_sregs2 *sregs2;
5543 		struct kvm_lapic_state *lapic;
5544 		struct kvm_xsave *xsave;
5545 		struct kvm_xcrs *xcrs;
5546 		void *buffer;
5547 	} u;
5548 
5549 	vcpu_load(vcpu);
5550 
5551 	u.buffer = NULL;
5552 	switch (ioctl) {
5553 	case KVM_GET_LAPIC: {
5554 		r = -EINVAL;
5555 		if (!lapic_in_kernel(vcpu))
5556 			goto out;
5557 		u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
5558 				GFP_KERNEL_ACCOUNT);
5559 
5560 		r = -ENOMEM;
5561 		if (!u.lapic)
5562 			goto out;
5563 		r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
5564 		if (r)
5565 			goto out;
5566 		r = -EFAULT;
5567 		if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
5568 			goto out;
5569 		r = 0;
5570 		break;
5571 	}
5572 	case KVM_SET_LAPIC: {
5573 		r = -EINVAL;
5574 		if (!lapic_in_kernel(vcpu))
5575 			goto out;
5576 		u.lapic = memdup_user(argp, sizeof(*u.lapic));
5577 		if (IS_ERR(u.lapic)) {
5578 			r = PTR_ERR(u.lapic);
5579 			goto out_nofree;
5580 		}
5581 
5582 		r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
5583 		break;
5584 	}
5585 	case KVM_INTERRUPT: {
5586 		struct kvm_interrupt irq;
5587 
5588 		r = -EFAULT;
5589 		if (copy_from_user(&irq, argp, sizeof(irq)))
5590 			goto out;
5591 		r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
5592 		break;
5593 	}
5594 	case KVM_NMI: {
5595 		r = kvm_vcpu_ioctl_nmi(vcpu);
5596 		break;
5597 	}
5598 	case KVM_SMI: {
5599 		r = kvm_inject_smi(vcpu);
5600 		break;
5601 	}
5602 	case KVM_SET_CPUID: {
5603 		struct kvm_cpuid __user *cpuid_arg = argp;
5604 		struct kvm_cpuid cpuid;
5605 
5606 		r = -EFAULT;
5607 		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
5608 			goto out;
5609 		r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
5610 		break;
5611 	}
5612 	case KVM_SET_CPUID2: {
5613 		struct kvm_cpuid2 __user *cpuid_arg = argp;
5614 		struct kvm_cpuid2 cpuid;
5615 
5616 		r = -EFAULT;
5617 		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
5618 			goto out;
5619 		r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
5620 					      cpuid_arg->entries);
5621 		break;
5622 	}
5623 	case KVM_GET_CPUID2: {
5624 		struct kvm_cpuid2 __user *cpuid_arg = argp;
5625 		struct kvm_cpuid2 cpuid;
5626 
5627 		r = -EFAULT;
5628 		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
5629 			goto out;
5630 		r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
5631 					      cpuid_arg->entries);
5632 		if (r)
5633 			goto out;
5634 		r = -EFAULT;
5635 		if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
5636 			goto out;
5637 		r = 0;
5638 		break;
5639 	}
5640 	case KVM_GET_MSRS: {
5641 		int idx = srcu_read_lock(&vcpu->kvm->srcu);
5642 		r = msr_io(vcpu, argp, do_get_msr, 1);
5643 		srcu_read_unlock(&vcpu->kvm->srcu, idx);
5644 		break;
5645 	}
5646 	case KVM_SET_MSRS: {
5647 		int idx = srcu_read_lock(&vcpu->kvm->srcu);
5648 		r = msr_io(vcpu, argp, do_set_msr, 0);
5649 		srcu_read_unlock(&vcpu->kvm->srcu, idx);
5650 		break;
5651 	}
5652 	case KVM_TPR_ACCESS_REPORTING: {
5653 		struct kvm_tpr_access_ctl tac;
5654 
5655 		r = -EFAULT;
5656 		if (copy_from_user(&tac, argp, sizeof(tac)))
5657 			goto out;
5658 		r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
5659 		if (r)
5660 			goto out;
5661 		r = -EFAULT;
5662 		if (copy_to_user(argp, &tac, sizeof(tac)))
5663 			goto out;
5664 		r = 0;
5665 		break;
5666 	};
5667 	case KVM_SET_VAPIC_ADDR: {
5668 		struct kvm_vapic_addr va;
5669 		int idx;
5670 
5671 		r = -EINVAL;
5672 		if (!lapic_in_kernel(vcpu))
5673 			goto out;
5674 		r = -EFAULT;
5675 		if (copy_from_user(&va, argp, sizeof(va)))
5676 			goto out;
5677 		idx = srcu_read_lock(&vcpu->kvm->srcu);
5678 		r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
5679 		srcu_read_unlock(&vcpu->kvm->srcu, idx);
5680 		break;
5681 	}
5682 	case KVM_X86_SETUP_MCE: {
5683 		u64 mcg_cap;
5684 
5685 		r = -EFAULT;
5686 		if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap)))
5687 			goto out;
5688 		r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
5689 		break;
5690 	}
5691 	case KVM_X86_SET_MCE: {
5692 		struct kvm_x86_mce mce;
5693 
5694 		r = -EFAULT;
5695 		if (copy_from_user(&mce, argp, sizeof(mce)))
5696 			goto out;
5697 		r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
5698 		break;
5699 	}
5700 	case KVM_GET_VCPU_EVENTS: {
5701 		struct kvm_vcpu_events events;
5702 
5703 		kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
5704 
5705 		r = -EFAULT;
5706 		if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
5707 			break;
5708 		r = 0;
5709 		break;
5710 	}
5711 	case KVM_SET_VCPU_EVENTS: {
5712 		struct kvm_vcpu_events events;
5713 
5714 		r = -EFAULT;
5715 		if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
5716 			break;
5717 
5718 		r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
5719 		break;
5720 	}
5721 	case KVM_GET_DEBUGREGS: {
5722 		struct kvm_debugregs dbgregs;
5723 
5724 		kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
5725 
5726 		r = -EFAULT;
5727 		if (copy_to_user(argp, &dbgregs,
5728 				 sizeof(struct kvm_debugregs)))
5729 			break;
5730 		r = 0;
5731 		break;
5732 	}
5733 	case KVM_SET_DEBUGREGS: {
5734 		struct kvm_debugregs dbgregs;
5735 
5736 		r = -EFAULT;
5737 		if (copy_from_user(&dbgregs, argp,
5738 				   sizeof(struct kvm_debugregs)))
5739 			break;
5740 
5741 		r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
5742 		break;
5743 	}
5744 	case KVM_GET_XSAVE: {
5745 		r = -EINVAL;
5746 		if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave))
5747 			break;
5748 
5749 		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
5750 		r = -ENOMEM;
5751 		if (!u.xsave)
5752 			break;
5753 
5754 		kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
5755 
5756 		r = -EFAULT;
5757 		if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
5758 			break;
5759 		r = 0;
5760 		break;
5761 	}
5762 	case KVM_SET_XSAVE: {
5763 		int size = vcpu->arch.guest_fpu.uabi_size;
5764 
5765 		u.xsave = memdup_user(argp, size);
5766 		if (IS_ERR(u.xsave)) {
5767 			r = PTR_ERR(u.xsave);
5768 			goto out_nofree;
5769 		}
5770 
5771 		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
5772 		break;
5773 	}
5774 
5775 	case KVM_GET_XSAVE2: {
5776 		int size = vcpu->arch.guest_fpu.uabi_size;
5777 
5778 		u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT);
5779 		r = -ENOMEM;
5780 		if (!u.xsave)
5781 			break;
5782 
5783 		kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);
5784 
5785 		r = -EFAULT;
5786 		if (copy_to_user(argp, u.xsave, size))
5787 			break;
5788 
5789 		r = 0;
5790 		break;
5791 	}
5792 
5793 	case KVM_GET_XCRS: {
5794 		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
5795 		r = -ENOMEM;
5796 		if (!u.xcrs)
5797 			break;
5798 
5799 		kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
5800 
5801 		r = -EFAULT;
5802 		if (copy_to_user(argp, u.xcrs,
5803 				 sizeof(struct kvm_xcrs)))
5804 			break;
5805 		r = 0;
5806 		break;
5807 	}
5808 	case KVM_SET_XCRS: {
5809 		u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
5810 		if (IS_ERR(u.xcrs)) {
5811 			r = PTR_ERR(u.xcrs);
5812 			goto out_nofree;
5813 		}
5814 
5815 		r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
5816 		break;
5817 	}
5818 	case KVM_SET_TSC_KHZ: {
5819 		u32 user_tsc_khz;
5820 
5821 		r = -EINVAL;
5822 		user_tsc_khz = (u32)arg;
5823 
5824 		if (kvm_caps.has_tsc_control &&
5825 		    user_tsc_khz >= kvm_caps.max_guest_tsc_khz)
5826 			goto out;
5827 
5828 		if (user_tsc_khz == 0)
5829 			user_tsc_khz = tsc_khz;
5830 
5831 		if (!kvm_set_tsc_khz(vcpu, user_tsc_khz))
5832 			r = 0;
5833 
5834 		goto out;
5835 	}
5836 	case KVM_GET_TSC_KHZ: {
5837 		r = vcpu->arch.virtual_tsc_khz;
5838 		goto out;
5839 	}
5840 	case KVM_KVMCLOCK_CTRL: {
5841 		r = kvm_set_guest_paused(vcpu);
5842 		goto out;
5843 	}
5844 	case KVM_ENABLE_CAP: {
5845 		struct kvm_enable_cap cap;
5846 
5847 		r = -EFAULT;
5848 		if (copy_from_user(&cap, argp, sizeof(cap)))
5849 			goto out;
5850 		r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
5851 		break;
5852 	}
5853 	case KVM_GET_NESTED_STATE: {
5854 		struct kvm_nested_state __user *user_kvm_nested_state = argp;
5855 		u32 user_data_size;
5856 
5857 		r = -EINVAL;
5858 		if (!kvm_x86_ops.nested_ops->get_state)
5859 			break;
5860 
5861 		BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
5862 		r = -EFAULT;
5863 		if (get_user(user_data_size, &user_kvm_nested_state->size))
5864 			break;
5865 
5866 		r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state,
5867 						     user_data_size);
5868 		if (r < 0)
5869 			break;
5870 
5871 		if (r > user_data_size) {
5872 			if (put_user(r, &user_kvm_nested_state->size))
5873 				r = -EFAULT;
5874 			else
5875 				r = -E2BIG;
5876 			break;
5877 		}
5878 
5879 		r = 0;
5880 		break;
5881 	}
5882 	case KVM_SET_NESTED_STATE: {
5883 		struct kvm_nested_state __user *user_kvm_nested_state = argp;
5884 		struct kvm_nested_state kvm_state;
5885 		int idx;
5886 
5887 		r = -EINVAL;
5888 		if (!kvm_x86_ops.nested_ops->set_state)
5889 			break;
5890 
5891 		r = -EFAULT;
5892 		if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state)))
5893 			break;
5894 
5895 		r = -EINVAL;
5896 		if (kvm_state.size < sizeof(kvm_state))
5897 			break;
5898 
5899 		if (kvm_state.flags &
5900 		    ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
5901 		      | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING
5902 		      | KVM_STATE_NESTED_GIF_SET))
5903 			break;
5904 
5905 		/* nested_run_pending implies guest_mode.  */
5906 		if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING)
5907 		    && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
5908 			break;
5909 
5910 		idx = srcu_read_lock(&vcpu->kvm->srcu);
5911 		r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state);
5912 		srcu_read_unlock(&vcpu->kvm->srcu, idx);
5913 		break;
5914 	}
5915 	case KVM_GET_SUPPORTED_HV_CPUID:
5916 		r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp);
5917 		break;
5918 #ifdef CONFIG_KVM_XEN
5919 	case KVM_XEN_VCPU_GET_ATTR: {
5920 		struct kvm_xen_vcpu_attr xva;
5921 
5922 		r = -EFAULT;
5923 		if (copy_from_user(&xva, argp, sizeof(xva)))
5924 			goto out;
5925 		r = kvm_xen_vcpu_get_attr(vcpu, &xva);
5926 		if (!r && copy_to_user(argp, &xva, sizeof(xva)))
5927 			r = -EFAULT;
5928 		break;
5929 	}
5930 	case KVM_XEN_VCPU_SET_ATTR: {
5931 		struct kvm_xen_vcpu_attr xva;
5932 
5933 		r = -EFAULT;
5934 		if (copy_from_user(&xva, argp, sizeof(xva)))
5935 			goto out;
5936 		r = kvm_xen_vcpu_set_attr(vcpu, &xva);
5937 		break;
5938 	}
5939 #endif
5940 	case KVM_GET_SREGS2: {
5941 		u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL);
5942 		r = -ENOMEM;
5943 		if (!u.sregs2)
5944 			goto out;
5945 		__get_sregs2(vcpu, u.sregs2);
5946 		r = -EFAULT;
5947 		if (copy_to_user(argp, u.sregs2, sizeof(struct kvm_sregs2)))
5948 			goto out;
5949 		r = 0;
5950 		break;
5951 	}
5952 	case KVM_SET_SREGS2: {
5953 		u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2));
5954 		if (IS_ERR(u.sregs2)) {
5955 			r = PTR_ERR(u.sregs2);
5956 			u.sregs2 = NULL;
5957 			goto out;
5958 		}
5959 		r = __set_sregs2(vcpu, u.sregs2);
5960 		break;
5961 	}
5962 	case KVM_HAS_DEVICE_ATTR:
5963 	case KVM_GET_DEVICE_ATTR:
5964 	case KVM_SET_DEVICE_ATTR:
5965 		r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp);
5966 		break;
5967 	default:
5968 		r = -EINVAL;
5969 	}
5970 out:
5971 	kfree(u.buffer);
5972 out_nofree:
5973 	vcpu_put(vcpu);
5974 	return r;
5975 }
5976 
5977 vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
5978 {
5979 	return VM_FAULT_SIGBUS;
5980 }
5981 
5982 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
5983 {
5984 	int ret;
5985 
5986 	if (addr > (unsigned int)(-3 * PAGE_SIZE))
5987 		return -EINVAL;
5988 	ret = static_call(kvm_x86_set_tss_addr)(kvm, addr);
5989 	return ret;
5990 }
5991 
5992 static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
5993 					      u64 ident_addr)
5994 {
5995 	return static_call(kvm_x86_set_identity_map_addr)(kvm, ident_addr);
5996 }
5997 
5998 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
5999 					 unsigned long kvm_nr_mmu_pages)
6000 {
6001 	if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
6002 		return -EINVAL;
6003 
6004 	mutex_lock(&kvm->slots_lock);
6005 
6006 	kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
6007 	kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
6008 
6009 	mutex_unlock(&kvm->slots_lock);
6010 	return 0;
6011 }
6012 
6013 static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
6014 {
6015 	return kvm->arch.n_max_mmu_pages;
6016 }
6017 
6018 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
6019 {
6020 	struct kvm_pic *pic = kvm->arch.vpic;
6021 	int r;
6022 
6023 	r = 0;
6024 	switch (chip->chip_id) {
6025 	case KVM_IRQCHIP_PIC_MASTER:
6026 		memcpy(&chip->chip.pic, &pic->pics[0],
6027 			sizeof(struct kvm_pic_state));
6028 		break;
6029 	case KVM_IRQCHIP_PIC_SLAVE:
6030 		memcpy(&chip->chip.pic, &pic->pics[1],
6031 			sizeof(struct kvm_pic_state));
6032 		break;
6033 	case KVM_IRQCHIP_IOAPIC:
6034 		kvm_get_ioapic(kvm, &chip->chip.ioapic);
6035 		break;
6036 	default:
6037 		r = -EINVAL;
6038 		break;
6039 	}
6040 	return r;
6041 }
6042 
6043 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
6044 {
6045 	struct kvm_pic *pic = kvm->arch.vpic;
6046 	int r;
6047 
6048 	r = 0;
6049 	switch (chip->chip_id) {
6050 	case KVM_IRQCHIP_PIC_MASTER:
6051 		spin_lock(&pic->lock);
6052 		memcpy(&pic->pics[0], &chip->chip.pic,
6053 			sizeof(struct kvm_pic_state));
6054 		spin_unlock(&pic->lock);
6055 		break;
6056 	case KVM_IRQCHIP_PIC_SLAVE:
6057 		spin_lock(&pic->lock);
6058 		memcpy(&pic->pics[1], &chip->chip.pic,
6059 			sizeof(struct kvm_pic_state));
6060 		spin_unlock(&pic->lock);
6061 		break;
6062 	case KVM_IRQCHIP_IOAPIC:
6063 		kvm_set_ioapic(kvm, &chip->chip.ioapic);
6064 		break;
6065 	default:
6066 		r = -EINVAL;
6067 		break;
6068 	}
6069 	kvm_pic_update_irq(pic);
6070 	return r;
6071 }
6072 
6073 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
6074 {
6075 	struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
6076 
6077 	BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
6078 
6079 	mutex_lock(&kps->lock);
6080 	memcpy(ps, &kps->channels, sizeof(*ps));
6081 	mutex_unlock(&kps->lock);
6082 	return 0;
6083 }
6084 
6085 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
6086 {
6087 	int i;
6088 	struct kvm_pit *pit = kvm->arch.vpit;
6089 
6090 	mutex_lock(&pit->pit_state.lock);
6091 	memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
6092 	for (i = 0; i < 3; i++)
6093 		kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
6094 	mutex_unlock(&pit->pit_state.lock);
6095 	return 0;
6096 }
6097 
6098 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
6099 {
6100 	mutex_lock(&kvm->arch.vpit->pit_state.lock);
6101 	memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
6102 		sizeof(ps->channels));
6103 	ps->flags = kvm->arch.vpit->pit_state.flags;
6104 	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
6105 	memset(&ps->reserved, 0, sizeof(ps->reserved));
6106 	return 0;
6107 }
6108 
6109 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
6110 {
6111 	int start = 0;
6112 	int i;
6113 	u32 prev_legacy, cur_legacy;
6114 	struct kvm_pit *pit = kvm->arch.vpit;
6115 
6116 	mutex_lock(&pit->pit_state.lock);
6117 	prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
6118 	cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
6119 	if (!prev_legacy && cur_legacy)
6120 		start = 1;
6121 	memcpy(&pit->pit_state.channels, &ps->channels,
6122 	       sizeof(pit->pit_state.channels));
6123 	pit->pit_state.flags = ps->flags;
6124 	for (i = 0; i < 3; i++)
6125 		kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
6126 				   start && i == 0);
6127 	mutex_unlock(&pit->pit_state.lock);
6128 	return 0;
6129 }
6130 
6131 static int kvm_vm_ioctl_reinject(struct kvm *kvm,
6132 				 struct kvm_reinject_control *control)
6133 {
6134 	struct kvm_pit *pit = kvm->arch.vpit;
6135 
6136 	/* pit->pit_state.lock was overloaded to prevent userspace from getting
6137 	 * an inconsistent state after running multiple KVM_REINJECT_CONTROL
6138 	 * ioctls in parallel.  Use a separate lock if that ioctl isn't rare.
6139 	 */
6140 	mutex_lock(&pit->pit_state.lock);
6141 	kvm_pit_set_reinject(pit, control->pit_reinject);
6142 	mutex_unlock(&pit->pit_state.lock);
6143 
6144 	return 0;
6145 }
6146 
6147 void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
6148 {
6149 
6150 	/*
6151 	 * Flush all CPUs' dirty log buffers to the  dirty_bitmap.  Called
6152 	 * before reporting dirty_bitmap to userspace.  KVM flushes the buffers
6153 	 * on all VM-Exits, thus we only need to kick running vCPUs to force a
6154 	 * VM-Exit.
6155 	 */
6156 	struct kvm_vcpu *vcpu;
6157 	unsigned long i;
6158 
6159 	kvm_for_each_vcpu(i, vcpu, kvm)
6160 		kvm_vcpu_kick(vcpu);
6161 }
6162 
6163 int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
6164 			bool line_status)
6165 {
6166 	if (!irqchip_in_kernel(kvm))
6167 		return -ENXIO;
6168 
6169 	irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
6170 					irq_event->irq, irq_event->level,
6171 					line_status);
6172 	return 0;
6173 }
6174 
6175 int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
6176 			    struct kvm_enable_cap *cap)
6177 {
6178 	int r;
6179 
6180 	if (cap->flags)
6181 		return -EINVAL;
6182 
6183 	switch (cap->cap) {
6184 	case KVM_CAP_DISABLE_QUIRKS2:
6185 		r = -EINVAL;
6186 		if (cap->args[0] & ~KVM_X86_VALID_QUIRKS)
6187 			break;
6188 		fallthrough;
6189 	case KVM_CAP_DISABLE_QUIRKS:
6190 		kvm->arch.disabled_quirks = cap->args[0];
6191 		r = 0;
6192 		break;
6193 	case KVM_CAP_SPLIT_IRQCHIP: {
6194 		mutex_lock(&kvm->lock);
6195 		r = -EINVAL;
6196 		if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
6197 			goto split_irqchip_unlock;
6198 		r = -EEXIST;
6199 		if (irqchip_in_kernel(kvm))
6200 			goto split_irqchip_unlock;
6201 		if (kvm->created_vcpus)
6202 			goto split_irqchip_unlock;
6203 		r = kvm_setup_empty_irq_routing(kvm);
6204 		if (r)
6205 			goto split_irqchip_unlock;
6206 		/* Pairs with irqchip_in_kernel. */
6207 		smp_wmb();
6208 		kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
6209 		kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
6210 		kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT);
6211 		r = 0;
6212 split_irqchip_unlock:
6213 		mutex_unlock(&kvm->lock);
6214 		break;
6215 	}
6216 	case KVM_CAP_X2APIC_API:
6217 		r = -EINVAL;
6218 		if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
6219 			break;
6220 
6221 		if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
6222 			kvm->arch.x2apic_format = true;
6223 		if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
6224 			kvm->arch.x2apic_broadcast_quirk_disabled = true;
6225 
6226 		r = 0;
6227 		break;
6228 	case KVM_CAP_X86_DISABLE_EXITS:
6229 		r = -EINVAL;
6230 		if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS)
6231 			break;
6232 
6233 		if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
6234 			kvm_can_mwait_in_guest())
6235 			kvm->arch.mwait_in_guest = true;
6236 		if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
6237 			kvm->arch.hlt_in_guest = true;
6238 		if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
6239 			kvm->arch.pause_in_guest = true;
6240 		if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
6241 			kvm->arch.cstate_in_guest = true;
6242 		r = 0;
6243 		break;
6244 	case KVM_CAP_MSR_PLATFORM_INFO:
6245 		kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
6246 		r = 0;
6247 		break;
6248 	case KVM_CAP_EXCEPTION_PAYLOAD:
6249 		kvm->arch.exception_payload_enabled = cap->args[0];
6250 		r = 0;
6251 		break;
6252 	case KVM_CAP_X86_TRIPLE_FAULT_EVENT:
6253 		kvm->arch.triple_fault_event = cap->args[0];
6254 		r = 0;
6255 		break;
6256 	case KVM_CAP_X86_USER_SPACE_MSR:
6257 		r = -EINVAL;
6258 		if (cap->args[0] & ~KVM_MSR_EXIT_REASON_VALID_MASK)
6259 			break;
6260 		kvm->arch.user_space_msr_mask = cap->args[0];
6261 		r = 0;
6262 		break;
6263 	case KVM_CAP_X86_BUS_LOCK_EXIT:
6264 		r = -EINVAL;
6265 		if (cap->args[0] & ~KVM_BUS_LOCK_DETECTION_VALID_MODE)
6266 			break;
6267 
6268 		if ((cap->args[0] & KVM_BUS_LOCK_DETECTION_OFF) &&
6269 		    (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT))
6270 			break;
6271 
6272 		if (kvm_caps.has_bus_lock_exit &&
6273 		    cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)
6274 			kvm->arch.bus_lock_detection_enabled = true;
6275 		r = 0;
6276 		break;
6277 #ifdef CONFIG_X86_SGX_KVM
6278 	case KVM_CAP_SGX_ATTRIBUTE: {
6279 		unsigned long allowed_attributes = 0;
6280 
6281 		r = sgx_set_attribute(&allowed_attributes, cap->args[0]);
6282 		if (r)
6283 			break;
6284 
6285 		/* KVM only supports the PROVISIONKEY privileged attribute. */
6286 		if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) &&
6287 		    !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY))
6288 			kvm->arch.sgx_provisioning_allowed = true;
6289 		else
6290 			r = -EINVAL;
6291 		break;
6292 	}
6293 #endif
6294 	case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
6295 		r = -EINVAL;
6296 		if (!kvm_x86_ops.vm_copy_enc_context_from)
6297 			break;
6298 
6299 		r = static_call(kvm_x86_vm_copy_enc_context_from)(kvm, cap->args[0]);
6300 		break;
6301 	case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
6302 		r = -EINVAL;
6303 		if (!kvm_x86_ops.vm_move_enc_context_from)
6304 			break;
6305 
6306 		r = static_call(kvm_x86_vm_move_enc_context_from)(kvm, cap->args[0]);
6307 		break;
6308 	case KVM_CAP_EXIT_HYPERCALL:
6309 		if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) {
6310 			r = -EINVAL;
6311 			break;
6312 		}
6313 		kvm->arch.hypercall_exit_enabled = cap->args[0];
6314 		r = 0;
6315 		break;
6316 	case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
6317 		r = -EINVAL;
6318 		if (cap->args[0] & ~1)
6319 			break;
6320 		kvm->arch.exit_on_emulation_error = cap->args[0];
6321 		r = 0;
6322 		break;
6323 	case KVM_CAP_PMU_CAPABILITY:
6324 		r = -EINVAL;
6325 		if (!enable_pmu || (cap->args[0] & ~KVM_CAP_PMU_VALID_MASK))
6326 			break;
6327 
6328 		mutex_lock(&kvm->lock);
6329 		if (!kvm->created_vcpus) {
6330 			kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE);
6331 			r = 0;
6332 		}
6333 		mutex_unlock(&kvm->lock);
6334 		break;
6335 	case KVM_CAP_MAX_VCPU_ID:
6336 		r = -EINVAL;
6337 		if (cap->args[0] > KVM_MAX_VCPU_IDS)
6338 			break;
6339 
6340 		mutex_lock(&kvm->lock);
6341 		if (kvm->arch.max_vcpu_ids == cap->args[0]) {
6342 			r = 0;
6343 		} else if (!kvm->arch.max_vcpu_ids) {
6344 			kvm->arch.max_vcpu_ids = cap->args[0];
6345 			r = 0;
6346 		}
6347 		mutex_unlock(&kvm->lock);
6348 		break;
6349 	case KVM_CAP_X86_NOTIFY_VMEXIT:
6350 		r = -EINVAL;
6351 		if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS)
6352 			break;
6353 		if (!kvm_caps.has_notify_vmexit)
6354 			break;
6355 		if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED))
6356 			break;
6357 		mutex_lock(&kvm->lock);
6358 		if (!kvm->created_vcpus) {
6359 			kvm->arch.notify_window = cap->args[0] >> 32;
6360 			kvm->arch.notify_vmexit_flags = (u32)cap->args[0];
6361 			r = 0;
6362 		}
6363 		mutex_unlock(&kvm->lock);
6364 		break;
6365 	case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES:
6366 		r = -EINVAL;
6367 
6368 		/*
6369 		 * Since the risk of disabling NX hugepages is a guest crashing
6370 		 * the system, ensure the userspace process has permission to
6371 		 * reboot the system.
6372 		 *
6373 		 * Note that unlike the reboot() syscall, the process must have
6374 		 * this capability in the root namespace because exposing
6375 		 * /dev/kvm into a container does not limit the scope of the
6376 		 * iTLB multihit bug to that container. In other words,
6377 		 * this must use capable(), not ns_capable().
6378 		 */
6379 		if (!capable(CAP_SYS_BOOT)) {
6380 			r = -EPERM;
6381 			break;
6382 		}
6383 
6384 		if (cap->args[0])
6385 			break;
6386 
6387 		mutex_lock(&kvm->lock);
6388 		if (!kvm->created_vcpus) {
6389 			kvm->arch.disable_nx_huge_pages = true;
6390 			r = 0;
6391 		}
6392 		mutex_unlock(&kvm->lock);
6393 		break;
6394 	default:
6395 		r = -EINVAL;
6396 		break;
6397 	}
6398 	return r;
6399 }
6400 
6401 static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow)
6402 {
6403 	struct kvm_x86_msr_filter *msr_filter;
6404 
6405 	msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT);
6406 	if (!msr_filter)
6407 		return NULL;
6408 
6409 	msr_filter->default_allow = default_allow;
6410 	return msr_filter;
6411 }
6412 
6413 static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
6414 {
6415 	u32 i;
6416 
6417 	if (!msr_filter)
6418 		return;
6419 
6420 	for (i = 0; i < msr_filter->count; i++)
6421 		kfree(msr_filter->ranges[i].bitmap);
6422 
6423 	kfree(msr_filter);
6424 }
6425 
6426 static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
6427 			      struct kvm_msr_filter_range *user_range)
6428 {
6429 	unsigned long *bitmap = NULL;
6430 	size_t bitmap_size;
6431 
6432 	if (!user_range->nmsrs)
6433 		return 0;
6434 
6435 	if (user_range->flags & ~KVM_MSR_FILTER_RANGE_VALID_MASK)
6436 		return -EINVAL;
6437 
6438 	if (!user_range->flags)
6439 		return -EINVAL;
6440 
6441 	bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
6442 	if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
6443 		return -EINVAL;
6444 
6445 	bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size);
6446 	if (IS_ERR(bitmap))
6447 		return PTR_ERR(bitmap);
6448 
6449 	msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) {
6450 		.flags = user_range->flags,
6451 		.base = user_range->base,
6452 		.nmsrs = user_range->nmsrs,
6453 		.bitmap = bitmap,
6454 	};
6455 
6456 	msr_filter->count++;
6457 	return 0;
6458 }
6459 
6460 static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm,
6461 				       struct kvm_msr_filter *filter)
6462 {
6463 	struct kvm_x86_msr_filter *new_filter, *old_filter;
6464 	bool default_allow;
6465 	bool empty = true;
6466 	int r = 0;
6467 	u32 i;
6468 
6469 	if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK)
6470 		return -EINVAL;
6471 
6472 	for (i = 0; i < ARRAY_SIZE(filter->ranges); i++)
6473 		empty &= !filter->ranges[i].nmsrs;
6474 
6475 	default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY);
6476 	if (empty && !default_allow)
6477 		return -EINVAL;
6478 
6479 	new_filter = kvm_alloc_msr_filter(default_allow);
6480 	if (!new_filter)
6481 		return -ENOMEM;
6482 
6483 	for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) {
6484 		r = kvm_add_msr_filter(new_filter, &filter->ranges[i]);
6485 		if (r) {
6486 			kvm_free_msr_filter(new_filter);
6487 			return r;
6488 		}
6489 	}
6490 
6491 	mutex_lock(&kvm->lock);
6492 
6493 	/* The per-VM filter is protected by kvm->lock... */
6494 	old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1);
6495 
6496 	rcu_assign_pointer(kvm->arch.msr_filter, new_filter);
6497 	synchronize_srcu(&kvm->srcu);
6498 
6499 	kvm_free_msr_filter(old_filter);
6500 
6501 	kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
6502 	mutex_unlock(&kvm->lock);
6503 
6504 	return 0;
6505 }
6506 
6507 #ifdef CONFIG_KVM_COMPAT
6508 /* for KVM_X86_SET_MSR_FILTER */
6509 struct kvm_msr_filter_range_compat {
6510 	__u32 flags;
6511 	__u32 nmsrs;
6512 	__u32 base;
6513 	__u32 bitmap;
6514 };
6515 
6516 struct kvm_msr_filter_compat {
6517 	__u32 flags;
6518 	struct kvm_msr_filter_range_compat ranges[KVM_MSR_FILTER_MAX_RANGES];
6519 };
6520 
6521 #define KVM_X86_SET_MSR_FILTER_COMPAT _IOW(KVMIO, 0xc6, struct kvm_msr_filter_compat)
6522 
6523 long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
6524 			      unsigned long arg)
6525 {
6526 	void __user *argp = (void __user *)arg;
6527 	struct kvm *kvm = filp->private_data;
6528 	long r = -ENOTTY;
6529 
6530 	switch (ioctl) {
6531 	case KVM_X86_SET_MSR_FILTER_COMPAT: {
6532 		struct kvm_msr_filter __user *user_msr_filter = argp;
6533 		struct kvm_msr_filter_compat filter_compat;
6534 		struct kvm_msr_filter filter;
6535 		int i;
6536 
6537 		if (copy_from_user(&filter_compat, user_msr_filter,
6538 				   sizeof(filter_compat)))
6539 			return -EFAULT;
6540 
6541 		filter.flags = filter_compat.flags;
6542 		for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
6543 			struct kvm_msr_filter_range_compat *cr;
6544 
6545 			cr = &filter_compat.ranges[i];
6546 			filter.ranges[i] = (struct kvm_msr_filter_range) {
6547 				.flags = cr->flags,
6548 				.nmsrs = cr->nmsrs,
6549 				.base = cr->base,
6550 				.bitmap = (__u8 *)(ulong)cr->bitmap,
6551 			};
6552 		}
6553 
6554 		r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
6555 		break;
6556 	}
6557 	}
6558 
6559 	return r;
6560 }
6561 #endif
6562 
6563 #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
6564 static int kvm_arch_suspend_notifier(struct kvm *kvm)
6565 {
6566 	struct kvm_vcpu *vcpu;
6567 	unsigned long i;
6568 	int ret = 0;
6569 
6570 	mutex_lock(&kvm->lock);
6571 	kvm_for_each_vcpu(i, vcpu, kvm) {
6572 		if (!vcpu->arch.pv_time.active)
6573 			continue;
6574 
6575 		ret = kvm_set_guest_paused(vcpu);
6576 		if (ret) {
6577 			kvm_err("Failed to pause guest VCPU%d: %d\n",
6578 				vcpu->vcpu_id, ret);
6579 			break;
6580 		}
6581 	}
6582 	mutex_unlock(&kvm->lock);
6583 
6584 	return ret ? NOTIFY_BAD : NOTIFY_DONE;
6585 }
6586 
6587 int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
6588 {
6589 	switch (state) {
6590 	case PM_HIBERNATION_PREPARE:
6591 	case PM_SUSPEND_PREPARE:
6592 		return kvm_arch_suspend_notifier(kvm);
6593 	}
6594 
6595 	return NOTIFY_DONE;
6596 }
6597 #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
6598 
6599 static int kvm_vm_ioctl_get_clock(struct kvm *kvm, void __user *argp)
6600 {
6601 	struct kvm_clock_data data = { 0 };
6602 
6603 	get_kvmclock(kvm, &data);
6604 	if (copy_to_user(argp, &data, sizeof(data)))
6605 		return -EFAULT;
6606 
6607 	return 0;
6608 }
6609 
6610 static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp)
6611 {
6612 	struct kvm_arch *ka = &kvm->arch;
6613 	struct kvm_clock_data data;
6614 	u64 now_raw_ns;
6615 
6616 	if (copy_from_user(&data, argp, sizeof(data)))
6617 		return -EFAULT;
6618 
6619 	/*
6620 	 * Only KVM_CLOCK_REALTIME is used, but allow passing the
6621 	 * result of KVM_GET_CLOCK back to KVM_SET_CLOCK.
6622 	 */
6623 	if (data.flags & ~KVM_CLOCK_VALID_FLAGS)
6624 		return -EINVAL;
6625 
6626 	kvm_hv_request_tsc_page_update(kvm);
6627 	kvm_start_pvclock_update(kvm);
6628 	pvclock_update_vm_gtod_copy(kvm);
6629 
6630 	/*
6631 	 * This pairs with kvm_guest_time_update(): when masterclock is
6632 	 * in use, we use master_kernel_ns + kvmclock_offset to set
6633 	 * unsigned 'system_time' so if we use get_kvmclock_ns() (which
6634 	 * is slightly ahead) here we risk going negative on unsigned
6635 	 * 'system_time' when 'data.clock' is very small.
6636 	 */
6637 	if (data.flags & KVM_CLOCK_REALTIME) {
6638 		u64 now_real_ns = ktime_get_real_ns();
6639 
6640 		/*
6641 		 * Avoid stepping the kvmclock backwards.
6642 		 */
6643 		if (now_real_ns > data.realtime)
6644 			data.clock += now_real_ns - data.realtime;
6645 	}
6646 
6647 	if (ka->use_master_clock)
6648 		now_raw_ns = ka->master_kernel_ns;
6649 	else
6650 		now_raw_ns = get_kvmclock_base_ns();
6651 	ka->kvmclock_offset = data.clock - now_raw_ns;
6652 	kvm_end_pvclock_update(kvm);
6653 	return 0;
6654 }
6655 
6656 long kvm_arch_vm_ioctl(struct file *filp,
6657 		       unsigned int ioctl, unsigned long arg)
6658 {
6659 	struct kvm *kvm = filp->private_data;
6660 	void __user *argp = (void __user *)arg;
6661 	int r = -ENOTTY;
6662 	/*
6663 	 * This union makes it completely explicit to gcc-3.x
6664 	 * that these two variables' stack usage should be
6665 	 * combined, not added together.
6666 	 */
6667 	union {
6668 		struct kvm_pit_state ps;
6669 		struct kvm_pit_state2 ps2;
6670 		struct kvm_pit_config pit_config;
6671 	} u;
6672 
6673 	switch (ioctl) {
6674 	case KVM_SET_TSS_ADDR:
6675 		r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
6676 		break;
6677 	case KVM_SET_IDENTITY_MAP_ADDR: {
6678 		u64 ident_addr;
6679 
6680 		mutex_lock(&kvm->lock);
6681 		r = -EINVAL;
6682 		if (kvm->created_vcpus)
6683 			goto set_identity_unlock;
6684 		r = -EFAULT;
6685 		if (copy_from_user(&ident_addr, argp, sizeof(ident_addr)))
6686 			goto set_identity_unlock;
6687 		r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
6688 set_identity_unlock:
6689 		mutex_unlock(&kvm->lock);
6690 		break;
6691 	}
6692 	case KVM_SET_NR_MMU_PAGES:
6693 		r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
6694 		break;
6695 	case KVM_GET_NR_MMU_PAGES:
6696 		r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
6697 		break;
6698 	case KVM_CREATE_IRQCHIP: {
6699 		mutex_lock(&kvm->lock);
6700 
6701 		r = -EEXIST;
6702 		if (irqchip_in_kernel(kvm))
6703 			goto create_irqchip_unlock;
6704 
6705 		r = -EINVAL;
6706 		if (kvm->created_vcpus)
6707 			goto create_irqchip_unlock;
6708 
6709 		r = kvm_pic_init(kvm);
6710 		if (r)
6711 			goto create_irqchip_unlock;
6712 
6713 		r = kvm_ioapic_init(kvm);
6714 		if (r) {
6715 			kvm_pic_destroy(kvm);
6716 			goto create_irqchip_unlock;
6717 		}
6718 
6719 		r = kvm_setup_default_irq_routing(kvm);
6720 		if (r) {
6721 			kvm_ioapic_destroy(kvm);
6722 			kvm_pic_destroy(kvm);
6723 			goto create_irqchip_unlock;
6724 		}
6725 		/* Write kvm->irq_routing before enabling irqchip_in_kernel. */
6726 		smp_wmb();
6727 		kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
6728 		kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT);
6729 	create_irqchip_unlock:
6730 		mutex_unlock(&kvm->lock);
6731 		break;
6732 	}
6733 	case KVM_CREATE_PIT:
6734 		u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
6735 		goto create_pit;
6736 	case KVM_CREATE_PIT2:
6737 		r = -EFAULT;
6738 		if (copy_from_user(&u.pit_config, argp,
6739 				   sizeof(struct kvm_pit_config)))
6740 			goto out;
6741 	create_pit:
6742 		mutex_lock(&kvm->lock);
6743 		r = -EEXIST;
6744 		if (kvm->arch.vpit)
6745 			goto create_pit_unlock;
6746 		r = -ENOMEM;
6747 		kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
6748 		if (kvm->arch.vpit)
6749 			r = 0;
6750 	create_pit_unlock:
6751 		mutex_unlock(&kvm->lock);
6752 		break;
6753 	case KVM_GET_IRQCHIP: {
6754 		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
6755 		struct kvm_irqchip *chip;
6756 
6757 		chip = memdup_user(argp, sizeof(*chip));
6758 		if (IS_ERR(chip)) {
6759 			r = PTR_ERR(chip);
6760 			goto out;
6761 		}
6762 
6763 		r = -ENXIO;
6764 		if (!irqchip_kernel(kvm))
6765 			goto get_irqchip_out;
6766 		r = kvm_vm_ioctl_get_irqchip(kvm, chip);
6767 		if (r)
6768 			goto get_irqchip_out;
6769 		r = -EFAULT;
6770 		if (copy_to_user(argp, chip, sizeof(*chip)))
6771 			goto get_irqchip_out;
6772 		r = 0;
6773 	get_irqchip_out:
6774 		kfree(chip);
6775 		break;
6776 	}
6777 	case KVM_SET_IRQCHIP: {
6778 		/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
6779 		struct kvm_irqchip *chip;
6780 
6781 		chip = memdup_user(argp, sizeof(*chip));
6782 		if (IS_ERR(chip)) {
6783 			r = PTR_ERR(chip);
6784 			goto out;
6785 		}
6786 
6787 		r = -ENXIO;
6788 		if (!irqchip_kernel(kvm))
6789 			goto set_irqchip_out;
6790 		r = kvm_vm_ioctl_set_irqchip(kvm, chip);
6791 	set_irqchip_out:
6792 		kfree(chip);
6793 		break;
6794 	}
6795 	case KVM_GET_PIT: {
6796 		r = -EFAULT;
6797 		if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
6798 			goto out;
6799 		r = -ENXIO;
6800 		if (!kvm->arch.vpit)
6801 			goto out;
6802 		r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
6803 		if (r)
6804 			goto out;
6805 		r = -EFAULT;
6806 		if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
6807 			goto out;
6808 		r = 0;
6809 		break;
6810 	}
6811 	case KVM_SET_PIT: {
6812 		r = -EFAULT;
6813 		if (copy_from_user(&u.ps, argp, sizeof(u.ps)))
6814 			goto out;
6815 		mutex_lock(&kvm->lock);
6816 		r = -ENXIO;
6817 		if (!kvm->arch.vpit)
6818 			goto set_pit_out;
6819 		r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
6820 set_pit_out:
6821 		mutex_unlock(&kvm->lock);
6822 		break;
6823 	}
6824 	case KVM_GET_PIT2: {
6825 		r = -ENXIO;
6826 		if (!kvm->arch.vpit)
6827 			goto out;
6828 		r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
6829 		if (r)
6830 			goto out;
6831 		r = -EFAULT;
6832 		if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
6833 			goto out;
6834 		r = 0;
6835 		break;
6836 	}
6837 	case KVM_SET_PIT2: {
6838 		r = -EFAULT;
6839 		if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
6840 			goto out;
6841 		mutex_lock(&kvm->lock);
6842 		r = -ENXIO;
6843 		if (!kvm->arch.vpit)
6844 			goto set_pit2_out;
6845 		r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
6846 set_pit2_out:
6847 		mutex_unlock(&kvm->lock);
6848 		break;
6849 	}
6850 	case KVM_REINJECT_CONTROL: {
6851 		struct kvm_reinject_control control;
6852 		r =  -EFAULT;
6853 		if (copy_from_user(&control, argp, sizeof(control)))
6854 			goto out;
6855 		r = -ENXIO;
6856 		if (!kvm->arch.vpit)
6857 			goto out;
6858 		r = kvm_vm_ioctl_reinject(kvm, &control);
6859 		break;
6860 	}
6861 	case KVM_SET_BOOT_CPU_ID:
6862 		r = 0;
6863 		mutex_lock(&kvm->lock);
6864 		if (kvm->created_vcpus)
6865 			r = -EBUSY;
6866 		else
6867 			kvm->arch.bsp_vcpu_id = arg;
6868 		mutex_unlock(&kvm->lock);
6869 		break;
6870 #ifdef CONFIG_KVM_XEN
6871 	case KVM_XEN_HVM_CONFIG: {
6872 		struct kvm_xen_hvm_config xhc;
6873 		r = -EFAULT;
6874 		if (copy_from_user(&xhc, argp, sizeof(xhc)))
6875 			goto out;
6876 		r = kvm_xen_hvm_config(kvm, &xhc);
6877 		break;
6878 	}
6879 	case KVM_XEN_HVM_GET_ATTR: {
6880 		struct kvm_xen_hvm_attr xha;
6881 
6882 		r = -EFAULT;
6883 		if (copy_from_user(&xha, argp, sizeof(xha)))
6884 			goto out;
6885 		r = kvm_xen_hvm_get_attr(kvm, &xha);
6886 		if (!r && copy_to_user(argp, &xha, sizeof(xha)))
6887 			r = -EFAULT;
6888 		break;
6889 	}
6890 	case KVM_XEN_HVM_SET_ATTR: {
6891 		struct kvm_xen_hvm_attr xha;
6892 
6893 		r = -EFAULT;
6894 		if (copy_from_user(&xha, argp, sizeof(xha)))
6895 			goto out;
6896 		r = kvm_xen_hvm_set_attr(kvm, &xha);
6897 		break;
6898 	}
6899 	case KVM_XEN_HVM_EVTCHN_SEND: {
6900 		struct kvm_irq_routing_xen_evtchn uxe;
6901 
6902 		r = -EFAULT;
6903 		if (copy_from_user(&uxe, argp, sizeof(uxe)))
6904 			goto out;
6905 		r = kvm_xen_hvm_evtchn_send(kvm, &uxe);
6906 		break;
6907 	}
6908 #endif
6909 	case KVM_SET_CLOCK:
6910 		r = kvm_vm_ioctl_set_clock(kvm, argp);
6911 		break;
6912 	case KVM_GET_CLOCK:
6913 		r = kvm_vm_ioctl_get_clock(kvm, argp);
6914 		break;
6915 	case KVM_SET_TSC_KHZ: {
6916 		u32 user_tsc_khz;
6917 
6918 		r = -EINVAL;
6919 		user_tsc_khz = (u32)arg;
6920 
6921 		if (kvm_caps.has_tsc_control &&
6922 		    user_tsc_khz >= kvm_caps.max_guest_tsc_khz)
6923 			goto out;
6924 
6925 		if (user_tsc_khz == 0)
6926 			user_tsc_khz = tsc_khz;
6927 
6928 		WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
6929 		r = 0;
6930 
6931 		goto out;
6932 	}
6933 	case KVM_GET_TSC_KHZ: {
6934 		r = READ_ONCE(kvm->arch.default_tsc_khz);
6935 		goto out;
6936 	}
6937 	case KVM_MEMORY_ENCRYPT_OP: {
6938 		r = -ENOTTY;
6939 		if (!kvm_x86_ops.mem_enc_ioctl)
6940 			goto out;
6941 
6942 		r = static_call(kvm_x86_mem_enc_ioctl)(kvm, argp);
6943 		break;
6944 	}
6945 	case KVM_MEMORY_ENCRYPT_REG_REGION: {
6946 		struct kvm_enc_region region;
6947 
6948 		r = -EFAULT;
6949 		if (copy_from_user(&region, argp, sizeof(region)))
6950 			goto out;
6951 
6952 		r = -ENOTTY;
6953 		if (!kvm_x86_ops.mem_enc_register_region)
6954 			goto out;
6955 
6956 		r = static_call(kvm_x86_mem_enc_register_region)(kvm, &region);
6957 		break;
6958 	}
6959 	case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
6960 		struct kvm_enc_region region;
6961 
6962 		r = -EFAULT;
6963 		if (copy_from_user(&region, argp, sizeof(region)))
6964 			goto out;
6965 
6966 		r = -ENOTTY;
6967 		if (!kvm_x86_ops.mem_enc_unregister_region)
6968 			goto out;
6969 
6970 		r = static_call(kvm_x86_mem_enc_unregister_region)(kvm, &region);
6971 		break;
6972 	}
6973 	case KVM_HYPERV_EVENTFD: {
6974 		struct kvm_hyperv_eventfd hvevfd;
6975 
6976 		r = -EFAULT;
6977 		if (copy_from_user(&hvevfd, argp, sizeof(hvevfd)))
6978 			goto out;
6979 		r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
6980 		break;
6981 	}
6982 	case KVM_SET_PMU_EVENT_FILTER:
6983 		r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
6984 		break;
6985 	case KVM_X86_SET_MSR_FILTER: {
6986 		struct kvm_msr_filter __user *user_msr_filter = argp;
6987 		struct kvm_msr_filter filter;
6988 
6989 		if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
6990 			return -EFAULT;
6991 
6992 		r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
6993 		break;
6994 	}
6995 	default:
6996 		r = -ENOTTY;
6997 	}
6998 out:
6999 	return r;
7000 }
7001 
7002 static void kvm_init_msr_list(void)
7003 {
7004 	u32 dummy[2];
7005 	unsigned i;
7006 
7007 	BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3,
7008 			 "Please update the fixed PMCs in msrs_to_saved_all[]");
7009 
7010 	num_msrs_to_save = 0;
7011 	num_emulated_msrs = 0;
7012 	num_msr_based_features = 0;
7013 
7014 	for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) {
7015 		if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0)
7016 			continue;
7017 
7018 		/*
7019 		 * Even MSRs that are valid in the host may not be exposed
7020 		 * to the guests in some cases.
7021 		 */
7022 		switch (msrs_to_save_all[i]) {
7023 		case MSR_IA32_BNDCFGS:
7024 			if (!kvm_mpx_supported())
7025 				continue;
7026 			break;
7027 		case MSR_TSC_AUX:
7028 			if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) &&
7029 			    !kvm_cpu_cap_has(X86_FEATURE_RDPID))
7030 				continue;
7031 			break;
7032 		case MSR_IA32_UMWAIT_CONTROL:
7033 			if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
7034 				continue;
7035 			break;
7036 		case MSR_IA32_RTIT_CTL:
7037 		case MSR_IA32_RTIT_STATUS:
7038 			if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
7039 				continue;
7040 			break;
7041 		case MSR_IA32_RTIT_CR3_MATCH:
7042 			if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
7043 			    !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
7044 				continue;
7045 			break;
7046 		case MSR_IA32_RTIT_OUTPUT_BASE:
7047 		case MSR_IA32_RTIT_OUTPUT_MASK:
7048 			if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
7049 				(!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
7050 				 !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
7051 				continue;
7052 			break;
7053 		case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
7054 			if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
7055 				msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
7056 				intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
7057 				continue;
7058 			break;
7059 		case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX:
7060 			if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
7061 			    min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
7062 				continue;
7063 			break;
7064 		case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX:
7065 			if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
7066 			    min(KVM_INTEL_PMC_MAX_GENERIC, kvm_pmu_cap.num_counters_gp))
7067 				continue;
7068 			break;
7069 		case MSR_IA32_XFD:
7070 		case MSR_IA32_XFD_ERR:
7071 			if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
7072 				continue;
7073 			break;
7074 		default:
7075 			break;
7076 		}
7077 
7078 		msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i];
7079 	}
7080 
7081 	for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
7082 		if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i]))
7083 			continue;
7084 
7085 		emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
7086 	}
7087 
7088 	for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) {
7089 		struct kvm_msr_entry msr;
7090 
7091 		msr.index = msr_based_features_all[i];
7092 		if (kvm_get_msr_feature(&msr))
7093 			continue;
7094 
7095 		msr_based_features[num_msr_based_features++] = msr_based_features_all[i];
7096 	}
7097 }
7098 
7099 static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
7100 			   const void *v)
7101 {
7102 	int handled = 0;
7103 	int n;
7104 
7105 	do {
7106 		n = min(len, 8);
7107 		if (!(lapic_in_kernel(vcpu) &&
7108 		      !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
7109 		    && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
7110 			break;
7111 		handled += n;
7112 		addr += n;
7113 		len -= n;
7114 		v += n;
7115 	} while (len);
7116 
7117 	return handled;
7118 }
7119 
7120 static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
7121 {
7122 	int handled = 0;
7123 	int n;
7124 
7125 	do {
7126 		n = min(len, 8);
7127 		if (!(lapic_in_kernel(vcpu) &&
7128 		      !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
7129 					 addr, n, v))
7130 		    && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
7131 			break;
7132 		trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
7133 		handled += n;
7134 		addr += n;
7135 		len -= n;
7136 		v += n;
7137 	} while (len);
7138 
7139 	return handled;
7140 }
7141 
7142 void kvm_set_segment(struct kvm_vcpu *vcpu,
7143 		     struct kvm_segment *var, int seg)
7144 {
7145 	static_call(kvm_x86_set_segment)(vcpu, var, seg);
7146 }
7147 
7148 void kvm_get_segment(struct kvm_vcpu *vcpu,
7149 		     struct kvm_segment *var, int seg)
7150 {
7151 	static_call(kvm_x86_get_segment)(vcpu, var, seg);
7152 }
7153 
7154 gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access,
7155 			   struct x86_exception *exception)
7156 {
7157 	struct kvm_mmu *mmu = vcpu->arch.mmu;
7158 	gpa_t t_gpa;
7159 
7160 	BUG_ON(!mmu_is_nested(vcpu));
7161 
7162 	/* NPT walks are always user-walks */
7163 	access |= PFERR_USER_MASK;
7164 	t_gpa  = mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception);
7165 
7166 	return t_gpa;
7167 }
7168 
7169 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
7170 			      struct x86_exception *exception)
7171 {
7172 	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7173 
7174 	u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
7175 	return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
7176 }
7177 EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
7178 
7179 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
7180 			       struct x86_exception *exception)
7181 {
7182 	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7183 
7184 	u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
7185 	access |= PFERR_WRITE_MASK;
7186 	return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
7187 }
7188 EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write);
7189 
7190 /* uses this to access any guest's mapped memory without checking CPL */
7191 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
7192 				struct x86_exception *exception)
7193 {
7194 	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7195 
7196 	return mmu->gva_to_gpa(vcpu, mmu, gva, 0, exception);
7197 }
7198 
7199 static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
7200 				      struct kvm_vcpu *vcpu, u64 access,
7201 				      struct x86_exception *exception)
7202 {
7203 	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7204 	void *data = val;
7205 	int r = X86EMUL_CONTINUE;
7206 
7207 	while (bytes) {
7208 		gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
7209 		unsigned offset = addr & (PAGE_SIZE-1);
7210 		unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
7211 		int ret;
7212 
7213 		if (gpa == INVALID_GPA)
7214 			return X86EMUL_PROPAGATE_FAULT;
7215 		ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
7216 					       offset, toread);
7217 		if (ret < 0) {
7218 			r = X86EMUL_IO_NEEDED;
7219 			goto out;
7220 		}
7221 
7222 		bytes -= toread;
7223 		data += toread;
7224 		addr += toread;
7225 	}
7226 out:
7227 	return r;
7228 }
7229 
7230 /* used for instruction fetching */
7231 static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
7232 				gva_t addr, void *val, unsigned int bytes,
7233 				struct x86_exception *exception)
7234 {
7235 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
7236 	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7237 	u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
7238 	unsigned offset;
7239 	int ret;
7240 
7241 	/* Inline kvm_read_guest_virt_helper for speed.  */
7242 	gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK,
7243 				    exception);
7244 	if (unlikely(gpa == INVALID_GPA))
7245 		return X86EMUL_PROPAGATE_FAULT;
7246 
7247 	offset = addr & (PAGE_SIZE-1);
7248 	if (WARN_ON(offset + bytes > PAGE_SIZE))
7249 		bytes = (unsigned)PAGE_SIZE - offset;
7250 	ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val,
7251 				       offset, bytes);
7252 	if (unlikely(ret < 0))
7253 		return X86EMUL_IO_NEEDED;
7254 
7255 	return X86EMUL_CONTINUE;
7256 }
7257 
7258 int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
7259 			       gva_t addr, void *val, unsigned int bytes,
7260 			       struct x86_exception *exception)
7261 {
7262 	u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
7263 
7264 	/*
7265 	 * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
7266 	 * is returned, but our callers are not ready for that and they blindly
7267 	 * call kvm_inject_page_fault.  Ensure that they at least do not leak
7268 	 * uninitialized kernel stack memory into cr2 and error code.
7269 	 */
7270 	memset(exception, 0, sizeof(*exception));
7271 	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
7272 					  exception);
7273 }
7274 EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
7275 
7276 static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
7277 			     gva_t addr, void *val, unsigned int bytes,
7278 			     struct x86_exception *exception, bool system)
7279 {
7280 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
7281 	u64 access = 0;
7282 
7283 	if (system)
7284 		access |= PFERR_IMPLICIT_ACCESS;
7285 	else if (static_call(kvm_x86_get_cpl)(vcpu) == 3)
7286 		access |= PFERR_USER_MASK;
7287 
7288 	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
7289 }
7290 
7291 static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
7292 				      struct kvm_vcpu *vcpu, u64 access,
7293 				      struct x86_exception *exception)
7294 {
7295 	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7296 	void *data = val;
7297 	int r = X86EMUL_CONTINUE;
7298 
7299 	while (bytes) {
7300 		gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception);
7301 		unsigned offset = addr & (PAGE_SIZE-1);
7302 		unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
7303 		int ret;
7304 
7305 		if (gpa == INVALID_GPA)
7306 			return X86EMUL_PROPAGATE_FAULT;
7307 		ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
7308 		if (ret < 0) {
7309 			r = X86EMUL_IO_NEEDED;
7310 			goto out;
7311 		}
7312 
7313 		bytes -= towrite;
7314 		data += towrite;
7315 		addr += towrite;
7316 	}
7317 out:
7318 	return r;
7319 }
7320 
7321 static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val,
7322 			      unsigned int bytes, struct x86_exception *exception,
7323 			      bool system)
7324 {
7325 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
7326 	u64 access = PFERR_WRITE_MASK;
7327 
7328 	if (system)
7329 		access |= PFERR_IMPLICIT_ACCESS;
7330 	else if (static_call(kvm_x86_get_cpl)(vcpu) == 3)
7331 		access |= PFERR_USER_MASK;
7332 
7333 	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
7334 					   access, exception);
7335 }
7336 
7337 int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
7338 				unsigned int bytes, struct x86_exception *exception)
7339 {
7340 	/* kvm_write_guest_virt_system can pull in tons of pages. */
7341 	vcpu->arch.l1tf_flush_l1d = true;
7342 
7343 	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
7344 					   PFERR_WRITE_MASK, exception);
7345 }
7346 EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
7347 
7348 static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
7349 				void *insn, int insn_len)
7350 {
7351 	return static_call(kvm_x86_can_emulate_instruction)(vcpu, emul_type,
7352 							    insn, insn_len);
7353 }
7354 
7355 int handle_ud(struct kvm_vcpu *vcpu)
7356 {
7357 	static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
7358 	int fep_flags = READ_ONCE(force_emulation_prefix);
7359 	int emul_type = EMULTYPE_TRAP_UD;
7360 	char sig[5]; /* ud2; .ascii "kvm" */
7361 	struct x86_exception e;
7362 
7363 	if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0)))
7364 		return 1;
7365 
7366 	if (fep_flags &&
7367 	    kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
7368 				sig, sizeof(sig), &e) == 0 &&
7369 	    memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) {
7370 		if (fep_flags & KVM_FEP_CLEAR_RFLAGS_RF)
7371 			kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) & ~X86_EFLAGS_RF);
7372 		kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
7373 		emul_type = EMULTYPE_TRAP_UD_FORCED;
7374 	}
7375 
7376 	return kvm_emulate_instruction(vcpu, emul_type);
7377 }
7378 EXPORT_SYMBOL_GPL(handle_ud);
7379 
7380 static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
7381 			    gpa_t gpa, bool write)
7382 {
7383 	/* For APIC access vmexit */
7384 	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
7385 		return 1;
7386 
7387 	if (vcpu_match_mmio_gpa(vcpu, gpa)) {
7388 		trace_vcpu_match_mmio(gva, gpa, write, true);
7389 		return 1;
7390 	}
7391 
7392 	return 0;
7393 }
7394 
7395 static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
7396 				gpa_t *gpa, struct x86_exception *exception,
7397 				bool write)
7398 {
7399 	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
7400 	u64 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
7401 		| (write ? PFERR_WRITE_MASK : 0);
7402 
7403 	/*
7404 	 * currently PKRU is only applied to ept enabled guest so
7405 	 * there is no pkey in EPT page table for L1 guest or EPT
7406 	 * shadow page table for L2 guest.
7407 	 */
7408 	if (vcpu_match_mmio_gva(vcpu, gva) && (!is_paging(vcpu) ||
7409 	    !permission_fault(vcpu, vcpu->arch.walk_mmu,
7410 			      vcpu->arch.mmio_access, 0, access))) {
7411 		*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
7412 					(gva & (PAGE_SIZE - 1));
7413 		trace_vcpu_match_mmio(gva, *gpa, write, false);
7414 		return 1;
7415 	}
7416 
7417 	*gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception);
7418 
7419 	if (*gpa == INVALID_GPA)
7420 		return -1;
7421 
7422 	return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write);
7423 }
7424 
7425 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
7426 			const void *val, int bytes)
7427 {
7428 	int ret;
7429 
7430 	ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
7431 	if (ret < 0)
7432 		return 0;
7433 	kvm_page_track_write(vcpu, gpa, val, bytes);
7434 	return 1;
7435 }
7436 
7437 struct read_write_emulator_ops {
7438 	int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val,
7439 				  int bytes);
7440 	int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa,
7441 				  void *val, int bytes);
7442 	int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
7443 			       int bytes, void *val);
7444 	int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa,
7445 				    void *val, int bytes);
7446 	bool write;
7447 };
7448 
7449 static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)
7450 {
7451 	if (vcpu->mmio_read_completed) {
7452 		trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
7453 			       vcpu->mmio_fragments[0].gpa, val);
7454 		vcpu->mmio_read_completed = 0;
7455 		return 1;
7456 	}
7457 
7458 	return 0;
7459 }
7460 
7461 static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
7462 			void *val, int bytes)
7463 {
7464 	return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes);
7465 }
7466 
7467 static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
7468 			 void *val, int bytes)
7469 {
7470 	return emulator_write_phys(vcpu, gpa, val, bytes);
7471 }
7472 
7473 static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)
7474 {
7475 	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
7476 	return vcpu_mmio_write(vcpu, gpa, bytes, val);
7477 }
7478 
7479 static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
7480 			  void *val, int bytes)
7481 {
7482 	trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
7483 	return X86EMUL_IO_NEEDED;
7484 }
7485 
7486 static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
7487 			   void *val, int bytes)
7488 {
7489 	struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
7490 
7491 	memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
7492 	return X86EMUL_CONTINUE;
7493 }
7494 
7495 static const struct read_write_emulator_ops read_emultor = {
7496 	.read_write_prepare = read_prepare,
7497 	.read_write_emulate = read_emulate,
7498 	.read_write_mmio = vcpu_mmio_read,
7499 	.read_write_exit_mmio = read_exit_mmio,
7500 };
7501 
7502 static const struct read_write_emulator_ops write_emultor = {
7503 	.read_write_emulate = write_emulate,
7504 	.read_write_mmio = write_mmio,
7505 	.read_write_exit_mmio = write_exit_mmio,
7506 	.write = true,
7507 };
7508 
7509 static int emulator_read_write_onepage(unsigned long addr, void *val,
7510 				       unsigned int bytes,
7511 				       struct x86_exception *exception,
7512 				       struct kvm_vcpu *vcpu,
7513 				       const struct read_write_emulator_ops *ops)
7514 {
7515 	gpa_t gpa;
7516 	int handled, ret;
7517 	bool write = ops->write;
7518 	struct kvm_mmio_fragment *frag;
7519 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
7520 
7521 	/*
7522 	 * If the exit was due to a NPF we may already have a GPA.
7523 	 * If the GPA is present, use it to avoid the GVA to GPA table walk.
7524 	 * Note, this cannot be used on string operations since string
7525 	 * operation using rep will only have the initial GPA from the NPF
7526 	 * occurred.
7527 	 */
7528 	if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) &&
7529 	    (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) {
7530 		gpa = ctxt->gpa_val;
7531 		ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
7532 	} else {
7533 		ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
7534 		if (ret < 0)
7535 			return X86EMUL_PROPAGATE_FAULT;
7536 	}
7537 
7538 	if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes))
7539 		return X86EMUL_CONTINUE;
7540 
7541 	/*
7542 	 * Is this MMIO handled locally?
7543 	 */
7544 	handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
7545 	if (handled == bytes)
7546 		return X86EMUL_CONTINUE;
7547 
7548 	gpa += handled;
7549 	bytes -= handled;
7550 	val += handled;
7551 
7552 	WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
7553 	frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
7554 	frag->gpa = gpa;
7555 	frag->data = val;
7556 	frag->len = bytes;
7557 	return X86EMUL_CONTINUE;
7558 }
7559 
7560 static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
7561 			unsigned long addr,
7562 			void *val, unsigned int bytes,
7563 			struct x86_exception *exception,
7564 			const struct read_write_emulator_ops *ops)
7565 {
7566 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
7567 	gpa_t gpa;
7568 	int rc;
7569 
7570 	if (ops->read_write_prepare &&
7571 		  ops->read_write_prepare(vcpu, val, bytes))
7572 		return X86EMUL_CONTINUE;
7573 
7574 	vcpu->mmio_nr_fragments = 0;
7575 
7576 	/* Crossing a page boundary? */
7577 	if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
7578 		int now;
7579 
7580 		now = -addr & ~PAGE_MASK;
7581 		rc = emulator_read_write_onepage(addr, val, now, exception,
7582 						 vcpu, ops);
7583 
7584 		if (rc != X86EMUL_CONTINUE)
7585 			return rc;
7586 		addr += now;
7587 		if (ctxt->mode != X86EMUL_MODE_PROT64)
7588 			addr = (u32)addr;
7589 		val += now;
7590 		bytes -= now;
7591 	}
7592 
7593 	rc = emulator_read_write_onepage(addr, val, bytes, exception,
7594 					 vcpu, ops);
7595 	if (rc != X86EMUL_CONTINUE)
7596 		return rc;
7597 
7598 	if (!vcpu->mmio_nr_fragments)
7599 		return rc;
7600 
7601 	gpa = vcpu->mmio_fragments[0].gpa;
7602 
7603 	vcpu->mmio_needed = 1;
7604 	vcpu->mmio_cur_fragment = 0;
7605 
7606 	vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
7607 	vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
7608 	vcpu->run->exit_reason = KVM_EXIT_MMIO;
7609 	vcpu->run->mmio.phys_addr = gpa;
7610 
7611 	return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
7612 }
7613 
7614 static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
7615 				  unsigned long addr,
7616 				  void *val,
7617 				  unsigned int bytes,
7618 				  struct x86_exception *exception)
7619 {
7620 	return emulator_read_write(ctxt, addr, val, bytes,
7621 				   exception, &read_emultor);
7622 }
7623 
7624 static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
7625 			    unsigned long addr,
7626 			    const void *val,
7627 			    unsigned int bytes,
7628 			    struct x86_exception *exception)
7629 {
7630 	return emulator_read_write(ctxt, addr, (void *)val, bytes,
7631 				   exception, &write_emultor);
7632 }
7633 
7634 #define emulator_try_cmpxchg_user(t, ptr, old, new) \
7635 	(__try_cmpxchg_user((t __user *)(ptr), (t *)(old), *(t *)(new), efault ## t))
7636 
7637 static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
7638 				     unsigned long addr,
7639 				     const void *old,
7640 				     const void *new,
7641 				     unsigned int bytes,
7642 				     struct x86_exception *exception)
7643 {
7644 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
7645 	u64 page_line_mask;
7646 	unsigned long hva;
7647 	gpa_t gpa;
7648 	int r;
7649 
7650 	/* guests cmpxchg8b have to be emulated atomically */
7651 	if (bytes > 8 || (bytes & (bytes - 1)))
7652 		goto emul_write;
7653 
7654 	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
7655 
7656 	if (gpa == INVALID_GPA ||
7657 	    (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
7658 		goto emul_write;
7659 
7660 	/*
7661 	 * Emulate the atomic as a straight write to avoid #AC if SLD is
7662 	 * enabled in the host and the access splits a cache line.
7663 	 */
7664 	if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
7665 		page_line_mask = ~(cache_line_size() - 1);
7666 	else
7667 		page_line_mask = PAGE_MASK;
7668 
7669 	if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
7670 		goto emul_write;
7671 
7672 	hva = kvm_vcpu_gfn_to_hva(vcpu, gpa_to_gfn(gpa));
7673 	if (kvm_is_error_hva(hva))
7674 		goto emul_write;
7675 
7676 	hva += offset_in_page(gpa);
7677 
7678 	switch (bytes) {
7679 	case 1:
7680 		r = emulator_try_cmpxchg_user(u8, hva, old, new);
7681 		break;
7682 	case 2:
7683 		r = emulator_try_cmpxchg_user(u16, hva, old, new);
7684 		break;
7685 	case 4:
7686 		r = emulator_try_cmpxchg_user(u32, hva, old, new);
7687 		break;
7688 	case 8:
7689 		r = emulator_try_cmpxchg_user(u64, hva, old, new);
7690 		break;
7691 	default:
7692 		BUG();
7693 	}
7694 
7695 	if (r < 0)
7696 		return X86EMUL_UNHANDLEABLE;
7697 	if (r)
7698 		return X86EMUL_CMPXCHG_FAILED;
7699 
7700 	kvm_page_track_write(vcpu, gpa, new, bytes);
7701 
7702 	return X86EMUL_CONTINUE;
7703 
7704 emul_write:
7705 	printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
7706 
7707 	return emulator_write_emulated(ctxt, addr, new, bytes, exception);
7708 }
7709 
7710 static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
7711 			       unsigned short port, void *data,
7712 			       unsigned int count, bool in)
7713 {
7714 	unsigned i;
7715 	int r;
7716 
7717 	WARN_ON_ONCE(vcpu->arch.pio.count);
7718 	for (i = 0; i < count; i++) {
7719 		if (in)
7720 			r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, port, size, data);
7721 		else
7722 			r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, port, size, data);
7723 
7724 		if (r) {
7725 			if (i == 0)
7726 				goto userspace_io;
7727 
7728 			/*
7729 			 * Userspace must have unregistered the device while PIO
7730 			 * was running.  Drop writes / read as 0.
7731 			 */
7732 			if (in)
7733 				memset(data, 0, size * (count - i));
7734 			break;
7735 		}
7736 
7737 		data += size;
7738 	}
7739 	return 1;
7740 
7741 userspace_io:
7742 	vcpu->arch.pio.port = port;
7743 	vcpu->arch.pio.in = in;
7744 	vcpu->arch.pio.count = count;
7745 	vcpu->arch.pio.size = size;
7746 
7747 	if (in)
7748 		memset(vcpu->arch.pio_data, 0, size * count);
7749 	else
7750 		memcpy(vcpu->arch.pio_data, data, size * count);
7751 
7752 	vcpu->run->exit_reason = KVM_EXIT_IO;
7753 	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
7754 	vcpu->run->io.size = size;
7755 	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
7756 	vcpu->run->io.count = count;
7757 	vcpu->run->io.port = port;
7758 	return 0;
7759 }
7760 
7761 static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
7762       			   unsigned short port, void *val, unsigned int count)
7763 {
7764 	int r = emulator_pio_in_out(vcpu, size, port, val, count, true);
7765 	if (r)
7766 		trace_kvm_pio(KVM_PIO_IN, port, size, count, val);
7767 
7768 	return r;
7769 }
7770 
7771 static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val)
7772 {
7773 	int size = vcpu->arch.pio.size;
7774 	unsigned int count = vcpu->arch.pio.count;
7775 	memcpy(val, vcpu->arch.pio_data, size * count);
7776 	trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count, vcpu->arch.pio_data);
7777 	vcpu->arch.pio.count = 0;
7778 }
7779 
7780 static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
7781 				    int size, unsigned short port, void *val,
7782 				    unsigned int count)
7783 {
7784 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
7785 	if (vcpu->arch.pio.count) {
7786 		/*
7787 		 * Complete a previous iteration that required userspace I/O.
7788 		 * Note, @count isn't guaranteed to match pio.count as userspace
7789 		 * can modify ECX before rerunning the vCPU.  Ignore any such
7790 		 * shenanigans as KVM doesn't support modifying the rep count,
7791 		 * and the emulator ensures @count doesn't overflow the buffer.
7792 		 */
7793 		complete_emulator_pio_in(vcpu, val);
7794 		return 1;
7795 	}
7796 
7797 	return emulator_pio_in(vcpu, size, port, val, count);
7798 }
7799 
7800 static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
7801 			    unsigned short port, const void *val,
7802 			    unsigned int count)
7803 {
7804 	trace_kvm_pio(KVM_PIO_OUT, port, size, count, val);
7805 	return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
7806 }
7807 
7808 static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
7809 				     int size, unsigned short port,
7810 				     const void *val, unsigned int count)
7811 {
7812 	return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count);
7813 }
7814 
7815 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
7816 {
7817 	return static_call(kvm_x86_get_segment_base)(vcpu, seg);
7818 }
7819 
7820 static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
7821 {
7822 	kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
7823 }
7824 
7825 static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
7826 {
7827 	if (!need_emulate_wbinvd(vcpu))
7828 		return X86EMUL_CONTINUE;
7829 
7830 	if (static_call(kvm_x86_has_wbinvd_exit)()) {
7831 		int cpu = get_cpu();
7832 
7833 		cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
7834 		on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask,
7835 				wbinvd_ipi, NULL, 1);
7836 		put_cpu();
7837 		cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
7838 	} else
7839 		wbinvd();
7840 	return X86EMUL_CONTINUE;
7841 }
7842 
7843 int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
7844 {
7845 	kvm_emulate_wbinvd_noskip(vcpu);
7846 	return kvm_skip_emulated_instruction(vcpu);
7847 }
7848 EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
7849 
7850 
7851 
7852 static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
7853 {
7854 	kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
7855 }
7856 
7857 static void emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
7858 			    unsigned long *dest)
7859 {
7860 	kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
7861 }
7862 
7863 static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
7864 			   unsigned long value)
7865 {
7866 
7867 	return kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
7868 }
7869 
7870 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
7871 {
7872 	return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
7873 }
7874 
7875 static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
7876 {
7877 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
7878 	unsigned long value;
7879 
7880 	switch (cr) {
7881 	case 0:
7882 		value = kvm_read_cr0(vcpu);
7883 		break;
7884 	case 2:
7885 		value = vcpu->arch.cr2;
7886 		break;
7887 	case 3:
7888 		value = kvm_read_cr3(vcpu);
7889 		break;
7890 	case 4:
7891 		value = kvm_read_cr4(vcpu);
7892 		break;
7893 	case 8:
7894 		value = kvm_get_cr8(vcpu);
7895 		break;
7896 	default:
7897 		kvm_err("%s: unexpected cr %u\n", __func__, cr);
7898 		return 0;
7899 	}
7900 
7901 	return value;
7902 }
7903 
7904 static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
7905 {
7906 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
7907 	int res = 0;
7908 
7909 	switch (cr) {
7910 	case 0:
7911 		res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
7912 		break;
7913 	case 2:
7914 		vcpu->arch.cr2 = val;
7915 		break;
7916 	case 3:
7917 		res = kvm_set_cr3(vcpu, val);
7918 		break;
7919 	case 4:
7920 		res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
7921 		break;
7922 	case 8:
7923 		res = kvm_set_cr8(vcpu, val);
7924 		break;
7925 	default:
7926 		kvm_err("%s: unexpected cr %u\n", __func__, cr);
7927 		res = -1;
7928 	}
7929 
7930 	return res;
7931 }
7932 
7933 static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
7934 {
7935 	return static_call(kvm_x86_get_cpl)(emul_to_vcpu(ctxt));
7936 }
7937 
7938 static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
7939 {
7940 	static_call(kvm_x86_get_gdt)(emul_to_vcpu(ctxt), dt);
7941 }
7942 
7943 static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
7944 {
7945 	static_call(kvm_x86_get_idt)(emul_to_vcpu(ctxt), dt);
7946 }
7947 
7948 static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
7949 {
7950 	static_call(kvm_x86_set_gdt)(emul_to_vcpu(ctxt), dt);
7951 }
7952 
7953 static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
7954 {
7955 	static_call(kvm_x86_set_idt)(emul_to_vcpu(ctxt), dt);
7956 }
7957 
7958 static unsigned long emulator_get_cached_segment_base(
7959 	struct x86_emulate_ctxt *ctxt, int seg)
7960 {
7961 	return get_segment_base(emul_to_vcpu(ctxt), seg);
7962 }
7963 
7964 static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
7965 				 struct desc_struct *desc, u32 *base3,
7966 				 int seg)
7967 {
7968 	struct kvm_segment var;
7969 
7970 	kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
7971 	*selector = var.selector;
7972 
7973 	if (var.unusable) {
7974 		memset(desc, 0, sizeof(*desc));
7975 		if (base3)
7976 			*base3 = 0;
7977 		return false;
7978 	}
7979 
7980 	if (var.g)
7981 		var.limit >>= 12;
7982 	set_desc_limit(desc, var.limit);
7983 	set_desc_base(desc, (unsigned long)var.base);
7984 #ifdef CONFIG_X86_64
7985 	if (base3)
7986 		*base3 = var.base >> 32;
7987 #endif
7988 	desc->type = var.type;
7989 	desc->s = var.s;
7990 	desc->dpl = var.dpl;
7991 	desc->p = var.present;
7992 	desc->avl = var.avl;
7993 	desc->l = var.l;
7994 	desc->d = var.db;
7995 	desc->g = var.g;
7996 
7997 	return true;
7998 }
7999 
8000 static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
8001 				 struct desc_struct *desc, u32 base3,
8002 				 int seg)
8003 {
8004 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
8005 	struct kvm_segment var;
8006 
8007 	var.selector = selector;
8008 	var.base = get_desc_base(desc);
8009 #ifdef CONFIG_X86_64
8010 	var.base |= ((u64)base3) << 32;
8011 #endif
8012 	var.limit = get_desc_limit(desc);
8013 	if (desc->g)
8014 		var.limit = (var.limit << 12) | 0xfff;
8015 	var.type = desc->type;
8016 	var.dpl = desc->dpl;
8017 	var.db = desc->d;
8018 	var.s = desc->s;
8019 	var.l = desc->l;
8020 	var.g = desc->g;
8021 	var.avl = desc->avl;
8022 	var.present = desc->p;
8023 	var.unusable = !var.present;
8024 	var.padding = 0;
8025 
8026 	kvm_set_segment(vcpu, &var, seg);
8027 	return;
8028 }
8029 
8030 static int emulator_get_msr_with_filter(struct x86_emulate_ctxt *ctxt,
8031 					u32 msr_index, u64 *pdata)
8032 {
8033 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
8034 	int r;
8035 
8036 	r = kvm_get_msr_with_filter(vcpu, msr_index, pdata);
8037 	if (r < 0)
8038 		return X86EMUL_UNHANDLEABLE;
8039 
8040 	if (r) {
8041 		if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
8042 				       complete_emulated_rdmsr, r))
8043 			return X86EMUL_IO_NEEDED;
8044 
8045 		trace_kvm_msr_read_ex(msr_index);
8046 		return X86EMUL_PROPAGATE_FAULT;
8047 	}
8048 
8049 	trace_kvm_msr_read(msr_index, *pdata);
8050 	return X86EMUL_CONTINUE;
8051 }
8052 
8053 static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt,
8054 					u32 msr_index, u64 data)
8055 {
8056 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
8057 	int r;
8058 
8059 	r = kvm_set_msr_with_filter(vcpu, msr_index, data);
8060 	if (r < 0)
8061 		return X86EMUL_UNHANDLEABLE;
8062 
8063 	if (r) {
8064 		if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
8065 				       complete_emulated_msr_access, r))
8066 			return X86EMUL_IO_NEEDED;
8067 
8068 		trace_kvm_msr_write_ex(msr_index, data);
8069 		return X86EMUL_PROPAGATE_FAULT;
8070 	}
8071 
8072 	trace_kvm_msr_write(msr_index, data);
8073 	return X86EMUL_CONTINUE;
8074 }
8075 
8076 static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
8077 			    u32 msr_index, u64 *pdata)
8078 {
8079 	return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
8080 }
8081 
8082 static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
8083 			      u32 pmc)
8084 {
8085 	if (kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc))
8086 		return 0;
8087 	return -EINVAL;
8088 }
8089 
8090 static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
8091 			     u32 pmc, u64 *pdata)
8092 {
8093 	return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata);
8094 }
8095 
8096 static void emulator_halt(struct x86_emulate_ctxt *ctxt)
8097 {
8098 	emul_to_vcpu(ctxt)->arch.halt_request = 1;
8099 }
8100 
8101 static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
8102 			      struct x86_instruction_info *info,
8103 			      enum x86_intercept_stage stage)
8104 {
8105 	return static_call(kvm_x86_check_intercept)(emul_to_vcpu(ctxt), info, stage,
8106 					    &ctxt->exception);
8107 }
8108 
8109 static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
8110 			      u32 *eax, u32 *ebx, u32 *ecx, u32 *edx,
8111 			      bool exact_only)
8112 {
8113 	return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only);
8114 }
8115 
8116 static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt)
8117 {
8118 	return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM);
8119 }
8120 
8121 static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt)
8122 {
8123 	return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
8124 }
8125 
8126 static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
8127 {
8128 	return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
8129 }
8130 
8131 static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt)
8132 {
8133 	return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
8134 }
8135 
8136 static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
8137 {
8138 	return kvm_register_read_raw(emul_to_vcpu(ctxt), reg);
8139 }
8140 
8141 static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
8142 {
8143 	kvm_register_write_raw(emul_to_vcpu(ctxt), reg, val);
8144 }
8145 
8146 static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
8147 {
8148 	static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked);
8149 }
8150 
8151 static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
8152 {
8153 	return emul_to_vcpu(ctxt)->arch.hflags;
8154 }
8155 
8156 #ifndef CONFIG_KVM_SMM
8157 static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt)
8158 {
8159 	WARN_ON_ONCE(1);
8160 	return X86EMUL_UNHANDLEABLE;
8161 }
8162 #endif
8163 
8164 static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt)
8165 {
8166 	kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt));
8167 }
8168 
8169 static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
8170 {
8171 	return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr);
8172 }
8173 
8174 static void emulator_vm_bugged(struct x86_emulate_ctxt *ctxt)
8175 {
8176 	struct kvm *kvm = emul_to_vcpu(ctxt)->kvm;
8177 
8178 	if (!kvm->vm_bugged)
8179 		kvm_vm_bugged(kvm);
8180 }
8181 
8182 static const struct x86_emulate_ops emulate_ops = {
8183 	.vm_bugged           = emulator_vm_bugged,
8184 	.read_gpr            = emulator_read_gpr,
8185 	.write_gpr           = emulator_write_gpr,
8186 	.read_std            = emulator_read_std,
8187 	.write_std           = emulator_write_std,
8188 	.fetch               = kvm_fetch_guest_virt,
8189 	.read_emulated       = emulator_read_emulated,
8190 	.write_emulated      = emulator_write_emulated,
8191 	.cmpxchg_emulated    = emulator_cmpxchg_emulated,
8192 	.invlpg              = emulator_invlpg,
8193 	.pio_in_emulated     = emulator_pio_in_emulated,
8194 	.pio_out_emulated    = emulator_pio_out_emulated,
8195 	.get_segment         = emulator_get_segment,
8196 	.set_segment         = emulator_set_segment,
8197 	.get_cached_segment_base = emulator_get_cached_segment_base,
8198 	.get_gdt             = emulator_get_gdt,
8199 	.get_idt	     = emulator_get_idt,
8200 	.set_gdt             = emulator_set_gdt,
8201 	.set_idt	     = emulator_set_idt,
8202 	.get_cr              = emulator_get_cr,
8203 	.set_cr              = emulator_set_cr,
8204 	.cpl                 = emulator_get_cpl,
8205 	.get_dr              = emulator_get_dr,
8206 	.set_dr              = emulator_set_dr,
8207 	.set_msr_with_filter = emulator_set_msr_with_filter,
8208 	.get_msr_with_filter = emulator_get_msr_with_filter,
8209 	.get_msr             = emulator_get_msr,
8210 	.check_pmc	     = emulator_check_pmc,
8211 	.read_pmc            = emulator_read_pmc,
8212 	.halt                = emulator_halt,
8213 	.wbinvd              = emulator_wbinvd,
8214 	.fix_hypercall       = emulator_fix_hypercall,
8215 	.intercept           = emulator_intercept,
8216 	.get_cpuid           = emulator_get_cpuid,
8217 	.guest_has_long_mode = emulator_guest_has_long_mode,
8218 	.guest_has_movbe     = emulator_guest_has_movbe,
8219 	.guest_has_fxsr      = emulator_guest_has_fxsr,
8220 	.guest_has_rdpid     = emulator_guest_has_rdpid,
8221 	.set_nmi_mask        = emulator_set_nmi_mask,
8222 	.get_hflags          = emulator_get_hflags,
8223 	.leave_smm           = emulator_leave_smm,
8224 	.triple_fault        = emulator_triple_fault,
8225 	.set_xcr             = emulator_set_xcr,
8226 };
8227 
8228 static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
8229 {
8230 	u32 int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
8231 	/*
8232 	 * an sti; sti; sequence only disable interrupts for the first
8233 	 * instruction. So, if the last instruction, be it emulated or
8234 	 * not, left the system with the INT_STI flag enabled, it
8235 	 * means that the last instruction is an sti. We should not
8236 	 * leave the flag on in this case. The same goes for mov ss
8237 	 */
8238 	if (int_shadow & mask)
8239 		mask = 0;
8240 	if (unlikely(int_shadow || mask)) {
8241 		static_call(kvm_x86_set_interrupt_shadow)(vcpu, mask);
8242 		if (!mask)
8243 			kvm_make_request(KVM_REQ_EVENT, vcpu);
8244 	}
8245 }
8246 
8247 static void inject_emulated_exception(struct kvm_vcpu *vcpu)
8248 {
8249 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8250 
8251 	if (ctxt->exception.vector == PF_VECTOR)
8252 		kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
8253 	else if (ctxt->exception.error_code_valid)
8254 		kvm_queue_exception_e(vcpu, ctxt->exception.vector,
8255 				      ctxt->exception.error_code);
8256 	else
8257 		kvm_queue_exception(vcpu, ctxt->exception.vector);
8258 }
8259 
8260 static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
8261 {
8262 	struct x86_emulate_ctxt *ctxt;
8263 
8264 	ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
8265 	if (!ctxt) {
8266 		pr_err("kvm: failed to allocate vcpu's emulator\n");
8267 		return NULL;
8268 	}
8269 
8270 	ctxt->vcpu = vcpu;
8271 	ctxt->ops = &emulate_ops;
8272 	vcpu->arch.emulate_ctxt = ctxt;
8273 
8274 	return ctxt;
8275 }
8276 
8277 static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
8278 {
8279 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8280 	int cs_db, cs_l;
8281 
8282 	static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
8283 
8284 	ctxt->gpa_available = false;
8285 	ctxt->eflags = kvm_get_rflags(vcpu);
8286 	ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
8287 
8288 	ctxt->eip = kvm_rip_read(vcpu);
8289 	ctxt->mode = (!is_protmode(vcpu))		? X86EMUL_MODE_REAL :
8290 		     (ctxt->eflags & X86_EFLAGS_VM)	? X86EMUL_MODE_VM86 :
8291 		     (cs_l && is_long_mode(vcpu))	? X86EMUL_MODE_PROT64 :
8292 		     cs_db				? X86EMUL_MODE_PROT32 :
8293 							  X86EMUL_MODE_PROT16;
8294 	BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
8295 
8296 	ctxt->interruptibility = 0;
8297 	ctxt->have_exception = false;
8298 	ctxt->exception.vector = -1;
8299 	ctxt->perm_ok = false;
8300 
8301 	init_decode_cache(ctxt);
8302 	vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
8303 }
8304 
8305 void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
8306 {
8307 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8308 	int ret;
8309 
8310 	init_emulate_ctxt(vcpu);
8311 
8312 	ctxt->op_bytes = 2;
8313 	ctxt->ad_bytes = 2;
8314 	ctxt->_eip = ctxt->eip + inc_eip;
8315 	ret = emulate_int_real(ctxt, irq);
8316 
8317 	if (ret != X86EMUL_CONTINUE) {
8318 		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
8319 	} else {
8320 		ctxt->eip = ctxt->_eip;
8321 		kvm_rip_write(vcpu, ctxt->eip);
8322 		kvm_set_rflags(vcpu, ctxt->eflags);
8323 	}
8324 }
8325 EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
8326 
8327 static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
8328 					   u8 ndata, u8 *insn_bytes, u8 insn_size)
8329 {
8330 	struct kvm_run *run = vcpu->run;
8331 	u64 info[5];
8332 	u8 info_start;
8333 
8334 	/*
8335 	 * Zero the whole array used to retrieve the exit info, as casting to
8336 	 * u32 for select entries will leave some chunks uninitialized.
8337 	 */
8338 	memset(&info, 0, sizeof(info));
8339 
8340 	static_call(kvm_x86_get_exit_info)(vcpu, (u32 *)&info[0], &info[1],
8341 					   &info[2], (u32 *)&info[3],
8342 					   (u32 *)&info[4]);
8343 
8344 	run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
8345 	run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION;
8346 
8347 	/*
8348 	 * There's currently space for 13 entries, but 5 are used for the exit
8349 	 * reason and info.  Restrict to 4 to reduce the maintenance burden
8350 	 * when expanding kvm_run.emulation_failure in the future.
8351 	 */
8352 	if (WARN_ON_ONCE(ndata > 4))
8353 		ndata = 4;
8354 
8355 	/* Always include the flags as a 'data' entry. */
8356 	info_start = 1;
8357 	run->emulation_failure.flags = 0;
8358 
8359 	if (insn_size) {
8360 		BUILD_BUG_ON((sizeof(run->emulation_failure.insn_size) +
8361 			      sizeof(run->emulation_failure.insn_bytes) != 16));
8362 		info_start += 2;
8363 		run->emulation_failure.flags |=
8364 			KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES;
8365 		run->emulation_failure.insn_size = insn_size;
8366 		memset(run->emulation_failure.insn_bytes, 0x90,
8367 		       sizeof(run->emulation_failure.insn_bytes));
8368 		memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size);
8369 	}
8370 
8371 	memcpy(&run->internal.data[info_start], info, sizeof(info));
8372 	memcpy(&run->internal.data[info_start + ARRAY_SIZE(info)], data,
8373 	       ndata * sizeof(data[0]));
8374 
8375 	run->emulation_failure.ndata = info_start + ARRAY_SIZE(info) + ndata;
8376 }
8377 
8378 static void prepare_emulation_ctxt_failure_exit(struct kvm_vcpu *vcpu)
8379 {
8380 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8381 
8382 	prepare_emulation_failure_exit(vcpu, NULL, 0, ctxt->fetch.data,
8383 				       ctxt->fetch.end - ctxt->fetch.data);
8384 }
8385 
8386 void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data,
8387 					  u8 ndata)
8388 {
8389 	prepare_emulation_failure_exit(vcpu, data, ndata, NULL, 0);
8390 }
8391 EXPORT_SYMBOL_GPL(__kvm_prepare_emulation_failure_exit);
8392 
8393 void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu)
8394 {
8395 	__kvm_prepare_emulation_failure_exit(vcpu, NULL, 0);
8396 }
8397 EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit);
8398 
8399 static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
8400 {
8401 	struct kvm *kvm = vcpu->kvm;
8402 
8403 	++vcpu->stat.insn_emulation_fail;
8404 	trace_kvm_emulate_insn_failed(vcpu);
8405 
8406 	if (emulation_type & EMULTYPE_VMWARE_GP) {
8407 		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
8408 		return 1;
8409 	}
8410 
8411 	if (kvm->arch.exit_on_emulation_error ||
8412 	    (emulation_type & EMULTYPE_SKIP)) {
8413 		prepare_emulation_ctxt_failure_exit(vcpu);
8414 		return 0;
8415 	}
8416 
8417 	kvm_queue_exception(vcpu, UD_VECTOR);
8418 
8419 	if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) {
8420 		prepare_emulation_ctxt_failure_exit(vcpu);
8421 		return 0;
8422 	}
8423 
8424 	return 1;
8425 }
8426 
8427 static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
8428 				  bool write_fault_to_shadow_pgtable,
8429 				  int emulation_type)
8430 {
8431 	gpa_t gpa = cr2_or_gpa;
8432 	kvm_pfn_t pfn;
8433 
8434 	if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
8435 		return false;
8436 
8437 	if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
8438 	    WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
8439 		return false;
8440 
8441 	if (!vcpu->arch.mmu->root_role.direct) {
8442 		/*
8443 		 * Write permission should be allowed since only
8444 		 * write access need to be emulated.
8445 		 */
8446 		gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
8447 
8448 		/*
8449 		 * If the mapping is invalid in guest, let cpu retry
8450 		 * it to generate fault.
8451 		 */
8452 		if (gpa == INVALID_GPA)
8453 			return true;
8454 	}
8455 
8456 	/*
8457 	 * Do not retry the unhandleable instruction if it faults on the
8458 	 * readonly host memory, otherwise it will goto a infinite loop:
8459 	 * retry instruction -> write #PF -> emulation fail -> retry
8460 	 * instruction -> ...
8461 	 */
8462 	pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
8463 
8464 	/*
8465 	 * If the instruction failed on the error pfn, it can not be fixed,
8466 	 * report the error to userspace.
8467 	 */
8468 	if (is_error_noslot_pfn(pfn))
8469 		return false;
8470 
8471 	kvm_release_pfn_clean(pfn);
8472 
8473 	/* The instructions are well-emulated on direct mmu. */
8474 	if (vcpu->arch.mmu->root_role.direct) {
8475 		unsigned int indirect_shadow_pages;
8476 
8477 		write_lock(&vcpu->kvm->mmu_lock);
8478 		indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
8479 		write_unlock(&vcpu->kvm->mmu_lock);
8480 
8481 		if (indirect_shadow_pages)
8482 			kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
8483 
8484 		return true;
8485 	}
8486 
8487 	/*
8488 	 * if emulation was due to access to shadowed page table
8489 	 * and it failed try to unshadow page and re-enter the
8490 	 * guest to let CPU execute the instruction.
8491 	 */
8492 	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
8493 
8494 	/*
8495 	 * If the access faults on its page table, it can not
8496 	 * be fixed by unprotecting shadow page and it should
8497 	 * be reported to userspace.
8498 	 */
8499 	return !write_fault_to_shadow_pgtable;
8500 }
8501 
8502 static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
8503 			      gpa_t cr2_or_gpa,  int emulation_type)
8504 {
8505 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
8506 	unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa;
8507 
8508 	last_retry_eip = vcpu->arch.last_retry_eip;
8509 	last_retry_addr = vcpu->arch.last_retry_addr;
8510 
8511 	/*
8512 	 * If the emulation is caused by #PF and it is non-page_table
8513 	 * writing instruction, it means the VM-EXIT is caused by shadow
8514 	 * page protected, we can zap the shadow page and retry this
8515 	 * instruction directly.
8516 	 *
8517 	 * Note: if the guest uses a non-page-table modifying instruction
8518 	 * on the PDE that points to the instruction, then we will unmap
8519 	 * the instruction and go to an infinite loop. So, we cache the
8520 	 * last retried eip and the last fault address, if we meet the eip
8521 	 * and the address again, we can break out of the potential infinite
8522 	 * loop.
8523 	 */
8524 	vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
8525 
8526 	if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
8527 		return false;
8528 
8529 	if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
8530 	    WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
8531 		return false;
8532 
8533 	if (x86_page_table_writing_insn(ctxt))
8534 		return false;
8535 
8536 	if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa)
8537 		return false;
8538 
8539 	vcpu->arch.last_retry_eip = ctxt->eip;
8540 	vcpu->arch.last_retry_addr = cr2_or_gpa;
8541 
8542 	if (!vcpu->arch.mmu->root_role.direct)
8543 		gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
8544 
8545 	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
8546 
8547 	return true;
8548 }
8549 
8550 static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
8551 static int complete_emulated_pio(struct kvm_vcpu *vcpu);
8552 
8553 static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
8554 				unsigned long *db)
8555 {
8556 	u32 dr6 = 0;
8557 	int i;
8558 	u32 enable, rwlen;
8559 
8560 	enable = dr7;
8561 	rwlen = dr7 >> 16;
8562 	for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
8563 		if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
8564 			dr6 |= (1 << i);
8565 	return dr6;
8566 }
8567 
8568 static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
8569 {
8570 	struct kvm_run *kvm_run = vcpu->run;
8571 
8572 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
8573 		kvm_run->debug.arch.dr6 = DR6_BS | DR6_ACTIVE_LOW;
8574 		kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
8575 		kvm_run->debug.arch.exception = DB_VECTOR;
8576 		kvm_run->exit_reason = KVM_EXIT_DEBUG;
8577 		return 0;
8578 	}
8579 	kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
8580 	return 1;
8581 }
8582 
8583 int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
8584 {
8585 	unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
8586 	int r;
8587 
8588 	r = static_call(kvm_x86_skip_emulated_instruction)(vcpu);
8589 	if (unlikely(!r))
8590 		return 0;
8591 
8592 	kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
8593 
8594 	/*
8595 	 * rflags is the old, "raw" value of the flags.  The new value has
8596 	 * not been saved yet.
8597 	 *
8598 	 * This is correct even for TF set by the guest, because "the
8599 	 * processor will not generate this exception after the instruction
8600 	 * that sets the TF flag".
8601 	 */
8602 	if (unlikely(rflags & X86_EFLAGS_TF))
8603 		r = kvm_vcpu_do_singlestep(vcpu);
8604 	return r;
8605 }
8606 EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
8607 
8608 static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu)
8609 {
8610 	u32 shadow;
8611 
8612 	if (kvm_get_rflags(vcpu) & X86_EFLAGS_RF)
8613 		return true;
8614 
8615 	/*
8616 	 * Intel CPUs inhibit code #DBs when MOV/POP SS blocking is active,
8617 	 * but AMD CPUs do not.  MOV/POP SS blocking is rare, check that first
8618 	 * to avoid the relatively expensive CPUID lookup.
8619 	 */
8620 	shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
8621 	return (shadow & KVM_X86_SHADOW_INT_MOV_SS) &&
8622 	       guest_cpuid_is_intel(vcpu);
8623 }
8624 
8625 static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu,
8626 					   int emulation_type, int *r)
8627 {
8628 	WARN_ON_ONCE(emulation_type & EMULTYPE_NO_DECODE);
8629 
8630 	/*
8631 	 * Do not check for code breakpoints if hardware has already done the
8632 	 * checks, as inferred from the emulation type.  On NO_DECODE and SKIP,
8633 	 * the instruction has passed all exception checks, and all intercepted
8634 	 * exceptions that trigger emulation have lower priority than code
8635 	 * breakpoints, i.e. the fact that the intercepted exception occurred
8636 	 * means any code breakpoints have already been serviced.
8637 	 *
8638 	 * Note, KVM needs to check for code #DBs on EMULTYPE_TRAP_UD_FORCED as
8639 	 * hardware has checked the RIP of the magic prefix, but not the RIP of
8640 	 * the instruction being emulated.  The intent of forced emulation is
8641 	 * to behave as if KVM intercepted the instruction without an exception
8642 	 * and without a prefix.
8643 	 */
8644 	if (emulation_type & (EMULTYPE_NO_DECODE | EMULTYPE_SKIP |
8645 			      EMULTYPE_TRAP_UD | EMULTYPE_VMWARE_GP | EMULTYPE_PF))
8646 		return false;
8647 
8648 	if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
8649 	    (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
8650 		struct kvm_run *kvm_run = vcpu->run;
8651 		unsigned long eip = kvm_get_linear_rip(vcpu);
8652 		u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
8653 					   vcpu->arch.guest_debug_dr7,
8654 					   vcpu->arch.eff_db);
8655 
8656 		if (dr6 != 0) {
8657 			kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW;
8658 			kvm_run->debug.arch.pc = eip;
8659 			kvm_run->debug.arch.exception = DB_VECTOR;
8660 			kvm_run->exit_reason = KVM_EXIT_DEBUG;
8661 			*r = 0;
8662 			return true;
8663 		}
8664 	}
8665 
8666 	if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
8667 	    !kvm_is_code_breakpoint_inhibited(vcpu)) {
8668 		unsigned long eip = kvm_get_linear_rip(vcpu);
8669 		u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
8670 					   vcpu->arch.dr7,
8671 					   vcpu->arch.db);
8672 
8673 		if (dr6 != 0) {
8674 			kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
8675 			*r = 1;
8676 			return true;
8677 		}
8678 	}
8679 
8680 	return false;
8681 }
8682 
8683 static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
8684 {
8685 	switch (ctxt->opcode_len) {
8686 	case 1:
8687 		switch (ctxt->b) {
8688 		case 0xe4:	/* IN */
8689 		case 0xe5:
8690 		case 0xec:
8691 		case 0xed:
8692 		case 0xe6:	/* OUT */
8693 		case 0xe7:
8694 		case 0xee:
8695 		case 0xef:
8696 		case 0x6c:	/* INS */
8697 		case 0x6d:
8698 		case 0x6e:	/* OUTS */
8699 		case 0x6f:
8700 			return true;
8701 		}
8702 		break;
8703 	case 2:
8704 		switch (ctxt->b) {
8705 		case 0x33:	/* RDPMC */
8706 			return true;
8707 		}
8708 		break;
8709 	}
8710 
8711 	return false;
8712 }
8713 
8714 /*
8715  * Decode an instruction for emulation.  The caller is responsible for handling
8716  * code breakpoints.  Note, manually detecting code breakpoints is unnecessary
8717  * (and wrong) when emulating on an intercepted fault-like exception[*], as
8718  * code breakpoints have higher priority and thus have already been done by
8719  * hardware.
8720  *
8721  * [*] Except #MC, which is higher priority, but KVM should never emulate in
8722  *     response to a machine check.
8723  */
8724 int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
8725 				    void *insn, int insn_len)
8726 {
8727 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8728 	int r;
8729 
8730 	init_emulate_ctxt(vcpu);
8731 
8732 	r = x86_decode_insn(ctxt, insn, insn_len, emulation_type);
8733 
8734 	trace_kvm_emulate_insn_start(vcpu);
8735 	++vcpu->stat.insn_emulation;
8736 
8737 	return r;
8738 }
8739 EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction);
8740 
8741 int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
8742 			    int emulation_type, void *insn, int insn_len)
8743 {
8744 	int r;
8745 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
8746 	bool writeback = true;
8747 	bool write_fault_to_spt;
8748 
8749 	if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len)))
8750 		return 1;
8751 
8752 	vcpu->arch.l1tf_flush_l1d = true;
8753 
8754 	/*
8755 	 * Clear write_fault_to_shadow_pgtable here to ensure it is
8756 	 * never reused.
8757 	 */
8758 	write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
8759 	vcpu->arch.write_fault_to_shadow_pgtable = false;
8760 
8761 	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
8762 		kvm_clear_exception_queue(vcpu);
8763 
8764 		/*
8765 		 * Return immediately if RIP hits a code breakpoint, such #DBs
8766 		 * are fault-like and are higher priority than any faults on
8767 		 * the code fetch itself.
8768 		 */
8769 		if (kvm_vcpu_check_code_breakpoint(vcpu, emulation_type, &r))
8770 			return r;
8771 
8772 		r = x86_decode_emulated_instruction(vcpu, emulation_type,
8773 						    insn, insn_len);
8774 		if (r != EMULATION_OK)  {
8775 			if ((emulation_type & EMULTYPE_TRAP_UD) ||
8776 			    (emulation_type & EMULTYPE_TRAP_UD_FORCED)) {
8777 				kvm_queue_exception(vcpu, UD_VECTOR);
8778 				return 1;
8779 			}
8780 			if (reexecute_instruction(vcpu, cr2_or_gpa,
8781 						  write_fault_to_spt,
8782 						  emulation_type))
8783 				return 1;
8784 
8785 			if (ctxt->have_exception &&
8786 			    !(emulation_type & EMULTYPE_SKIP)) {
8787 				/*
8788 				 * #UD should result in just EMULATION_FAILED, and trap-like
8789 				 * exception should not be encountered during decode.
8790 				 */
8791 				WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR ||
8792 					     exception_type(ctxt->exception.vector) == EXCPT_TRAP);
8793 				inject_emulated_exception(vcpu);
8794 				return 1;
8795 			}
8796 			return handle_emulation_failure(vcpu, emulation_type);
8797 		}
8798 	}
8799 
8800 	if ((emulation_type & EMULTYPE_VMWARE_GP) &&
8801 	    !is_vmware_backdoor_opcode(ctxt)) {
8802 		kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
8803 		return 1;
8804 	}
8805 
8806 	/*
8807 	 * EMULTYPE_SKIP without EMULTYPE_COMPLETE_USER_EXIT is intended for
8808 	 * use *only* by vendor callbacks for kvm_skip_emulated_instruction().
8809 	 * The caller is responsible for updating interruptibility state and
8810 	 * injecting single-step #DBs.
8811 	 */
8812 	if (emulation_type & EMULTYPE_SKIP) {
8813 		if (ctxt->mode != X86EMUL_MODE_PROT64)
8814 			ctxt->eip = (u32)ctxt->_eip;
8815 		else
8816 			ctxt->eip = ctxt->_eip;
8817 
8818 		if (emulation_type & EMULTYPE_COMPLETE_USER_EXIT) {
8819 			r = 1;
8820 			goto writeback;
8821 		}
8822 
8823 		kvm_rip_write(vcpu, ctxt->eip);
8824 		if (ctxt->eflags & X86_EFLAGS_RF)
8825 			kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
8826 		return 1;
8827 	}
8828 
8829 	if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
8830 		return 1;
8831 
8832 	/* this is needed for vmware backdoor interface to work since it
8833 	   changes registers values  during IO operation */
8834 	if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
8835 		vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
8836 		emulator_invalidate_register_cache(ctxt);
8837 	}
8838 
8839 restart:
8840 	if (emulation_type & EMULTYPE_PF) {
8841 		/* Save the faulting GPA (cr2) in the address field */
8842 		ctxt->exception.address = cr2_or_gpa;
8843 
8844 		/* With shadow page tables, cr2 contains a GVA or nGPA. */
8845 		if (vcpu->arch.mmu->root_role.direct) {
8846 			ctxt->gpa_available = true;
8847 			ctxt->gpa_val = cr2_or_gpa;
8848 		}
8849 	} else {
8850 		/* Sanitize the address out of an abundance of paranoia. */
8851 		ctxt->exception.address = 0;
8852 	}
8853 
8854 	r = x86_emulate_insn(ctxt);
8855 
8856 	if (r == EMULATION_INTERCEPTED)
8857 		return 1;
8858 
8859 	if (r == EMULATION_FAILED) {
8860 		if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
8861 					emulation_type))
8862 			return 1;
8863 
8864 		return handle_emulation_failure(vcpu, emulation_type);
8865 	}
8866 
8867 	if (ctxt->have_exception) {
8868 		r = 1;
8869 		inject_emulated_exception(vcpu);
8870 	} else if (vcpu->arch.pio.count) {
8871 		if (!vcpu->arch.pio.in) {
8872 			/* FIXME: return into emulator if single-stepping.  */
8873 			vcpu->arch.pio.count = 0;
8874 		} else {
8875 			writeback = false;
8876 			vcpu->arch.complete_userspace_io = complete_emulated_pio;
8877 		}
8878 		r = 0;
8879 	} else if (vcpu->mmio_needed) {
8880 		++vcpu->stat.mmio_exits;
8881 
8882 		if (!vcpu->mmio_is_write)
8883 			writeback = false;
8884 		r = 0;
8885 		vcpu->arch.complete_userspace_io = complete_emulated_mmio;
8886 	} else if (vcpu->arch.complete_userspace_io) {
8887 		writeback = false;
8888 		r = 0;
8889 	} else if (r == EMULATION_RESTART)
8890 		goto restart;
8891 	else
8892 		r = 1;
8893 
8894 writeback:
8895 	if (writeback) {
8896 		unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
8897 		toggle_interruptibility(vcpu, ctxt->interruptibility);
8898 		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
8899 
8900 		/*
8901 		 * Note, EXCPT_DB is assumed to be fault-like as the emulator
8902 		 * only supports code breakpoints and general detect #DB, both
8903 		 * of which are fault-like.
8904 		 */
8905 		if (!ctxt->have_exception ||
8906 		    exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
8907 			kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
8908 			if (ctxt->is_branch)
8909 				kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
8910 			kvm_rip_write(vcpu, ctxt->eip);
8911 			if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
8912 				r = kvm_vcpu_do_singlestep(vcpu);
8913 			static_call_cond(kvm_x86_update_emulated_instruction)(vcpu);
8914 			__kvm_set_rflags(vcpu, ctxt->eflags);
8915 		}
8916 
8917 		/*
8918 		 * For STI, interrupts are shadowed; so KVM_REQ_EVENT will
8919 		 * do nothing, and it will be requested again as soon as
8920 		 * the shadow expires.  But we still need to check here,
8921 		 * because POPF has no interrupt shadow.
8922 		 */
8923 		if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
8924 			kvm_make_request(KVM_REQ_EVENT, vcpu);
8925 	} else
8926 		vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
8927 
8928 	return r;
8929 }
8930 
8931 int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type)
8932 {
8933 	return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
8934 }
8935 EXPORT_SYMBOL_GPL(kvm_emulate_instruction);
8936 
8937 int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
8938 					void *insn, int insn_len)
8939 {
8940 	return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len);
8941 }
8942 EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer);
8943 
8944 static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu)
8945 {
8946 	vcpu->arch.pio.count = 0;
8947 	return 1;
8948 }
8949 
8950 static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
8951 {
8952 	vcpu->arch.pio.count = 0;
8953 
8954 	if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip)))
8955 		return 1;
8956 
8957 	return kvm_skip_emulated_instruction(vcpu);
8958 }
8959 
8960 static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
8961 			    unsigned short port)
8962 {
8963 	unsigned long val = kvm_rax_read(vcpu);
8964 	int ret = emulator_pio_out(vcpu, size, port, &val, 1);
8965 
8966 	if (ret)
8967 		return ret;
8968 
8969 	/*
8970 	 * Workaround userspace that relies on old KVM behavior of %rip being
8971 	 * incremented prior to exiting to userspace to handle "OUT 0x7e".
8972 	 */
8973 	if (port == 0x7e &&
8974 	    kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) {
8975 		vcpu->arch.complete_userspace_io =
8976 			complete_fast_pio_out_port_0x7e;
8977 		kvm_skip_emulated_instruction(vcpu);
8978 	} else {
8979 		vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
8980 		vcpu->arch.complete_userspace_io = complete_fast_pio_out;
8981 	}
8982 	return 0;
8983 }
8984 
8985 static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
8986 {
8987 	unsigned long val;
8988 
8989 	/* We should only ever be called with arch.pio.count equal to 1 */
8990 	BUG_ON(vcpu->arch.pio.count != 1);
8991 
8992 	if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) {
8993 		vcpu->arch.pio.count = 0;
8994 		return 1;
8995 	}
8996 
8997 	/* For size less than 4 we merge, else we zero extend */
8998 	val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
8999 
9000 	complete_emulator_pio_in(vcpu, &val);
9001 	kvm_rax_write(vcpu, val);
9002 
9003 	return kvm_skip_emulated_instruction(vcpu);
9004 }
9005 
9006 static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
9007 			   unsigned short port)
9008 {
9009 	unsigned long val;
9010 	int ret;
9011 
9012 	/* For size less than 4 we merge, else we zero extend */
9013 	val = (size < 4) ? kvm_rax_read(vcpu) : 0;
9014 
9015 	ret = emulator_pio_in(vcpu, size, port, &val, 1);
9016 	if (ret) {
9017 		kvm_rax_write(vcpu, val);
9018 		return ret;
9019 	}
9020 
9021 	vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
9022 	vcpu->arch.complete_userspace_io = complete_fast_pio_in;
9023 
9024 	return 0;
9025 }
9026 
9027 int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
9028 {
9029 	int ret;
9030 
9031 	if (in)
9032 		ret = kvm_fast_pio_in(vcpu, size, port);
9033 	else
9034 		ret = kvm_fast_pio_out(vcpu, size, port);
9035 	return ret && kvm_skip_emulated_instruction(vcpu);
9036 }
9037 EXPORT_SYMBOL_GPL(kvm_fast_pio);
9038 
9039 static int kvmclock_cpu_down_prep(unsigned int cpu)
9040 {
9041 	__this_cpu_write(cpu_tsc_khz, 0);
9042 	return 0;
9043 }
9044 
9045 static void tsc_khz_changed(void *data)
9046 {
9047 	struct cpufreq_freqs *freq = data;
9048 	unsigned long khz = 0;
9049 
9050 	WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_CONSTANT_TSC));
9051 
9052 	if (data)
9053 		khz = freq->new;
9054 	else
9055 		khz = cpufreq_quick_get(raw_smp_processor_id());
9056 	if (!khz)
9057 		khz = tsc_khz;
9058 	__this_cpu_write(cpu_tsc_khz, khz);
9059 }
9060 
9061 #ifdef CONFIG_X86_64
9062 static void kvm_hyperv_tsc_notifier(void)
9063 {
9064 	struct kvm *kvm;
9065 	int cpu;
9066 
9067 	mutex_lock(&kvm_lock);
9068 	list_for_each_entry(kvm, &vm_list, vm_list)
9069 		kvm_make_mclock_inprogress_request(kvm);
9070 
9071 	/* no guest entries from this point */
9072 	hyperv_stop_tsc_emulation();
9073 
9074 	/* TSC frequency always matches when on Hyper-V */
9075 	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
9076 		for_each_present_cpu(cpu)
9077 			per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
9078 	}
9079 	kvm_caps.max_guest_tsc_khz = tsc_khz;
9080 
9081 	list_for_each_entry(kvm, &vm_list, vm_list) {
9082 		__kvm_start_pvclock_update(kvm);
9083 		pvclock_update_vm_gtod_copy(kvm);
9084 		kvm_end_pvclock_update(kvm);
9085 	}
9086 
9087 	mutex_unlock(&kvm_lock);
9088 }
9089 #endif
9090 
9091 static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
9092 {
9093 	struct kvm *kvm;
9094 	struct kvm_vcpu *vcpu;
9095 	int send_ipi = 0;
9096 	unsigned long i;
9097 
9098 	/*
9099 	 * We allow guests to temporarily run on slowing clocks,
9100 	 * provided we notify them after, or to run on accelerating
9101 	 * clocks, provided we notify them before.  Thus time never
9102 	 * goes backwards.
9103 	 *
9104 	 * However, we have a problem.  We can't atomically update
9105 	 * the frequency of a given CPU from this function; it is
9106 	 * merely a notifier, which can be called from any CPU.
9107 	 * Changing the TSC frequency at arbitrary points in time
9108 	 * requires a recomputation of local variables related to
9109 	 * the TSC for each VCPU.  We must flag these local variables
9110 	 * to be updated and be sure the update takes place with the
9111 	 * new frequency before any guests proceed.
9112 	 *
9113 	 * Unfortunately, the combination of hotplug CPU and frequency
9114 	 * change creates an intractable locking scenario; the order
9115 	 * of when these callouts happen is undefined with respect to
9116 	 * CPU hotplug, and they can race with each other.  As such,
9117 	 * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
9118 	 * undefined; you can actually have a CPU frequency change take
9119 	 * place in between the computation of X and the setting of the
9120 	 * variable.  To protect against this problem, all updates of
9121 	 * the per_cpu tsc_khz variable are done in an interrupt
9122 	 * protected IPI, and all callers wishing to update the value
9123 	 * must wait for a synchronous IPI to complete (which is trivial
9124 	 * if the caller is on the CPU already).  This establishes the
9125 	 * necessary total order on variable updates.
9126 	 *
9127 	 * Note that because a guest time update may take place
9128 	 * anytime after the setting of the VCPU's request bit, the
9129 	 * correct TSC value must be set before the request.  However,
9130 	 * to ensure the update actually makes it to any guest which
9131 	 * starts running in hardware virtualization between the set
9132 	 * and the acquisition of the spinlock, we must also ping the
9133 	 * CPU after setting the request bit.
9134 	 *
9135 	 */
9136 
9137 	smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
9138 
9139 	mutex_lock(&kvm_lock);
9140 	list_for_each_entry(kvm, &vm_list, vm_list) {
9141 		kvm_for_each_vcpu(i, vcpu, kvm) {
9142 			if (vcpu->cpu != cpu)
9143 				continue;
9144 			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
9145 			if (vcpu->cpu != raw_smp_processor_id())
9146 				send_ipi = 1;
9147 		}
9148 	}
9149 	mutex_unlock(&kvm_lock);
9150 
9151 	if (freq->old < freq->new && send_ipi) {
9152 		/*
9153 		 * We upscale the frequency.  Must make the guest
9154 		 * doesn't see old kvmclock values while running with
9155 		 * the new frequency, otherwise we risk the guest sees
9156 		 * time go backwards.
9157 		 *
9158 		 * In case we update the frequency for another cpu
9159 		 * (which might be in guest context) send an interrupt
9160 		 * to kick the cpu out of guest context.  Next time
9161 		 * guest context is entered kvmclock will be updated,
9162 		 * so the guest will not see stale values.
9163 		 */
9164 		smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
9165 	}
9166 }
9167 
9168 static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
9169 				     void *data)
9170 {
9171 	struct cpufreq_freqs *freq = data;
9172 	int cpu;
9173 
9174 	if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
9175 		return 0;
9176 	if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
9177 		return 0;
9178 
9179 	for_each_cpu(cpu, freq->policy->cpus)
9180 		__kvmclock_cpufreq_notifier(freq, cpu);
9181 
9182 	return 0;
9183 }
9184 
9185 static struct notifier_block kvmclock_cpufreq_notifier_block = {
9186 	.notifier_call  = kvmclock_cpufreq_notifier
9187 };
9188 
9189 static int kvmclock_cpu_online(unsigned int cpu)
9190 {
9191 	tsc_khz_changed(NULL);
9192 	return 0;
9193 }
9194 
9195 static void kvm_timer_init(void)
9196 {
9197 	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
9198 		max_tsc_khz = tsc_khz;
9199 
9200 		if (IS_ENABLED(CONFIG_CPU_FREQ)) {
9201 			struct cpufreq_policy *policy;
9202 			int cpu;
9203 
9204 			cpu = get_cpu();
9205 			policy = cpufreq_cpu_get(cpu);
9206 			if (policy) {
9207 				if (policy->cpuinfo.max_freq)
9208 					max_tsc_khz = policy->cpuinfo.max_freq;
9209 				cpufreq_cpu_put(policy);
9210 			}
9211 			put_cpu();
9212 		}
9213 		cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
9214 					  CPUFREQ_TRANSITION_NOTIFIER);
9215 
9216 		cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
9217 				  kvmclock_cpu_online, kvmclock_cpu_down_prep);
9218 	}
9219 }
9220 
9221 #ifdef CONFIG_X86_64
9222 static void pvclock_gtod_update_fn(struct work_struct *work)
9223 {
9224 	struct kvm *kvm;
9225 	struct kvm_vcpu *vcpu;
9226 	unsigned long i;
9227 
9228 	mutex_lock(&kvm_lock);
9229 	list_for_each_entry(kvm, &vm_list, vm_list)
9230 		kvm_for_each_vcpu(i, vcpu, kvm)
9231 			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
9232 	atomic_set(&kvm_guest_has_master_clock, 0);
9233 	mutex_unlock(&kvm_lock);
9234 }
9235 
9236 static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
9237 
9238 /*
9239  * Indirection to move queue_work() out of the tk_core.seq write held
9240  * region to prevent possible deadlocks against time accessors which
9241  * are invoked with work related locks held.
9242  */
9243 static void pvclock_irq_work_fn(struct irq_work *w)
9244 {
9245 	queue_work(system_long_wq, &pvclock_gtod_work);
9246 }
9247 
9248 static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn);
9249 
9250 /*
9251  * Notification about pvclock gtod data update.
9252  */
9253 static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
9254 			       void *priv)
9255 {
9256 	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
9257 	struct timekeeper *tk = priv;
9258 
9259 	update_pvclock_gtod(tk);
9260 
9261 	/*
9262 	 * Disable master clock if host does not trust, or does not use,
9263 	 * TSC based clocksource. Delegate queue_work() to irq_work as
9264 	 * this is invoked with tk_core.seq write held.
9265 	 */
9266 	if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
9267 	    atomic_read(&kvm_guest_has_master_clock) != 0)
9268 		irq_work_queue(&pvclock_irq_work);
9269 	return 0;
9270 }
9271 
9272 static struct notifier_block pvclock_gtod_notifier = {
9273 	.notifier_call = pvclock_gtod_notify,
9274 };
9275 #endif
9276 
9277 int kvm_arch_init(void *opaque)
9278 {
9279 	struct kvm_x86_init_ops *ops = opaque;
9280 	u64 host_pat;
9281 	int r;
9282 
9283 	if (kvm_x86_ops.hardware_enable) {
9284 		pr_err("kvm: already loaded vendor module '%s'\n", kvm_x86_ops.name);
9285 		return -EEXIST;
9286 	}
9287 
9288 	if (!ops->cpu_has_kvm_support()) {
9289 		pr_err_ratelimited("kvm: no hardware support for '%s'\n",
9290 				   ops->runtime_ops->name);
9291 		return -EOPNOTSUPP;
9292 	}
9293 	if (ops->disabled_by_bios()) {
9294 		pr_err_ratelimited("kvm: support for '%s' disabled by bios\n",
9295 				   ops->runtime_ops->name);
9296 		return -EOPNOTSUPP;
9297 	}
9298 
9299 	/*
9300 	 * KVM explicitly assumes that the guest has an FPU and
9301 	 * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
9302 	 * vCPU's FPU state as a fxregs_state struct.
9303 	 */
9304 	if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) {
9305 		printk(KERN_ERR "kvm: inadequate fpu\n");
9306 		return -EOPNOTSUPP;
9307 	}
9308 
9309 	if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
9310 		pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n");
9311 		return -EOPNOTSUPP;
9312 	}
9313 
9314 	/*
9315 	 * KVM assumes that PAT entry '0' encodes WB memtype and simply zeroes
9316 	 * the PAT bits in SPTEs.  Bail if PAT[0] is programmed to something
9317 	 * other than WB.  Note, EPT doesn't utilize the PAT, but don't bother
9318 	 * with an exception.  PAT[0] is set to WB on RESET and also by the
9319 	 * kernel, i.e. failure indicates a kernel bug or broken firmware.
9320 	 */
9321 	if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) ||
9322 	    (host_pat & GENMASK(2, 0)) != 6) {
9323 		pr_err("kvm: host PAT[0] is not WB\n");
9324 		return -EIO;
9325 	}
9326 
9327 	x86_emulator_cache = kvm_alloc_emulator_cache();
9328 	if (!x86_emulator_cache) {
9329 		pr_err("kvm: failed to allocate cache for x86 emulator\n");
9330 		return -ENOMEM;
9331 	}
9332 
9333 	user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
9334 	if (!user_return_msrs) {
9335 		printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
9336 		r = -ENOMEM;
9337 		goto out_free_x86_emulator_cache;
9338 	}
9339 	kvm_nr_uret_msrs = 0;
9340 
9341 	r = kvm_mmu_vendor_module_init();
9342 	if (r)
9343 		goto out_free_percpu;
9344 
9345 	kvm_timer_init();
9346 
9347 	if (boot_cpu_has(X86_FEATURE_XSAVE)) {
9348 		host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
9349 		kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
9350 	}
9351 
9352 	if (pi_inject_timer == -1)
9353 		pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER);
9354 #ifdef CONFIG_X86_64
9355 	pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
9356 
9357 	if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
9358 		set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
9359 #endif
9360 
9361 	return 0;
9362 
9363 out_free_percpu:
9364 	free_percpu(user_return_msrs);
9365 out_free_x86_emulator_cache:
9366 	kmem_cache_destroy(x86_emulator_cache);
9367 	return r;
9368 }
9369 
9370 void kvm_arch_exit(void)
9371 {
9372 #ifdef CONFIG_X86_64
9373 	if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
9374 		clear_hv_tscchange_cb();
9375 #endif
9376 	kvm_lapic_exit();
9377 
9378 	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
9379 		cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
9380 					    CPUFREQ_TRANSITION_NOTIFIER);
9381 		cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
9382 	}
9383 #ifdef CONFIG_X86_64
9384 	pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
9385 	irq_work_sync(&pvclock_irq_work);
9386 	cancel_work_sync(&pvclock_gtod_work);
9387 #endif
9388 	kvm_x86_ops.hardware_enable = NULL;
9389 	kvm_mmu_vendor_module_exit();
9390 	free_percpu(user_return_msrs);
9391 	kmem_cache_destroy(x86_emulator_cache);
9392 #ifdef CONFIG_KVM_XEN
9393 	static_key_deferred_flush(&kvm_xen_enabled);
9394 	WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key));
9395 #endif
9396 }
9397 
9398 static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason)
9399 {
9400 	/*
9401 	 * The vCPU has halted, e.g. executed HLT.  Update the run state if the
9402 	 * local APIC is in-kernel, the run loop will detect the non-runnable
9403 	 * state and halt the vCPU.  Exit to userspace if the local APIC is
9404 	 * managed by userspace, in which case userspace is responsible for
9405 	 * handling wake events.
9406 	 */
9407 	++vcpu->stat.halt_exits;
9408 	if (lapic_in_kernel(vcpu)) {
9409 		vcpu->arch.mp_state = state;
9410 		return 1;
9411 	} else {
9412 		vcpu->run->exit_reason = reason;
9413 		return 0;
9414 	}
9415 }
9416 
9417 int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
9418 {
9419 	return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT);
9420 }
9421 EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip);
9422 
9423 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
9424 {
9425 	int ret = kvm_skip_emulated_instruction(vcpu);
9426 	/*
9427 	 * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
9428 	 * KVM_EXIT_DEBUG here.
9429 	 */
9430 	return kvm_emulate_halt_noskip(vcpu) && ret;
9431 }
9432 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
9433 
9434 int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu)
9435 {
9436 	int ret = kvm_skip_emulated_instruction(vcpu);
9437 
9438 	return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD,
9439 					KVM_EXIT_AP_RESET_HOLD) && ret;
9440 }
9441 EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold);
9442 
9443 #ifdef CONFIG_X86_64
9444 static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
9445 			        unsigned long clock_type)
9446 {
9447 	struct kvm_clock_pairing clock_pairing;
9448 	struct timespec64 ts;
9449 	u64 cycle;
9450 	int ret;
9451 
9452 	if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
9453 		return -KVM_EOPNOTSUPP;
9454 
9455 	/*
9456 	 * When tsc is in permanent catchup mode guests won't be able to use
9457 	 * pvclock_read_retry loop to get consistent view of pvclock
9458 	 */
9459 	if (vcpu->arch.tsc_always_catchup)
9460 		return -KVM_EOPNOTSUPP;
9461 
9462 	if (!kvm_get_walltime_and_clockread(&ts, &cycle))
9463 		return -KVM_EOPNOTSUPP;
9464 
9465 	clock_pairing.sec = ts.tv_sec;
9466 	clock_pairing.nsec = ts.tv_nsec;
9467 	clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle);
9468 	clock_pairing.flags = 0;
9469 	memset(&clock_pairing.pad, 0, sizeof(clock_pairing.pad));
9470 
9471 	ret = 0;
9472 	if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
9473 			    sizeof(struct kvm_clock_pairing)))
9474 		ret = -KVM_EFAULT;
9475 
9476 	return ret;
9477 }
9478 #endif
9479 
9480 /*
9481  * kvm_pv_kick_cpu_op:  Kick a vcpu.
9482  *
9483  * @apicid - apicid of vcpu to be kicked.
9484  */
9485 static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid)
9486 {
9487 	/*
9488 	 * All other fields are unused for APIC_DM_REMRD, but may be consumed by
9489 	 * common code, e.g. for tracing. Defer initialization to the compiler.
9490 	 */
9491 	struct kvm_lapic_irq lapic_irq = {
9492 		.delivery_mode = APIC_DM_REMRD,
9493 		.dest_mode = APIC_DEST_PHYSICAL,
9494 		.shorthand = APIC_DEST_NOSHORT,
9495 		.dest_id = apicid,
9496 	};
9497 
9498 	kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
9499 }
9500 
9501 bool kvm_apicv_activated(struct kvm *kvm)
9502 {
9503 	return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
9504 }
9505 EXPORT_SYMBOL_GPL(kvm_apicv_activated);
9506 
9507 bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
9508 {
9509 	ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
9510 	ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu);
9511 
9512 	return (vm_reasons | vcpu_reasons) == 0;
9513 }
9514 EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated);
9515 
9516 static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
9517 				       enum kvm_apicv_inhibit reason, bool set)
9518 {
9519 	if (set)
9520 		__set_bit(reason, inhibits);
9521 	else
9522 		__clear_bit(reason, inhibits);
9523 
9524 	trace_kvm_apicv_inhibit_changed(reason, set, *inhibits);
9525 }
9526 
9527 static void kvm_apicv_init(struct kvm *kvm)
9528 {
9529 	unsigned long *inhibits = &kvm->arch.apicv_inhibit_reasons;
9530 
9531 	init_rwsem(&kvm->arch.apicv_update_lock);
9532 
9533 	set_or_clear_apicv_inhibit(inhibits, APICV_INHIBIT_REASON_ABSENT, true);
9534 
9535 	if (!enable_apicv)
9536 		set_or_clear_apicv_inhibit(inhibits,
9537 					   APICV_INHIBIT_REASON_DISABLE, true);
9538 }
9539 
9540 static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id)
9541 {
9542 	struct kvm_vcpu *target = NULL;
9543 	struct kvm_apic_map *map;
9544 
9545 	vcpu->stat.directed_yield_attempted++;
9546 
9547 	if (single_task_running())
9548 		goto no_yield;
9549 
9550 	rcu_read_lock();
9551 	map = rcu_dereference(vcpu->kvm->arch.apic_map);
9552 
9553 	if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
9554 		target = map->phys_map[dest_id]->vcpu;
9555 
9556 	rcu_read_unlock();
9557 
9558 	if (!target || !READ_ONCE(target->ready))
9559 		goto no_yield;
9560 
9561 	/* Ignore requests to yield to self */
9562 	if (vcpu == target)
9563 		goto no_yield;
9564 
9565 	if (kvm_vcpu_yield_to(target) <= 0)
9566 		goto no_yield;
9567 
9568 	vcpu->stat.directed_yield_successful++;
9569 
9570 no_yield:
9571 	return;
9572 }
9573 
9574 static int complete_hypercall_exit(struct kvm_vcpu *vcpu)
9575 {
9576 	u64 ret = vcpu->run->hypercall.ret;
9577 
9578 	if (!is_64_bit_mode(vcpu))
9579 		ret = (u32)ret;
9580 	kvm_rax_write(vcpu, ret);
9581 	++vcpu->stat.hypercalls;
9582 	return kvm_skip_emulated_instruction(vcpu);
9583 }
9584 
9585 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
9586 {
9587 	unsigned long nr, a0, a1, a2, a3, ret;
9588 	int op_64_bit;
9589 
9590 	if (kvm_xen_hypercall_enabled(vcpu->kvm))
9591 		return kvm_xen_hypercall(vcpu);
9592 
9593 	if (kvm_hv_hypercall_enabled(vcpu))
9594 		return kvm_hv_hypercall(vcpu);
9595 
9596 	nr = kvm_rax_read(vcpu);
9597 	a0 = kvm_rbx_read(vcpu);
9598 	a1 = kvm_rcx_read(vcpu);
9599 	a2 = kvm_rdx_read(vcpu);
9600 	a3 = kvm_rsi_read(vcpu);
9601 
9602 	trace_kvm_hypercall(nr, a0, a1, a2, a3);
9603 
9604 	op_64_bit = is_64_bit_hypercall(vcpu);
9605 	if (!op_64_bit) {
9606 		nr &= 0xFFFFFFFF;
9607 		a0 &= 0xFFFFFFFF;
9608 		a1 &= 0xFFFFFFFF;
9609 		a2 &= 0xFFFFFFFF;
9610 		a3 &= 0xFFFFFFFF;
9611 	}
9612 
9613 	if (static_call(kvm_x86_get_cpl)(vcpu) != 0) {
9614 		ret = -KVM_EPERM;
9615 		goto out;
9616 	}
9617 
9618 	ret = -KVM_ENOSYS;
9619 
9620 	switch (nr) {
9621 	case KVM_HC_VAPIC_POLL_IRQ:
9622 		ret = 0;
9623 		break;
9624 	case KVM_HC_KICK_CPU:
9625 		if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT))
9626 			break;
9627 
9628 		kvm_pv_kick_cpu_op(vcpu->kvm, a1);
9629 		kvm_sched_yield(vcpu, a1);
9630 		ret = 0;
9631 		break;
9632 #ifdef CONFIG_X86_64
9633 	case KVM_HC_CLOCK_PAIRING:
9634 		ret = kvm_pv_clock_pairing(vcpu, a0, a1);
9635 		break;
9636 #endif
9637 	case KVM_HC_SEND_IPI:
9638 		if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI))
9639 			break;
9640 
9641 		ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
9642 		break;
9643 	case KVM_HC_SCHED_YIELD:
9644 		if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
9645 			break;
9646 
9647 		kvm_sched_yield(vcpu, a0);
9648 		ret = 0;
9649 		break;
9650 	case KVM_HC_MAP_GPA_RANGE: {
9651 		u64 gpa = a0, npages = a1, attrs = a2;
9652 
9653 		ret = -KVM_ENOSYS;
9654 		if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE)))
9655 			break;
9656 
9657 		if (!PAGE_ALIGNED(gpa) || !npages ||
9658 		    gpa_to_gfn(gpa) + npages <= gpa_to_gfn(gpa)) {
9659 			ret = -KVM_EINVAL;
9660 			break;
9661 		}
9662 
9663 		vcpu->run->exit_reason        = KVM_EXIT_HYPERCALL;
9664 		vcpu->run->hypercall.nr       = KVM_HC_MAP_GPA_RANGE;
9665 		vcpu->run->hypercall.args[0]  = gpa;
9666 		vcpu->run->hypercall.args[1]  = npages;
9667 		vcpu->run->hypercall.args[2]  = attrs;
9668 		vcpu->run->hypercall.longmode = op_64_bit;
9669 		vcpu->arch.complete_userspace_io = complete_hypercall_exit;
9670 		return 0;
9671 	}
9672 	default:
9673 		ret = -KVM_ENOSYS;
9674 		break;
9675 	}
9676 out:
9677 	if (!op_64_bit)
9678 		ret = (u32)ret;
9679 	kvm_rax_write(vcpu, ret);
9680 
9681 	++vcpu->stat.hypercalls;
9682 	return kvm_skip_emulated_instruction(vcpu);
9683 }
9684 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
9685 
9686 static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
9687 {
9688 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
9689 	char instruction[3];
9690 	unsigned long rip = kvm_rip_read(vcpu);
9691 
9692 	/*
9693 	 * If the quirk is disabled, synthesize a #UD and let the guest pick up
9694 	 * the pieces.
9695 	 */
9696 	if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) {
9697 		ctxt->exception.error_code_valid = false;
9698 		ctxt->exception.vector = UD_VECTOR;
9699 		ctxt->have_exception = true;
9700 		return X86EMUL_PROPAGATE_FAULT;
9701 	}
9702 
9703 	static_call(kvm_x86_patch_hypercall)(vcpu, instruction);
9704 
9705 	return emulator_write_emulated(ctxt, rip, instruction, 3,
9706 		&ctxt->exception);
9707 }
9708 
9709 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
9710 {
9711 	return vcpu->run->request_interrupt_window &&
9712 		likely(!pic_in_kernel(vcpu->kvm));
9713 }
9714 
9715 /* Called within kvm->srcu read side.  */
9716 static void post_kvm_run_save(struct kvm_vcpu *vcpu)
9717 {
9718 	struct kvm_run *kvm_run = vcpu->run;
9719 
9720 	kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu);
9721 	kvm_run->cr8 = kvm_get_cr8(vcpu);
9722 	kvm_run->apic_base = kvm_get_apic_base(vcpu);
9723 
9724 	kvm_run->ready_for_interrupt_injection =
9725 		pic_in_kernel(vcpu->kvm) ||
9726 		kvm_vcpu_ready_for_interrupt_injection(vcpu);
9727 
9728 	if (is_smm(vcpu))
9729 		kvm_run->flags |= KVM_RUN_X86_SMM;
9730 }
9731 
9732 static void update_cr8_intercept(struct kvm_vcpu *vcpu)
9733 {
9734 	int max_irr, tpr;
9735 
9736 	if (!kvm_x86_ops.update_cr8_intercept)
9737 		return;
9738 
9739 	if (!lapic_in_kernel(vcpu))
9740 		return;
9741 
9742 	if (vcpu->arch.apic->apicv_active)
9743 		return;
9744 
9745 	if (!vcpu->arch.apic->vapic_addr)
9746 		max_irr = kvm_lapic_find_highest_irr(vcpu);
9747 	else
9748 		max_irr = -1;
9749 
9750 	if (max_irr != -1)
9751 		max_irr >>= 4;
9752 
9753 	tpr = kvm_lapic_get_cr8(vcpu);
9754 
9755 	static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr);
9756 }
9757 
9758 
9759 int kvm_check_nested_events(struct kvm_vcpu *vcpu)
9760 {
9761 	if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
9762 		kvm_x86_ops.nested_ops->triple_fault(vcpu);
9763 		return 1;
9764 	}
9765 
9766 	return kvm_x86_ops.nested_ops->check_events(vcpu);
9767 }
9768 
9769 static void kvm_inject_exception(struct kvm_vcpu *vcpu)
9770 {
9771 	trace_kvm_inj_exception(vcpu->arch.exception.vector,
9772 				vcpu->arch.exception.has_error_code,
9773 				vcpu->arch.exception.error_code,
9774 				vcpu->arch.exception.injected);
9775 
9776 	if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
9777 		vcpu->arch.exception.error_code = false;
9778 	static_call(kvm_x86_inject_exception)(vcpu);
9779 }
9780 
9781 /*
9782  * Check for any event (interrupt or exception) that is ready to be injected,
9783  * and if there is at least one event, inject the event with the highest
9784  * priority.  This handles both "pending" events, i.e. events that have never
9785  * been injected into the guest, and "injected" events, i.e. events that were
9786  * injected as part of a previous VM-Enter, but weren't successfully delivered
9787  * and need to be re-injected.
9788  *
9789  * Note, this is not guaranteed to be invoked on a guest instruction boundary,
9790  * i.e. doesn't guarantee that there's an event window in the guest.  KVM must
9791  * be able to inject exceptions in the "middle" of an instruction, and so must
9792  * also be able to re-inject NMIs and IRQs in the middle of an instruction.
9793  * I.e. for exceptions and re-injected events, NOT invoking this on instruction
9794  * boundaries is necessary and correct.
9795  *
9796  * For simplicity, KVM uses a single path to inject all events (except events
9797  * that are injected directly from L1 to L2) and doesn't explicitly track
9798  * instruction boundaries for asynchronous events.  However, because VM-Exits
9799  * that can occur during instruction execution typically result in KVM skipping
9800  * the instruction or injecting an exception, e.g. instruction and exception
9801  * intercepts, and because pending exceptions have higher priority than pending
9802  * interrupts, KVM still honors instruction boundaries in most scenarios.
9803  *
9804  * But, if a VM-Exit occurs during instruction execution, and KVM does NOT skip
9805  * the instruction or inject an exception, then KVM can incorrecty inject a new
9806  * asynchrounous event if the event became pending after the CPU fetched the
9807  * instruction (in the guest).  E.g. if a page fault (#PF, #NPF, EPT violation)
9808  * occurs and is resolved by KVM, a coincident NMI, SMI, IRQ, etc... can be
9809  * injected on the restarted instruction instead of being deferred until the
9810  * instruction completes.
9811  *
9812  * In practice, this virtualization hole is unlikely to be observed by the
9813  * guest, and even less likely to cause functional problems.  To detect the
9814  * hole, the guest would have to trigger an event on a side effect of an early
9815  * phase of instruction execution, e.g. on the instruction fetch from memory.
9816  * And for it to be a functional problem, the guest would need to depend on the
9817  * ordering between that side effect, the instruction completing, _and_ the
9818  * delivery of the asynchronous event.
9819  */
9820 static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
9821 				       bool *req_immediate_exit)
9822 {
9823 	bool can_inject;
9824 	int r;
9825 
9826 	/*
9827 	 * Process nested events first, as nested VM-Exit supercedes event
9828 	 * re-injection.  If there's an event queued for re-injection, it will
9829 	 * be saved into the appropriate vmc{b,s}12 fields on nested VM-Exit.
9830 	 */
9831 	if (is_guest_mode(vcpu))
9832 		r = kvm_check_nested_events(vcpu);
9833 	else
9834 		r = 0;
9835 
9836 	/*
9837 	 * Re-inject exceptions and events *especially* if immediate entry+exit
9838 	 * to/from L2 is needed, as any event that has already been injected
9839 	 * into L2 needs to complete its lifecycle before injecting a new event.
9840 	 *
9841 	 * Don't re-inject an NMI or interrupt if there is a pending exception.
9842 	 * This collision arises if an exception occurred while vectoring the
9843 	 * injected event, KVM intercepted said exception, and KVM ultimately
9844 	 * determined the fault belongs to the guest and queues the exception
9845 	 * for injection back into the guest.
9846 	 *
9847 	 * "Injected" interrupts can also collide with pending exceptions if
9848 	 * userspace ignores the "ready for injection" flag and blindly queues
9849 	 * an interrupt.  In that case, prioritizing the exception is correct,
9850 	 * as the exception "occurred" before the exit to userspace.  Trap-like
9851 	 * exceptions, e.g. most #DBs, have higher priority than interrupts.
9852 	 * And while fault-like exceptions, e.g. #GP and #PF, are the lowest
9853 	 * priority, they're only generated (pended) during instruction
9854 	 * execution, and interrupts are recognized at instruction boundaries.
9855 	 * Thus a pending fault-like exception means the fault occurred on the
9856 	 * *previous* instruction and must be serviced prior to recognizing any
9857 	 * new events in order to fully complete the previous instruction.
9858 	 */
9859 	if (vcpu->arch.exception.injected)
9860 		kvm_inject_exception(vcpu);
9861 	else if (kvm_is_exception_pending(vcpu))
9862 		; /* see above */
9863 	else if (vcpu->arch.nmi_injected)
9864 		static_call(kvm_x86_inject_nmi)(vcpu);
9865 	else if (vcpu->arch.interrupt.injected)
9866 		static_call(kvm_x86_inject_irq)(vcpu, true);
9867 
9868 	/*
9869 	 * Exceptions that morph to VM-Exits are handled above, and pending
9870 	 * exceptions on top of injected exceptions that do not VM-Exit should
9871 	 * either morph to #DF or, sadly, override the injected exception.
9872 	 */
9873 	WARN_ON_ONCE(vcpu->arch.exception.injected &&
9874 		     vcpu->arch.exception.pending);
9875 
9876 	/*
9877 	 * Bail if immediate entry+exit to/from the guest is needed to complete
9878 	 * nested VM-Enter or event re-injection so that a different pending
9879 	 * event can be serviced (or if KVM needs to exit to userspace).
9880 	 *
9881 	 * Otherwise, continue processing events even if VM-Exit occurred.  The
9882 	 * VM-Exit will have cleared exceptions that were meant for L2, but
9883 	 * there may now be events that can be injected into L1.
9884 	 */
9885 	if (r < 0)
9886 		goto out;
9887 
9888 	/*
9889 	 * A pending exception VM-Exit should either result in nested VM-Exit
9890 	 * or force an immediate re-entry and exit to/from L2, and exception
9891 	 * VM-Exits cannot be injected (flag should _never_ be set).
9892 	 */
9893 	WARN_ON_ONCE(vcpu->arch.exception_vmexit.injected ||
9894 		     vcpu->arch.exception_vmexit.pending);
9895 
9896 	/*
9897 	 * New events, other than exceptions, cannot be injected if KVM needs
9898 	 * to re-inject a previous event.  See above comments on re-injecting
9899 	 * for why pending exceptions get priority.
9900 	 */
9901 	can_inject = !kvm_event_needs_reinjection(vcpu);
9902 
9903 	if (vcpu->arch.exception.pending) {
9904 		/*
9905 		 * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS
9906 		 * value pushed on the stack.  Trap-like exception and all #DBs
9907 		 * leave RF as-is (KVM follows Intel's behavior in this regard;
9908 		 * AMD states that code breakpoint #DBs excplitly clear RF=0).
9909 		 *
9910 		 * Note, most versions of Intel's SDM and AMD's APM incorrectly
9911 		 * describe the behavior of General Detect #DBs, which are
9912 		 * fault-like.  They do _not_ set RF, a la code breakpoints.
9913 		 */
9914 		if (exception_type(vcpu->arch.exception.vector) == EXCPT_FAULT)
9915 			__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
9916 					     X86_EFLAGS_RF);
9917 
9918 		if (vcpu->arch.exception.vector == DB_VECTOR) {
9919 			kvm_deliver_exception_payload(vcpu, &vcpu->arch.exception);
9920 			if (vcpu->arch.dr7 & DR7_GD) {
9921 				vcpu->arch.dr7 &= ~DR7_GD;
9922 				kvm_update_dr7(vcpu);
9923 			}
9924 		}
9925 
9926 		kvm_inject_exception(vcpu);
9927 
9928 		vcpu->arch.exception.pending = false;
9929 		vcpu->arch.exception.injected = true;
9930 
9931 		can_inject = false;
9932 	}
9933 
9934 	/* Don't inject interrupts if the user asked to avoid doing so */
9935 	if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ)
9936 		return 0;
9937 
9938 	/*
9939 	 * Finally, inject interrupt events.  If an event cannot be injected
9940 	 * due to architectural conditions (e.g. IF=0) a window-open exit
9941 	 * will re-request KVM_REQ_EVENT.  Sometimes however an event is pending
9942 	 * and can architecturally be injected, but we cannot do it right now:
9943 	 * an interrupt could have arrived just now and we have to inject it
9944 	 * as a vmexit, or there could already an event in the queue, which is
9945 	 * indicated by can_inject.  In that case we request an immediate exit
9946 	 * in order to make progress and get back here for another iteration.
9947 	 * The kvm_x86_ops hooks communicate this by returning -EBUSY.
9948 	 */
9949 #ifdef CONFIG_KVM_SMM
9950 	if (vcpu->arch.smi_pending) {
9951 		r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY;
9952 		if (r < 0)
9953 			goto out;
9954 		if (r) {
9955 			vcpu->arch.smi_pending = false;
9956 			++vcpu->arch.smi_count;
9957 			enter_smm(vcpu);
9958 			can_inject = false;
9959 		} else
9960 			static_call(kvm_x86_enable_smi_window)(vcpu);
9961 	}
9962 #endif
9963 
9964 	if (vcpu->arch.nmi_pending) {
9965 		r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY;
9966 		if (r < 0)
9967 			goto out;
9968 		if (r) {
9969 			--vcpu->arch.nmi_pending;
9970 			vcpu->arch.nmi_injected = true;
9971 			static_call(kvm_x86_inject_nmi)(vcpu);
9972 			can_inject = false;
9973 			WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0);
9974 		}
9975 		if (vcpu->arch.nmi_pending)
9976 			static_call(kvm_x86_enable_nmi_window)(vcpu);
9977 	}
9978 
9979 	if (kvm_cpu_has_injectable_intr(vcpu)) {
9980 		r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY;
9981 		if (r < 0)
9982 			goto out;
9983 		if (r) {
9984 			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
9985 			static_call(kvm_x86_inject_irq)(vcpu, false);
9986 			WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0);
9987 		}
9988 		if (kvm_cpu_has_injectable_intr(vcpu))
9989 			static_call(kvm_x86_enable_irq_window)(vcpu);
9990 	}
9991 
9992 	if (is_guest_mode(vcpu) &&
9993 	    kvm_x86_ops.nested_ops->has_events &&
9994 	    kvm_x86_ops.nested_ops->has_events(vcpu))
9995 		*req_immediate_exit = true;
9996 
9997 	/*
9998 	 * KVM must never queue a new exception while injecting an event; KVM
9999 	 * is done emulating and should only propagate the to-be-injected event
10000 	 * to the VMCS/VMCB.  Queueing a new exception can put the vCPU into an
10001 	 * infinite loop as KVM will bail from VM-Enter to inject the pending
10002 	 * exception and start the cycle all over.
10003 	 *
10004 	 * Exempt triple faults as they have special handling and won't put the
10005 	 * vCPU into an infinite loop.  Triple fault can be queued when running
10006 	 * VMX without unrestricted guest, as that requires KVM to emulate Real
10007 	 * Mode events (see kvm_inject_realmode_interrupt()).
10008 	 */
10009 	WARN_ON_ONCE(vcpu->arch.exception.pending ||
10010 		     vcpu->arch.exception_vmexit.pending);
10011 	return 0;
10012 
10013 out:
10014 	if (r == -EBUSY) {
10015 		*req_immediate_exit = true;
10016 		r = 0;
10017 	}
10018 	return r;
10019 }
10020 
10021 static void process_nmi(struct kvm_vcpu *vcpu)
10022 {
10023 	unsigned limit = 2;
10024 
10025 	/*
10026 	 * x86 is limited to one NMI running, and one NMI pending after it.
10027 	 * If an NMI is already in progress, limit further NMIs to just one.
10028 	 * Otherwise, allow two (and we'll inject the first one immediately).
10029 	 */
10030 	if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected)
10031 		limit = 1;
10032 
10033 	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
10034 	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
10035 	kvm_make_request(KVM_REQ_EVENT, vcpu);
10036 }
10037 
10038 void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
10039 				       unsigned long *vcpu_bitmap)
10040 {
10041 	kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, vcpu_bitmap);
10042 }
10043 
10044 void kvm_make_scan_ioapic_request(struct kvm *kvm)
10045 {
10046 	kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
10047 }
10048 
10049 void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
10050 {
10051 	struct kvm_lapic *apic = vcpu->arch.apic;
10052 	bool activate;
10053 
10054 	if (!lapic_in_kernel(vcpu))
10055 		return;
10056 
10057 	down_read(&vcpu->kvm->arch.apicv_update_lock);
10058 	preempt_disable();
10059 
10060 	/* Do not activate APICV when APIC is disabled */
10061 	activate = kvm_vcpu_apicv_activated(vcpu) &&
10062 		   (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED);
10063 
10064 	if (apic->apicv_active == activate)
10065 		goto out;
10066 
10067 	apic->apicv_active = activate;
10068 	kvm_apic_update_apicv(vcpu);
10069 	static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu);
10070 
10071 	/*
10072 	 * When APICv gets disabled, we may still have injected interrupts
10073 	 * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was
10074 	 * still active when the interrupt got accepted. Make sure
10075 	 * kvm_check_and_inject_events() is called to check for that.
10076 	 */
10077 	if (!apic->apicv_active)
10078 		kvm_make_request(KVM_REQ_EVENT, vcpu);
10079 
10080 out:
10081 	preempt_enable();
10082 	up_read(&vcpu->kvm->arch.apicv_update_lock);
10083 }
10084 EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
10085 
10086 void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
10087 				      enum kvm_apicv_inhibit reason, bool set)
10088 {
10089 	unsigned long old, new;
10090 
10091 	lockdep_assert_held_write(&kvm->arch.apicv_update_lock);
10092 
10093 	if (!static_call(kvm_x86_check_apicv_inhibit_reasons)(reason))
10094 		return;
10095 
10096 	old = new = kvm->arch.apicv_inhibit_reasons;
10097 
10098 	set_or_clear_apicv_inhibit(&new, reason, set);
10099 
10100 	if (!!old != !!new) {
10101 		/*
10102 		 * Kick all vCPUs before setting apicv_inhibit_reasons to avoid
10103 		 * false positives in the sanity check WARN in svm_vcpu_run().
10104 		 * This task will wait for all vCPUs to ack the kick IRQ before
10105 		 * updating apicv_inhibit_reasons, and all other vCPUs will
10106 		 * block on acquiring apicv_update_lock so that vCPUs can't
10107 		 * redo svm_vcpu_run() without seeing the new inhibit state.
10108 		 *
10109 		 * Note, holding apicv_update_lock and taking it in the read
10110 		 * side (handling the request) also prevents other vCPUs from
10111 		 * servicing the request with a stale apicv_inhibit_reasons.
10112 		 */
10113 		kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
10114 		kvm->arch.apicv_inhibit_reasons = new;
10115 		if (new) {
10116 			unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE);
10117 			int idx = srcu_read_lock(&kvm->srcu);
10118 
10119 			kvm_zap_gfn_range(kvm, gfn, gfn+1);
10120 			srcu_read_unlock(&kvm->srcu, idx);
10121 		}
10122 	} else {
10123 		kvm->arch.apicv_inhibit_reasons = new;
10124 	}
10125 }
10126 
10127 void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
10128 				    enum kvm_apicv_inhibit reason, bool set)
10129 {
10130 	if (!enable_apicv)
10131 		return;
10132 
10133 	down_write(&kvm->arch.apicv_update_lock);
10134 	__kvm_set_or_clear_apicv_inhibit(kvm, reason, set);
10135 	up_write(&kvm->arch.apicv_update_lock);
10136 }
10137 EXPORT_SYMBOL_GPL(kvm_set_or_clear_apicv_inhibit);
10138 
10139 static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
10140 {
10141 	if (!kvm_apic_present(vcpu))
10142 		return;
10143 
10144 	bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
10145 
10146 	if (irqchip_split(vcpu->kvm))
10147 		kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
10148 	else {
10149 		static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
10150 		if (ioapic_in_kernel(vcpu->kvm))
10151 			kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
10152 	}
10153 
10154 	if (is_guest_mode(vcpu))
10155 		vcpu->arch.load_eoi_exitmap_pending = true;
10156 	else
10157 		kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
10158 }
10159 
10160 static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
10161 {
10162 	u64 eoi_exit_bitmap[4];
10163 
10164 	if (!kvm_apic_hw_enabled(vcpu->arch.apic))
10165 		return;
10166 
10167 	if (to_hv_vcpu(vcpu)) {
10168 		bitmap_or((ulong *)eoi_exit_bitmap,
10169 			  vcpu->arch.ioapic_handled_vectors,
10170 			  to_hv_synic(vcpu)->vec_bitmap, 256);
10171 		static_call_cond(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap);
10172 		return;
10173 	}
10174 
10175 	static_call_cond(kvm_x86_load_eoi_exitmap)(
10176 		vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors);
10177 }
10178 
10179 void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
10180 					    unsigned long start, unsigned long end)
10181 {
10182 	unsigned long apic_address;
10183 
10184 	/*
10185 	 * The physical address of apic access page is stored in the VMCS.
10186 	 * Update it when it becomes invalid.
10187 	 */
10188 	apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
10189 	if (start <= apic_address && apic_address < end)
10190 		kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
10191 }
10192 
10193 void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
10194 {
10195 	static_call_cond(kvm_x86_guest_memory_reclaimed)(kvm);
10196 }
10197 
10198 static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
10199 {
10200 	if (!lapic_in_kernel(vcpu))
10201 		return;
10202 
10203 	static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu);
10204 }
10205 
10206 void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
10207 {
10208 	smp_send_reschedule(vcpu->cpu);
10209 }
10210 EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
10211 
10212 /*
10213  * Called within kvm->srcu read side.
10214  * Returns 1 to let vcpu_run() continue the guest execution loop without
10215  * exiting to the userspace.  Otherwise, the value will be returned to the
10216  * userspace.
10217  */
10218 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
10219 {
10220 	int r;
10221 	bool req_int_win =
10222 		dm_request_for_irq_injection(vcpu) &&
10223 		kvm_cpu_accept_dm_intr(vcpu);
10224 	fastpath_t exit_fastpath;
10225 
10226 	bool req_immediate_exit = false;
10227 
10228 	if (kvm_request_pending(vcpu)) {
10229 		if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) {
10230 			r = -EIO;
10231 			goto out;
10232 		}
10233 
10234 		if (kvm_dirty_ring_check_request(vcpu)) {
10235 			r = 0;
10236 			goto out;
10237 		}
10238 
10239 		if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
10240 			if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
10241 				r = 0;
10242 				goto out;
10243 			}
10244 		}
10245 		if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
10246 			kvm_mmu_free_obsolete_roots(vcpu);
10247 		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
10248 			__kvm_migrate_timers(vcpu);
10249 		if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
10250 			kvm_update_masterclock(vcpu->kvm);
10251 		if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
10252 			kvm_gen_kvmclock_update(vcpu);
10253 		if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
10254 			r = kvm_guest_time_update(vcpu);
10255 			if (unlikely(r))
10256 				goto out;
10257 		}
10258 		if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
10259 			kvm_mmu_sync_roots(vcpu);
10260 		if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
10261 			kvm_mmu_load_pgd(vcpu);
10262 
10263 		/*
10264 		 * Note, the order matters here, as flushing "all" TLB entries
10265 		 * also flushes the "current" TLB entries, i.e. servicing the
10266 		 * flush "all" will clear any request to flush "current".
10267 		 */
10268 		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
10269 			kvm_vcpu_flush_tlb_all(vcpu);
10270 
10271 		kvm_service_local_tlb_flush_requests(vcpu);
10272 
10273 		/*
10274 		 * Fall back to a "full" guest flush if Hyper-V's precise
10275 		 * flushing fails.  Note, Hyper-V's flushing is per-vCPU, but
10276 		 * the flushes are considered "remote" and not "local" because
10277 		 * the requests can be initiated from other vCPUs.
10278 		 */
10279 		if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu) &&
10280 		    kvm_hv_vcpu_flush_tlb(vcpu))
10281 			kvm_vcpu_flush_tlb_guest(vcpu);
10282 
10283 		if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
10284 			vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
10285 			r = 0;
10286 			goto out;
10287 		}
10288 		if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
10289 			if (is_guest_mode(vcpu))
10290 				kvm_x86_ops.nested_ops->triple_fault(vcpu);
10291 
10292 			if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
10293 				vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
10294 				vcpu->mmio_needed = 0;
10295 				r = 0;
10296 				goto out;
10297 			}
10298 		}
10299 		if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
10300 			/* Page is swapped out. Do synthetic halt */
10301 			vcpu->arch.apf.halted = true;
10302 			r = 1;
10303 			goto out;
10304 		}
10305 		if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
10306 			record_steal_time(vcpu);
10307 #ifdef CONFIG_KVM_SMM
10308 		if (kvm_check_request(KVM_REQ_SMI, vcpu))
10309 			process_smi(vcpu);
10310 #endif
10311 		if (kvm_check_request(KVM_REQ_NMI, vcpu))
10312 			process_nmi(vcpu);
10313 		if (kvm_check_request(KVM_REQ_PMU, vcpu))
10314 			kvm_pmu_handle_event(vcpu);
10315 		if (kvm_check_request(KVM_REQ_PMI, vcpu))
10316 			kvm_pmu_deliver_pmi(vcpu);
10317 		if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
10318 			BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
10319 			if (test_bit(vcpu->arch.pending_ioapic_eoi,
10320 				     vcpu->arch.ioapic_handled_vectors)) {
10321 				vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
10322 				vcpu->run->eoi.vector =
10323 						vcpu->arch.pending_ioapic_eoi;
10324 				r = 0;
10325 				goto out;
10326 			}
10327 		}
10328 		if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
10329 			vcpu_scan_ioapic(vcpu);
10330 		if (kvm_check_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu))
10331 			vcpu_load_eoi_exitmap(vcpu);
10332 		if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
10333 			kvm_vcpu_reload_apic_access_page(vcpu);
10334 		if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
10335 			vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
10336 			vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
10337 			vcpu->run->system_event.ndata = 0;
10338 			r = 0;
10339 			goto out;
10340 		}
10341 		if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
10342 			vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
10343 			vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
10344 			vcpu->run->system_event.ndata = 0;
10345 			r = 0;
10346 			goto out;
10347 		}
10348 		if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) {
10349 			struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
10350 
10351 			vcpu->run->exit_reason = KVM_EXIT_HYPERV;
10352 			vcpu->run->hyperv = hv_vcpu->exit;
10353 			r = 0;
10354 			goto out;
10355 		}
10356 
10357 		/*
10358 		 * KVM_REQ_HV_STIMER has to be processed after
10359 		 * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers
10360 		 * depend on the guest clock being up-to-date
10361 		 */
10362 		if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
10363 			kvm_hv_process_stimers(vcpu);
10364 		if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
10365 			kvm_vcpu_update_apicv(vcpu);
10366 		if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
10367 			kvm_check_async_pf_completion(vcpu);
10368 		if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
10369 			static_call(kvm_x86_msr_filter_changed)(vcpu);
10370 
10371 		if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu))
10372 			static_call(kvm_x86_update_cpu_dirty_logging)(vcpu);
10373 	}
10374 
10375 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win ||
10376 	    kvm_xen_has_interrupt(vcpu)) {
10377 		++vcpu->stat.req_event;
10378 		r = kvm_apic_accept_events(vcpu);
10379 		if (r < 0) {
10380 			r = 0;
10381 			goto out;
10382 		}
10383 		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
10384 			r = 1;
10385 			goto out;
10386 		}
10387 
10388 		r = kvm_check_and_inject_events(vcpu, &req_immediate_exit);
10389 		if (r < 0) {
10390 			r = 0;
10391 			goto out;
10392 		}
10393 		if (req_int_win)
10394 			static_call(kvm_x86_enable_irq_window)(vcpu);
10395 
10396 		if (kvm_lapic_enabled(vcpu)) {
10397 			update_cr8_intercept(vcpu);
10398 			kvm_lapic_sync_to_vapic(vcpu);
10399 		}
10400 	}
10401 
10402 	r = kvm_mmu_reload(vcpu);
10403 	if (unlikely(r)) {
10404 		goto cancel_injection;
10405 	}
10406 
10407 	preempt_disable();
10408 
10409 	static_call(kvm_x86_prepare_switch_to_guest)(vcpu);
10410 
10411 	/*
10412 	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
10413 	 * IPI are then delayed after guest entry, which ensures that they
10414 	 * result in virtual interrupt delivery.
10415 	 */
10416 	local_irq_disable();
10417 
10418 	/* Store vcpu->apicv_active before vcpu->mode.  */
10419 	smp_store_release(&vcpu->mode, IN_GUEST_MODE);
10420 
10421 	kvm_vcpu_srcu_read_unlock(vcpu);
10422 
10423 	/*
10424 	 * 1) We should set ->mode before checking ->requests.  Please see
10425 	 * the comment in kvm_vcpu_exiting_guest_mode().
10426 	 *
10427 	 * 2) For APICv, we should set ->mode before checking PID.ON. This
10428 	 * pairs with the memory barrier implicit in pi_test_and_set_on
10429 	 * (see vmx_deliver_posted_interrupt).
10430 	 *
10431 	 * 3) This also orders the write to mode from any reads to the page
10432 	 * tables done while the VCPU is running.  Please see the comment
10433 	 * in kvm_flush_remote_tlbs.
10434 	 */
10435 	smp_mb__after_srcu_read_unlock();
10436 
10437 	/*
10438 	 * Process pending posted interrupts to handle the case where the
10439 	 * notification IRQ arrived in the host, or was never sent (because the
10440 	 * target vCPU wasn't running).  Do this regardless of the vCPU's APICv
10441 	 * status, KVM doesn't update assigned devices when APICv is inhibited,
10442 	 * i.e. they can post interrupts even if APICv is temporarily disabled.
10443 	 */
10444 	if (kvm_lapic_enabled(vcpu))
10445 		static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
10446 
10447 	if (kvm_vcpu_exit_request(vcpu)) {
10448 		vcpu->mode = OUTSIDE_GUEST_MODE;
10449 		smp_wmb();
10450 		local_irq_enable();
10451 		preempt_enable();
10452 		kvm_vcpu_srcu_read_lock(vcpu);
10453 		r = 1;
10454 		goto cancel_injection;
10455 	}
10456 
10457 	if (req_immediate_exit) {
10458 		kvm_make_request(KVM_REQ_EVENT, vcpu);
10459 		static_call(kvm_x86_request_immediate_exit)(vcpu);
10460 	}
10461 
10462 	fpregs_assert_state_consistent();
10463 	if (test_thread_flag(TIF_NEED_FPU_LOAD))
10464 		switch_fpu_return();
10465 
10466 	if (vcpu->arch.guest_fpu.xfd_err)
10467 		wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
10468 
10469 	if (unlikely(vcpu->arch.switch_db_regs)) {
10470 		set_debugreg(0, 7);
10471 		set_debugreg(vcpu->arch.eff_db[0], 0);
10472 		set_debugreg(vcpu->arch.eff_db[1], 1);
10473 		set_debugreg(vcpu->arch.eff_db[2], 2);
10474 		set_debugreg(vcpu->arch.eff_db[3], 3);
10475 	} else if (unlikely(hw_breakpoint_active())) {
10476 		set_debugreg(0, 7);
10477 	}
10478 
10479 	guest_timing_enter_irqoff();
10480 
10481 	for (;;) {
10482 		/*
10483 		 * Assert that vCPU vs. VM APICv state is consistent.  An APICv
10484 		 * update must kick and wait for all vCPUs before toggling the
10485 		 * per-VM state, and responsing vCPUs must wait for the update
10486 		 * to complete before servicing KVM_REQ_APICV_UPDATE.
10487 		 */
10488 		WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) &&
10489 			     (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED));
10490 
10491 		exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
10492 		if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
10493 			break;
10494 
10495 		if (kvm_lapic_enabled(vcpu))
10496 			static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
10497 
10498 		if (unlikely(kvm_vcpu_exit_request(vcpu))) {
10499 			exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
10500 			break;
10501 		}
10502 	}
10503 
10504 	/*
10505 	 * Do this here before restoring debug registers on the host.  And
10506 	 * since we do this before handling the vmexit, a DR access vmexit
10507 	 * can (a) read the correct value of the debug registers, (b) set
10508 	 * KVM_DEBUGREG_WONT_EXIT again.
10509 	 */
10510 	if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
10511 		WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
10512 		static_call(kvm_x86_sync_dirty_debug_regs)(vcpu);
10513 		kvm_update_dr0123(vcpu);
10514 		kvm_update_dr7(vcpu);
10515 	}
10516 
10517 	/*
10518 	 * If the guest has used debug registers, at least dr7
10519 	 * will be disabled while returning to the host.
10520 	 * If we don't have active breakpoints in the host, we don't
10521 	 * care about the messed up debug address registers. But if
10522 	 * we have some of them active, restore the old state.
10523 	 */
10524 	if (hw_breakpoint_active())
10525 		hw_breakpoint_restore();
10526 
10527 	vcpu->arch.last_vmentry_cpu = vcpu->cpu;
10528 	vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
10529 
10530 	vcpu->mode = OUTSIDE_GUEST_MODE;
10531 	smp_wmb();
10532 
10533 	/*
10534 	 * Sync xfd before calling handle_exit_irqoff() which may
10535 	 * rely on the fact that guest_fpu::xfd is up-to-date (e.g.
10536 	 * in #NM irqoff handler).
10537 	 */
10538 	if (vcpu->arch.xfd_no_write_intercept)
10539 		fpu_sync_guest_vmexit_xfd_state();
10540 
10541 	static_call(kvm_x86_handle_exit_irqoff)(vcpu);
10542 
10543 	if (vcpu->arch.guest_fpu.xfd_err)
10544 		wrmsrl(MSR_IA32_XFD_ERR, 0);
10545 
10546 	/*
10547 	 * Consume any pending interrupts, including the possible source of
10548 	 * VM-Exit on SVM and any ticks that occur between VM-Exit and now.
10549 	 * An instruction is required after local_irq_enable() to fully unblock
10550 	 * interrupts on processors that implement an interrupt shadow, the
10551 	 * stat.exits increment will do nicely.
10552 	 */
10553 	kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
10554 	local_irq_enable();
10555 	++vcpu->stat.exits;
10556 	local_irq_disable();
10557 	kvm_after_interrupt(vcpu);
10558 
10559 	/*
10560 	 * Wait until after servicing IRQs to account guest time so that any
10561 	 * ticks that occurred while running the guest are properly accounted
10562 	 * to the guest.  Waiting until IRQs are enabled degrades the accuracy
10563 	 * of accounting via context tracking, but the loss of accuracy is
10564 	 * acceptable for all known use cases.
10565 	 */
10566 	guest_timing_exit_irqoff();
10567 
10568 	local_irq_enable();
10569 	preempt_enable();
10570 
10571 	kvm_vcpu_srcu_read_lock(vcpu);
10572 
10573 	/*
10574 	 * Profile KVM exit RIPs:
10575 	 */
10576 	if (unlikely(prof_on == KVM_PROFILING)) {
10577 		unsigned long rip = kvm_rip_read(vcpu);
10578 		profile_hit(KVM_PROFILING, (void *)rip);
10579 	}
10580 
10581 	if (unlikely(vcpu->arch.tsc_always_catchup))
10582 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
10583 
10584 	if (vcpu->arch.apic_attention)
10585 		kvm_lapic_sync_from_vapic(vcpu);
10586 
10587 	r = static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath);
10588 	return r;
10589 
10590 cancel_injection:
10591 	if (req_immediate_exit)
10592 		kvm_make_request(KVM_REQ_EVENT, vcpu);
10593 	static_call(kvm_x86_cancel_injection)(vcpu);
10594 	if (unlikely(vcpu->arch.apic_attention))
10595 		kvm_lapic_sync_from_vapic(vcpu);
10596 out:
10597 	return r;
10598 }
10599 
10600 /* Called within kvm->srcu read side.  */
10601 static inline int vcpu_block(struct kvm_vcpu *vcpu)
10602 {
10603 	bool hv_timer;
10604 
10605 	if (!kvm_arch_vcpu_runnable(vcpu)) {
10606 		/*
10607 		 * Switch to the software timer before halt-polling/blocking as
10608 		 * the guest's timer may be a break event for the vCPU, and the
10609 		 * hypervisor timer runs only when the CPU is in guest mode.
10610 		 * Switch before halt-polling so that KVM recognizes an expired
10611 		 * timer before blocking.
10612 		 */
10613 		hv_timer = kvm_lapic_hv_timer_in_use(vcpu);
10614 		if (hv_timer)
10615 			kvm_lapic_switch_to_sw_timer(vcpu);
10616 
10617 		kvm_vcpu_srcu_read_unlock(vcpu);
10618 		if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
10619 			kvm_vcpu_halt(vcpu);
10620 		else
10621 			kvm_vcpu_block(vcpu);
10622 		kvm_vcpu_srcu_read_lock(vcpu);
10623 
10624 		if (hv_timer)
10625 			kvm_lapic_switch_to_hv_timer(vcpu);
10626 
10627 		/*
10628 		 * If the vCPU is not runnable, a signal or another host event
10629 		 * of some kind is pending; service it without changing the
10630 		 * vCPU's activity state.
10631 		 */
10632 		if (!kvm_arch_vcpu_runnable(vcpu))
10633 			return 1;
10634 	}
10635 
10636 	/*
10637 	 * Evaluate nested events before exiting the halted state.  This allows
10638 	 * the halt state to be recorded properly in the VMCS12's activity
10639 	 * state field (AMD does not have a similar field and a VM-Exit always
10640 	 * causes a spurious wakeup from HLT).
10641 	 */
10642 	if (is_guest_mode(vcpu)) {
10643 		if (kvm_check_nested_events(vcpu) < 0)
10644 			return 0;
10645 	}
10646 
10647 	if (kvm_apic_accept_events(vcpu) < 0)
10648 		return 0;
10649 	switch(vcpu->arch.mp_state) {
10650 	case KVM_MP_STATE_HALTED:
10651 	case KVM_MP_STATE_AP_RESET_HOLD:
10652 		vcpu->arch.pv.pv_unhalted = false;
10653 		vcpu->arch.mp_state =
10654 			KVM_MP_STATE_RUNNABLE;
10655 		fallthrough;
10656 	case KVM_MP_STATE_RUNNABLE:
10657 		vcpu->arch.apf.halted = false;
10658 		break;
10659 	case KVM_MP_STATE_INIT_RECEIVED:
10660 		break;
10661 	default:
10662 		WARN_ON_ONCE(1);
10663 		break;
10664 	}
10665 	return 1;
10666 }
10667 
10668 static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
10669 {
10670 	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
10671 		!vcpu->arch.apf.halted);
10672 }
10673 
10674 /* Called within kvm->srcu read side.  */
10675 static int vcpu_run(struct kvm_vcpu *vcpu)
10676 {
10677 	int r;
10678 
10679 	vcpu->arch.l1tf_flush_l1d = true;
10680 
10681 	for (;;) {
10682 		/*
10683 		 * If another guest vCPU requests a PV TLB flush in the middle
10684 		 * of instruction emulation, the rest of the emulation could
10685 		 * use a stale page translation. Assume that any code after
10686 		 * this point can start executing an instruction.
10687 		 */
10688 		vcpu->arch.at_instruction_boundary = false;
10689 		if (kvm_vcpu_running(vcpu)) {
10690 			r = vcpu_enter_guest(vcpu);
10691 		} else {
10692 			r = vcpu_block(vcpu);
10693 		}
10694 
10695 		if (r <= 0)
10696 			break;
10697 
10698 		kvm_clear_request(KVM_REQ_UNBLOCK, vcpu);
10699 		if (kvm_xen_has_pending_events(vcpu))
10700 			kvm_xen_inject_pending_events(vcpu);
10701 
10702 		if (kvm_cpu_has_pending_timer(vcpu))
10703 			kvm_inject_pending_timer_irqs(vcpu);
10704 
10705 		if (dm_request_for_irq_injection(vcpu) &&
10706 			kvm_vcpu_ready_for_interrupt_injection(vcpu)) {
10707 			r = 0;
10708 			vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
10709 			++vcpu->stat.request_irq_exits;
10710 			break;
10711 		}
10712 
10713 		if (__xfer_to_guest_mode_work_pending()) {
10714 			kvm_vcpu_srcu_read_unlock(vcpu);
10715 			r = xfer_to_guest_mode_handle_work(vcpu);
10716 			kvm_vcpu_srcu_read_lock(vcpu);
10717 			if (r)
10718 				return r;
10719 		}
10720 	}
10721 
10722 	return r;
10723 }
10724 
10725 static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
10726 {
10727 	return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
10728 }
10729 
10730 static int complete_emulated_pio(struct kvm_vcpu *vcpu)
10731 {
10732 	BUG_ON(!vcpu->arch.pio.count);
10733 
10734 	return complete_emulated_io(vcpu);
10735 }
10736 
10737 /*
10738  * Implements the following, as a state machine:
10739  *
10740  * read:
10741  *   for each fragment
10742  *     for each mmio piece in the fragment
10743  *       write gpa, len
10744  *       exit
10745  *       copy data
10746  *   execute insn
10747  *
10748  * write:
10749  *   for each fragment
10750  *     for each mmio piece in the fragment
10751  *       write gpa, len
10752  *       copy data
10753  *       exit
10754  */
10755 static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
10756 {
10757 	struct kvm_run *run = vcpu->run;
10758 	struct kvm_mmio_fragment *frag;
10759 	unsigned len;
10760 
10761 	BUG_ON(!vcpu->mmio_needed);
10762 
10763 	/* Complete previous fragment */
10764 	frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
10765 	len = min(8u, frag->len);
10766 	if (!vcpu->mmio_is_write)
10767 		memcpy(frag->data, run->mmio.data, len);
10768 
10769 	if (frag->len <= 8) {
10770 		/* Switch to the next fragment. */
10771 		frag++;
10772 		vcpu->mmio_cur_fragment++;
10773 	} else {
10774 		/* Go forward to the next mmio piece. */
10775 		frag->data += len;
10776 		frag->gpa += len;
10777 		frag->len -= len;
10778 	}
10779 
10780 	if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
10781 		vcpu->mmio_needed = 0;
10782 
10783 		/* FIXME: return into emulator if single-stepping.  */
10784 		if (vcpu->mmio_is_write)
10785 			return 1;
10786 		vcpu->mmio_read_completed = 1;
10787 		return complete_emulated_io(vcpu);
10788 	}
10789 
10790 	run->exit_reason = KVM_EXIT_MMIO;
10791 	run->mmio.phys_addr = frag->gpa;
10792 	if (vcpu->mmio_is_write)
10793 		memcpy(run->mmio.data, frag->data, min(8u, frag->len));
10794 	run->mmio.len = min(8u, frag->len);
10795 	run->mmio.is_write = vcpu->mmio_is_write;
10796 	vcpu->arch.complete_userspace_io = complete_emulated_mmio;
10797 	return 0;
10798 }
10799 
10800 /* Swap (qemu) user FPU context for the guest FPU context. */
10801 static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
10802 {
10803 	/* Exclude PKRU, it's restored separately immediately after VM-Exit. */
10804 	fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
10805 	trace_kvm_fpu(1);
10806 }
10807 
10808 /* When vcpu_run ends, restore user space FPU context. */
10809 static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
10810 {
10811 	fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);
10812 	++vcpu->stat.fpu_reload;
10813 	trace_kvm_fpu(0);
10814 }
10815 
10816 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
10817 {
10818 	struct kvm_queued_exception *ex = &vcpu->arch.exception;
10819 	struct kvm_run *kvm_run = vcpu->run;
10820 	int r;
10821 
10822 	vcpu_load(vcpu);
10823 	kvm_sigset_activate(vcpu);
10824 	kvm_run->flags = 0;
10825 	kvm_load_guest_fpu(vcpu);
10826 
10827 	kvm_vcpu_srcu_read_lock(vcpu);
10828 	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
10829 		if (kvm_run->immediate_exit) {
10830 			r = -EINTR;
10831 			goto out;
10832 		}
10833 		/*
10834 		 * It should be impossible for the hypervisor timer to be in
10835 		 * use before KVM has ever run the vCPU.
10836 		 */
10837 		WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu));
10838 
10839 		kvm_vcpu_srcu_read_unlock(vcpu);
10840 		kvm_vcpu_block(vcpu);
10841 		kvm_vcpu_srcu_read_lock(vcpu);
10842 
10843 		if (kvm_apic_accept_events(vcpu) < 0) {
10844 			r = 0;
10845 			goto out;
10846 		}
10847 		r = -EAGAIN;
10848 		if (signal_pending(current)) {
10849 			r = -EINTR;
10850 			kvm_run->exit_reason = KVM_EXIT_INTR;
10851 			++vcpu->stat.signal_exits;
10852 		}
10853 		goto out;
10854 	}
10855 
10856 	if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) ||
10857 	    (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) {
10858 		r = -EINVAL;
10859 		goto out;
10860 	}
10861 
10862 	if (kvm_run->kvm_dirty_regs) {
10863 		r = sync_regs(vcpu);
10864 		if (r != 0)
10865 			goto out;
10866 	}
10867 
10868 	/* re-sync apic's tpr */
10869 	if (!lapic_in_kernel(vcpu)) {
10870 		if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
10871 			r = -EINVAL;
10872 			goto out;
10873 		}
10874 	}
10875 
10876 	/*
10877 	 * If userspace set a pending exception and L2 is active, convert it to
10878 	 * a pending VM-Exit if L1 wants to intercept the exception.
10879 	 */
10880 	if (vcpu->arch.exception_from_userspace && is_guest_mode(vcpu) &&
10881 	    kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, ex->vector,
10882 							ex->error_code)) {
10883 		kvm_queue_exception_vmexit(vcpu, ex->vector,
10884 					   ex->has_error_code, ex->error_code,
10885 					   ex->has_payload, ex->payload);
10886 		ex->injected = false;
10887 		ex->pending = false;
10888 	}
10889 	vcpu->arch.exception_from_userspace = false;
10890 
10891 	if (unlikely(vcpu->arch.complete_userspace_io)) {
10892 		int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
10893 		vcpu->arch.complete_userspace_io = NULL;
10894 		r = cui(vcpu);
10895 		if (r <= 0)
10896 			goto out;
10897 	} else {
10898 		WARN_ON_ONCE(vcpu->arch.pio.count);
10899 		WARN_ON_ONCE(vcpu->mmio_needed);
10900 	}
10901 
10902 	if (kvm_run->immediate_exit) {
10903 		r = -EINTR;
10904 		goto out;
10905 	}
10906 
10907 	r = static_call(kvm_x86_vcpu_pre_run)(vcpu);
10908 	if (r <= 0)
10909 		goto out;
10910 
10911 	r = vcpu_run(vcpu);
10912 
10913 out:
10914 	kvm_put_guest_fpu(vcpu);
10915 	if (kvm_run->kvm_valid_regs)
10916 		store_regs(vcpu);
10917 	post_kvm_run_save(vcpu);
10918 	kvm_vcpu_srcu_read_unlock(vcpu);
10919 
10920 	kvm_sigset_deactivate(vcpu);
10921 	vcpu_put(vcpu);
10922 	return r;
10923 }
10924 
10925 static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
10926 {
10927 	if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
10928 		/*
10929 		 * We are here if userspace calls get_regs() in the middle of
10930 		 * instruction emulation. Registers state needs to be copied
10931 		 * back from emulation context to vcpu. Userspace shouldn't do
10932 		 * that usually, but some bad designed PV devices (vmware
10933 		 * backdoor interface) need this to work
10934 		 */
10935 		emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
10936 		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
10937 	}
10938 	regs->rax = kvm_rax_read(vcpu);
10939 	regs->rbx = kvm_rbx_read(vcpu);
10940 	regs->rcx = kvm_rcx_read(vcpu);
10941 	regs->rdx = kvm_rdx_read(vcpu);
10942 	regs->rsi = kvm_rsi_read(vcpu);
10943 	regs->rdi = kvm_rdi_read(vcpu);
10944 	regs->rsp = kvm_rsp_read(vcpu);
10945 	regs->rbp = kvm_rbp_read(vcpu);
10946 #ifdef CONFIG_X86_64
10947 	regs->r8 = kvm_r8_read(vcpu);
10948 	regs->r9 = kvm_r9_read(vcpu);
10949 	regs->r10 = kvm_r10_read(vcpu);
10950 	regs->r11 = kvm_r11_read(vcpu);
10951 	regs->r12 = kvm_r12_read(vcpu);
10952 	regs->r13 = kvm_r13_read(vcpu);
10953 	regs->r14 = kvm_r14_read(vcpu);
10954 	regs->r15 = kvm_r15_read(vcpu);
10955 #endif
10956 
10957 	regs->rip = kvm_rip_read(vcpu);
10958 	regs->rflags = kvm_get_rflags(vcpu);
10959 }
10960 
10961 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
10962 {
10963 	vcpu_load(vcpu);
10964 	__get_regs(vcpu, regs);
10965 	vcpu_put(vcpu);
10966 	return 0;
10967 }
10968 
10969 static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
10970 {
10971 	vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
10972 	vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
10973 
10974 	kvm_rax_write(vcpu, regs->rax);
10975 	kvm_rbx_write(vcpu, regs->rbx);
10976 	kvm_rcx_write(vcpu, regs->rcx);
10977 	kvm_rdx_write(vcpu, regs->rdx);
10978 	kvm_rsi_write(vcpu, regs->rsi);
10979 	kvm_rdi_write(vcpu, regs->rdi);
10980 	kvm_rsp_write(vcpu, regs->rsp);
10981 	kvm_rbp_write(vcpu, regs->rbp);
10982 #ifdef CONFIG_X86_64
10983 	kvm_r8_write(vcpu, regs->r8);
10984 	kvm_r9_write(vcpu, regs->r9);
10985 	kvm_r10_write(vcpu, regs->r10);
10986 	kvm_r11_write(vcpu, regs->r11);
10987 	kvm_r12_write(vcpu, regs->r12);
10988 	kvm_r13_write(vcpu, regs->r13);
10989 	kvm_r14_write(vcpu, regs->r14);
10990 	kvm_r15_write(vcpu, regs->r15);
10991 #endif
10992 
10993 	kvm_rip_write(vcpu, regs->rip);
10994 	kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
10995 
10996 	vcpu->arch.exception.pending = false;
10997 	vcpu->arch.exception_vmexit.pending = false;
10998 
10999 	kvm_make_request(KVM_REQ_EVENT, vcpu);
11000 }
11001 
11002 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
11003 {
11004 	vcpu_load(vcpu);
11005 	__set_regs(vcpu, regs);
11006 	vcpu_put(vcpu);
11007 	return 0;
11008 }
11009 
11010 static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
11011 {
11012 	struct desc_ptr dt;
11013 
11014 	if (vcpu->arch.guest_state_protected)
11015 		goto skip_protected_regs;
11016 
11017 	kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
11018 	kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
11019 	kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
11020 	kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
11021 	kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
11022 	kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
11023 
11024 	kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
11025 	kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
11026 
11027 	static_call(kvm_x86_get_idt)(vcpu, &dt);
11028 	sregs->idt.limit = dt.size;
11029 	sregs->idt.base = dt.address;
11030 	static_call(kvm_x86_get_gdt)(vcpu, &dt);
11031 	sregs->gdt.limit = dt.size;
11032 	sregs->gdt.base = dt.address;
11033 
11034 	sregs->cr2 = vcpu->arch.cr2;
11035 	sregs->cr3 = kvm_read_cr3(vcpu);
11036 
11037 skip_protected_regs:
11038 	sregs->cr0 = kvm_read_cr0(vcpu);
11039 	sregs->cr4 = kvm_read_cr4(vcpu);
11040 	sregs->cr8 = kvm_get_cr8(vcpu);
11041 	sregs->efer = vcpu->arch.efer;
11042 	sregs->apic_base = kvm_get_apic_base(vcpu);
11043 }
11044 
11045 static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
11046 {
11047 	__get_sregs_common(vcpu, sregs);
11048 
11049 	if (vcpu->arch.guest_state_protected)
11050 		return;
11051 
11052 	if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
11053 		set_bit(vcpu->arch.interrupt.nr,
11054 			(unsigned long *)sregs->interrupt_bitmap);
11055 }
11056 
11057 static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
11058 {
11059 	int i;
11060 
11061 	__get_sregs_common(vcpu, (struct kvm_sregs *)sregs2);
11062 
11063 	if (vcpu->arch.guest_state_protected)
11064 		return;
11065 
11066 	if (is_pae_paging(vcpu)) {
11067 		for (i = 0 ; i < 4 ; i++)
11068 			sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i);
11069 		sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID;
11070 	}
11071 }
11072 
11073 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
11074 				  struct kvm_sregs *sregs)
11075 {
11076 	vcpu_load(vcpu);
11077 	__get_sregs(vcpu, sregs);
11078 	vcpu_put(vcpu);
11079 	return 0;
11080 }
11081 
11082 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
11083 				    struct kvm_mp_state *mp_state)
11084 {
11085 	int r;
11086 
11087 	vcpu_load(vcpu);
11088 	if (kvm_mpx_supported())
11089 		kvm_load_guest_fpu(vcpu);
11090 
11091 	r = kvm_apic_accept_events(vcpu);
11092 	if (r < 0)
11093 		goto out;
11094 	r = 0;
11095 
11096 	if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED ||
11097 	     vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) &&
11098 	    vcpu->arch.pv.pv_unhalted)
11099 		mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
11100 	else
11101 		mp_state->mp_state = vcpu->arch.mp_state;
11102 
11103 out:
11104 	if (kvm_mpx_supported())
11105 		kvm_put_guest_fpu(vcpu);
11106 	vcpu_put(vcpu);
11107 	return r;
11108 }
11109 
11110 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
11111 				    struct kvm_mp_state *mp_state)
11112 {
11113 	int ret = -EINVAL;
11114 
11115 	vcpu_load(vcpu);
11116 
11117 	switch (mp_state->mp_state) {
11118 	case KVM_MP_STATE_UNINITIALIZED:
11119 	case KVM_MP_STATE_HALTED:
11120 	case KVM_MP_STATE_AP_RESET_HOLD:
11121 	case KVM_MP_STATE_INIT_RECEIVED:
11122 	case KVM_MP_STATE_SIPI_RECEIVED:
11123 		if (!lapic_in_kernel(vcpu))
11124 			goto out;
11125 		break;
11126 
11127 	case KVM_MP_STATE_RUNNABLE:
11128 		break;
11129 
11130 	default:
11131 		goto out;
11132 	}
11133 
11134 	/*
11135 	 * Pending INITs are reported using KVM_SET_VCPU_EVENTS, disallow
11136 	 * forcing the guest into INIT/SIPI if those events are supposed to be
11137 	 * blocked.  KVM prioritizes SMI over INIT, so reject INIT/SIPI state
11138 	 * if an SMI is pending as well.
11139 	 */
11140 	if ((!kvm_apic_init_sipi_allowed(vcpu) || vcpu->arch.smi_pending) &&
11141 	    (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
11142 	     mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
11143 		goto out;
11144 
11145 	if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
11146 		vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
11147 		set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
11148 	} else
11149 		vcpu->arch.mp_state = mp_state->mp_state;
11150 	kvm_make_request(KVM_REQ_EVENT, vcpu);
11151 
11152 	ret = 0;
11153 out:
11154 	vcpu_put(vcpu);
11155 	return ret;
11156 }
11157 
11158 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
11159 		    int reason, bool has_error_code, u32 error_code)
11160 {
11161 	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
11162 	int ret;
11163 
11164 	init_emulate_ctxt(vcpu);
11165 
11166 	ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
11167 				   has_error_code, error_code);
11168 	if (ret) {
11169 		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
11170 		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
11171 		vcpu->run->internal.ndata = 0;
11172 		return 0;
11173 	}
11174 
11175 	kvm_rip_write(vcpu, ctxt->eip);
11176 	kvm_set_rflags(vcpu, ctxt->eflags);
11177 	return 1;
11178 }
11179 EXPORT_SYMBOL_GPL(kvm_task_switch);
11180 
11181 static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
11182 {
11183 	if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
11184 		/*
11185 		 * When EFER.LME and CR0.PG are set, the processor is in
11186 		 * 64-bit mode (though maybe in a 32-bit code segment).
11187 		 * CR4.PAE and EFER.LMA must be set.
11188 		 */
11189 		if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA))
11190 			return false;
11191 		if (kvm_vcpu_is_illegal_gpa(vcpu, sregs->cr3))
11192 			return false;
11193 	} else {
11194 		/*
11195 		 * Not in 64-bit mode: EFER.LMA is clear and the code
11196 		 * segment cannot be 64-bit.
11197 		 */
11198 		if (sregs->efer & EFER_LMA || sregs->cs.l)
11199 			return false;
11200 	}
11201 
11202 	return kvm_is_valid_cr4(vcpu, sregs->cr4);
11203 }
11204 
11205 static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs,
11206 		int *mmu_reset_needed, bool update_pdptrs)
11207 {
11208 	struct msr_data apic_base_msr;
11209 	int idx;
11210 	struct desc_ptr dt;
11211 
11212 	if (!kvm_is_valid_sregs(vcpu, sregs))
11213 		return -EINVAL;
11214 
11215 	apic_base_msr.data = sregs->apic_base;
11216 	apic_base_msr.host_initiated = true;
11217 	if (kvm_set_apic_base(vcpu, &apic_base_msr))
11218 		return -EINVAL;
11219 
11220 	if (vcpu->arch.guest_state_protected)
11221 		return 0;
11222 
11223 	dt.size = sregs->idt.limit;
11224 	dt.address = sregs->idt.base;
11225 	static_call(kvm_x86_set_idt)(vcpu, &dt);
11226 	dt.size = sregs->gdt.limit;
11227 	dt.address = sregs->gdt.base;
11228 	static_call(kvm_x86_set_gdt)(vcpu, &dt);
11229 
11230 	vcpu->arch.cr2 = sregs->cr2;
11231 	*mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
11232 	vcpu->arch.cr3 = sregs->cr3;
11233 	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
11234 	static_call_cond(kvm_x86_post_set_cr3)(vcpu, sregs->cr3);
11235 
11236 	kvm_set_cr8(vcpu, sregs->cr8);
11237 
11238 	*mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
11239 	static_call(kvm_x86_set_efer)(vcpu, sregs->efer);
11240 
11241 	*mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
11242 	static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0);
11243 	vcpu->arch.cr0 = sregs->cr0;
11244 
11245 	*mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
11246 	static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4);
11247 
11248 	if (update_pdptrs) {
11249 		idx = srcu_read_lock(&vcpu->kvm->srcu);
11250 		if (is_pae_paging(vcpu)) {
11251 			load_pdptrs(vcpu, kvm_read_cr3(vcpu));
11252 			*mmu_reset_needed = 1;
11253 		}
11254 		srcu_read_unlock(&vcpu->kvm->srcu, idx);
11255 	}
11256 
11257 	kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
11258 	kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
11259 	kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
11260 	kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
11261 	kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
11262 	kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
11263 
11264 	kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
11265 	kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
11266 
11267 	update_cr8_intercept(vcpu);
11268 
11269 	/* Older userspace won't unhalt the vcpu on reset. */
11270 	if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
11271 	    sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
11272 	    !is_protmode(vcpu))
11273 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
11274 
11275 	return 0;
11276 }
11277 
11278 static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
11279 {
11280 	int pending_vec, max_bits;
11281 	int mmu_reset_needed = 0;
11282 	int ret = __set_sregs_common(vcpu, sregs, &mmu_reset_needed, true);
11283 
11284 	if (ret)
11285 		return ret;
11286 
11287 	if (mmu_reset_needed)
11288 		kvm_mmu_reset_context(vcpu);
11289 
11290 	max_bits = KVM_NR_INTERRUPTS;
11291 	pending_vec = find_first_bit(
11292 		(const unsigned long *)sregs->interrupt_bitmap, max_bits);
11293 
11294 	if (pending_vec < max_bits) {
11295 		kvm_queue_interrupt(vcpu, pending_vec, false);
11296 		pr_debug("Set back pending irq %d\n", pending_vec);
11297 		kvm_make_request(KVM_REQ_EVENT, vcpu);
11298 	}
11299 	return 0;
11300 }
11301 
11302 static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2)
11303 {
11304 	int mmu_reset_needed = 0;
11305 	bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID;
11306 	bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) &&
11307 		!(sregs2->efer & EFER_LMA);
11308 	int i, ret;
11309 
11310 	if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID)
11311 		return -EINVAL;
11312 
11313 	if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected))
11314 		return -EINVAL;
11315 
11316 	ret = __set_sregs_common(vcpu, (struct kvm_sregs *)sregs2,
11317 				 &mmu_reset_needed, !valid_pdptrs);
11318 	if (ret)
11319 		return ret;
11320 
11321 	if (valid_pdptrs) {
11322 		for (i = 0; i < 4 ; i++)
11323 			kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]);
11324 
11325 		kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
11326 		mmu_reset_needed = 1;
11327 		vcpu->arch.pdptrs_from_userspace = true;
11328 	}
11329 	if (mmu_reset_needed)
11330 		kvm_mmu_reset_context(vcpu);
11331 	return 0;
11332 }
11333 
11334 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
11335 				  struct kvm_sregs *sregs)
11336 {
11337 	int ret;
11338 
11339 	vcpu_load(vcpu);
11340 	ret = __set_sregs(vcpu, sregs);
11341 	vcpu_put(vcpu);
11342 	return ret;
11343 }
11344 
11345 static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm)
11346 {
11347 	bool set = false;
11348 	struct kvm_vcpu *vcpu;
11349 	unsigned long i;
11350 
11351 	if (!enable_apicv)
11352 		return;
11353 
11354 	down_write(&kvm->arch.apicv_update_lock);
11355 
11356 	kvm_for_each_vcpu(i, vcpu, kvm) {
11357 		if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) {
11358 			set = true;
11359 			break;
11360 		}
11361 	}
11362 	__kvm_set_or_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_BLOCKIRQ, set);
11363 	up_write(&kvm->arch.apicv_update_lock);
11364 }
11365 
11366 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
11367 					struct kvm_guest_debug *dbg)
11368 {
11369 	unsigned long rflags;
11370 	int i, r;
11371 
11372 	if (vcpu->arch.guest_state_protected)
11373 		return -EINVAL;
11374 
11375 	vcpu_load(vcpu);
11376 
11377 	if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
11378 		r = -EBUSY;
11379 		if (kvm_is_exception_pending(vcpu))
11380 			goto out;
11381 		if (dbg->control & KVM_GUESTDBG_INJECT_DB)
11382 			kvm_queue_exception(vcpu, DB_VECTOR);
11383 		else
11384 			kvm_queue_exception(vcpu, BP_VECTOR);
11385 	}
11386 
11387 	/*
11388 	 * Read rflags as long as potentially injected trace flags are still
11389 	 * filtered out.
11390 	 */
11391 	rflags = kvm_get_rflags(vcpu);
11392 
11393 	vcpu->guest_debug = dbg->control;
11394 	if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
11395 		vcpu->guest_debug = 0;
11396 
11397 	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
11398 		for (i = 0; i < KVM_NR_DB_REGS; ++i)
11399 			vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
11400 		vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
11401 	} else {
11402 		for (i = 0; i < KVM_NR_DB_REGS; i++)
11403 			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
11404 	}
11405 	kvm_update_dr7(vcpu);
11406 
11407 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
11408 		vcpu->arch.singlestep_rip = kvm_get_linear_rip(vcpu);
11409 
11410 	/*
11411 	 * Trigger an rflags update that will inject or remove the trace
11412 	 * flags.
11413 	 */
11414 	kvm_set_rflags(vcpu, rflags);
11415 
11416 	static_call(kvm_x86_update_exception_bitmap)(vcpu);
11417 
11418 	kvm_arch_vcpu_guestdbg_update_apicv_inhibit(vcpu->kvm);
11419 
11420 	r = 0;
11421 
11422 out:
11423 	vcpu_put(vcpu);
11424 	return r;
11425 }
11426 
11427 /*
11428  * Translate a guest virtual address to a guest physical address.
11429  */
11430 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
11431 				    struct kvm_translation *tr)
11432 {
11433 	unsigned long vaddr = tr->linear_address;
11434 	gpa_t gpa;
11435 	int idx;
11436 
11437 	vcpu_load(vcpu);
11438 
11439 	idx = srcu_read_lock(&vcpu->kvm->srcu);
11440 	gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
11441 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
11442 	tr->physical_address = gpa;
11443 	tr->valid = gpa != INVALID_GPA;
11444 	tr->writeable = 1;
11445 	tr->usermode = 0;
11446 
11447 	vcpu_put(vcpu);
11448 	return 0;
11449 }
11450 
11451 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
11452 {
11453 	struct fxregs_state *fxsave;
11454 
11455 	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
11456 		return 0;
11457 
11458 	vcpu_load(vcpu);
11459 
11460 	fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
11461 	memcpy(fpu->fpr, fxsave->st_space, 128);
11462 	fpu->fcw = fxsave->cwd;
11463 	fpu->fsw = fxsave->swd;
11464 	fpu->ftwx = fxsave->twd;
11465 	fpu->last_opcode = fxsave->fop;
11466 	fpu->last_ip = fxsave->rip;
11467 	fpu->last_dp = fxsave->rdp;
11468 	memcpy(fpu->xmm, fxsave->xmm_space, sizeof(fxsave->xmm_space));
11469 
11470 	vcpu_put(vcpu);
11471 	return 0;
11472 }
11473 
11474 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
11475 {
11476 	struct fxregs_state *fxsave;
11477 
11478 	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
11479 		return 0;
11480 
11481 	vcpu_load(vcpu);
11482 
11483 	fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave;
11484 
11485 	memcpy(fxsave->st_space, fpu->fpr, 128);
11486 	fxsave->cwd = fpu->fcw;
11487 	fxsave->swd = fpu->fsw;
11488 	fxsave->twd = fpu->ftwx;
11489 	fxsave->fop = fpu->last_opcode;
11490 	fxsave->rip = fpu->last_ip;
11491 	fxsave->rdp = fpu->last_dp;
11492 	memcpy(fxsave->xmm_space, fpu->xmm, sizeof(fxsave->xmm_space));
11493 
11494 	vcpu_put(vcpu);
11495 	return 0;
11496 }
11497 
11498 static void store_regs(struct kvm_vcpu *vcpu)
11499 {
11500 	BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_SIZE_BYTES);
11501 
11502 	if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS)
11503 		__get_regs(vcpu, &vcpu->run->s.regs.regs);
11504 
11505 	if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS)
11506 		__get_sregs(vcpu, &vcpu->run->s.regs.sregs);
11507 
11508 	if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS)
11509 		kvm_vcpu_ioctl_x86_get_vcpu_events(
11510 				vcpu, &vcpu->run->s.regs.events);
11511 }
11512 
11513 static int sync_regs(struct kvm_vcpu *vcpu)
11514 {
11515 	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
11516 		__set_regs(vcpu, &vcpu->run->s.regs.regs);
11517 		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
11518 	}
11519 	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
11520 		if (__set_sregs(vcpu, &vcpu->run->s.regs.sregs))
11521 			return -EINVAL;
11522 		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
11523 	}
11524 	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
11525 		if (kvm_vcpu_ioctl_x86_set_vcpu_events(
11526 				vcpu, &vcpu->run->s.regs.events))
11527 			return -EINVAL;
11528 		vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
11529 	}
11530 
11531 	return 0;
11532 }
11533 
11534 int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
11535 {
11536 	if (kvm_check_tsc_unstable() && kvm->created_vcpus)
11537 		pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
11538 			     "guest TSC will not be reliable\n");
11539 
11540 	if (!kvm->arch.max_vcpu_ids)
11541 		kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS;
11542 
11543 	if (id >= kvm->arch.max_vcpu_ids)
11544 		return -EINVAL;
11545 
11546 	return static_call(kvm_x86_vcpu_precreate)(kvm);
11547 }
11548 
11549 int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
11550 {
11551 	struct page *page;
11552 	int r;
11553 
11554 	vcpu->arch.last_vmentry_cpu = -1;
11555 	vcpu->arch.regs_avail = ~0;
11556 	vcpu->arch.regs_dirty = ~0;
11557 
11558 	kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm, vcpu, KVM_HOST_USES_PFN);
11559 
11560 	if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
11561 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
11562 	else
11563 		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
11564 
11565 	r = kvm_mmu_create(vcpu);
11566 	if (r < 0)
11567 		return r;
11568 
11569 	if (irqchip_in_kernel(vcpu->kvm)) {
11570 		r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
11571 		if (r < 0)
11572 			goto fail_mmu_destroy;
11573 
11574 		/*
11575 		 * Defer evaluating inhibits until the vCPU is first run, as
11576 		 * this vCPU will not get notified of any changes until this
11577 		 * vCPU is visible to other vCPUs (marked online and added to
11578 		 * the set of vCPUs).  Opportunistically mark APICv active as
11579 		 * VMX in particularly is highly unlikely to have inhibits.
11580 		 * Ignore the current per-VM APICv state so that vCPU creation
11581 		 * is guaranteed to run with a deterministic value, the request
11582 		 * will ensure the vCPU gets the correct state before VM-Entry.
11583 		 */
11584 		if (enable_apicv) {
11585 			vcpu->arch.apic->apicv_active = true;
11586 			kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
11587 		}
11588 	} else
11589 		static_branch_inc(&kvm_has_noapic_vcpu);
11590 
11591 	r = -ENOMEM;
11592 
11593 	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
11594 	if (!page)
11595 		goto fail_free_lapic;
11596 	vcpu->arch.pio_data = page_address(page);
11597 
11598 	vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64),
11599 				       GFP_KERNEL_ACCOUNT);
11600 	vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64),
11601 					    GFP_KERNEL_ACCOUNT);
11602 	if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks)
11603 		goto fail_free_mce_banks;
11604 	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
11605 
11606 	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
11607 				GFP_KERNEL_ACCOUNT))
11608 		goto fail_free_mce_banks;
11609 
11610 	if (!alloc_emulate_ctxt(vcpu))
11611 		goto free_wbinvd_dirty_mask;
11612 
11613 	if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) {
11614 		pr_err("kvm: failed to allocate vcpu's fpu\n");
11615 		goto free_emulate_ctxt;
11616 	}
11617 
11618 	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
11619 	vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu);
11620 
11621 	vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
11622 
11623 	kvm_async_pf_hash_reset(vcpu);
11624 
11625 	vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap;
11626 	kvm_pmu_init(vcpu);
11627 
11628 	vcpu->arch.pending_external_vector = -1;
11629 	vcpu->arch.preempted_in_kernel = false;
11630 
11631 #if IS_ENABLED(CONFIG_HYPERV)
11632 	vcpu->arch.hv_root_tdp = INVALID_PAGE;
11633 #endif
11634 
11635 	r = static_call(kvm_x86_vcpu_create)(vcpu);
11636 	if (r)
11637 		goto free_guest_fpu;
11638 
11639 	vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
11640 	vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
11641 	kvm_xen_init_vcpu(vcpu);
11642 	kvm_vcpu_mtrr_init(vcpu);
11643 	vcpu_load(vcpu);
11644 	kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
11645 	kvm_vcpu_reset(vcpu, false);
11646 	kvm_init_mmu(vcpu);
11647 	vcpu_put(vcpu);
11648 	return 0;
11649 
11650 free_guest_fpu:
11651 	fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
11652 free_emulate_ctxt:
11653 	kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
11654 free_wbinvd_dirty_mask:
11655 	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
11656 fail_free_mce_banks:
11657 	kfree(vcpu->arch.mce_banks);
11658 	kfree(vcpu->arch.mci_ctl2_banks);
11659 	free_page((unsigned long)vcpu->arch.pio_data);
11660 fail_free_lapic:
11661 	kvm_free_lapic(vcpu);
11662 fail_mmu_destroy:
11663 	kvm_mmu_destroy(vcpu);
11664 	return r;
11665 }
11666 
11667 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
11668 {
11669 	struct kvm *kvm = vcpu->kvm;
11670 
11671 	if (mutex_lock_killable(&vcpu->mutex))
11672 		return;
11673 	vcpu_load(vcpu);
11674 	kvm_synchronize_tsc(vcpu, 0);
11675 	vcpu_put(vcpu);
11676 
11677 	/* poll control enabled by default */
11678 	vcpu->arch.msr_kvm_poll_control = 1;
11679 
11680 	mutex_unlock(&vcpu->mutex);
11681 
11682 	if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
11683 		schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
11684 						KVMCLOCK_SYNC_PERIOD);
11685 }
11686 
11687 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
11688 {
11689 	int idx;
11690 
11691 	kvmclock_reset(vcpu);
11692 
11693 	static_call(kvm_x86_vcpu_free)(vcpu);
11694 
11695 	kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
11696 	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
11697 	fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
11698 
11699 	kvm_xen_destroy_vcpu(vcpu);
11700 	kvm_hv_vcpu_uninit(vcpu);
11701 	kvm_pmu_destroy(vcpu);
11702 	kfree(vcpu->arch.mce_banks);
11703 	kfree(vcpu->arch.mci_ctl2_banks);
11704 	kvm_free_lapic(vcpu);
11705 	idx = srcu_read_lock(&vcpu->kvm->srcu);
11706 	kvm_mmu_destroy(vcpu);
11707 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
11708 	free_page((unsigned long)vcpu->arch.pio_data);
11709 	kvfree(vcpu->arch.cpuid_entries);
11710 	if (!lapic_in_kernel(vcpu))
11711 		static_branch_dec(&kvm_has_noapic_vcpu);
11712 }
11713 
11714 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
11715 {
11716 	struct kvm_cpuid_entry2 *cpuid_0x1;
11717 	unsigned long old_cr0 = kvm_read_cr0(vcpu);
11718 	unsigned long new_cr0;
11719 
11720 	/*
11721 	 * Several of the "set" flows, e.g. ->set_cr0(), read other registers
11722 	 * to handle side effects.  RESET emulation hits those flows and relies
11723 	 * on emulated/virtualized registers, including those that are loaded
11724 	 * into hardware, to be zeroed at vCPU creation.  Use CRs as a sentinel
11725 	 * to detect improper or missing initialization.
11726 	 */
11727 	WARN_ON_ONCE(!init_event &&
11728 		     (old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu)));
11729 
11730 	/*
11731 	 * SVM doesn't unconditionally VM-Exit on INIT and SHUTDOWN, thus it's
11732 	 * possible to INIT the vCPU while L2 is active.  Force the vCPU back
11733 	 * into L1 as EFER.SVME is cleared on INIT (along with all other EFER
11734 	 * bits), i.e. virtualization is disabled.
11735 	 */
11736 	if (is_guest_mode(vcpu))
11737 		kvm_leave_nested(vcpu);
11738 
11739 	kvm_lapic_reset(vcpu, init_event);
11740 
11741 	WARN_ON_ONCE(is_guest_mode(vcpu) || is_smm(vcpu));
11742 	vcpu->arch.hflags = 0;
11743 
11744 	vcpu->arch.smi_pending = 0;
11745 	vcpu->arch.smi_count = 0;
11746 	atomic_set(&vcpu->arch.nmi_queued, 0);
11747 	vcpu->arch.nmi_pending = 0;
11748 	vcpu->arch.nmi_injected = false;
11749 	kvm_clear_interrupt_queue(vcpu);
11750 	kvm_clear_exception_queue(vcpu);
11751 
11752 	memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
11753 	kvm_update_dr0123(vcpu);
11754 	vcpu->arch.dr6 = DR6_ACTIVE_LOW;
11755 	vcpu->arch.dr7 = DR7_FIXED_1;
11756 	kvm_update_dr7(vcpu);
11757 
11758 	vcpu->arch.cr2 = 0;
11759 
11760 	kvm_make_request(KVM_REQ_EVENT, vcpu);
11761 	vcpu->arch.apf.msr_en_val = 0;
11762 	vcpu->arch.apf.msr_int_val = 0;
11763 	vcpu->arch.st.msr_val = 0;
11764 
11765 	kvmclock_reset(vcpu);
11766 
11767 	kvm_clear_async_pf_completion_queue(vcpu);
11768 	kvm_async_pf_hash_reset(vcpu);
11769 	vcpu->arch.apf.halted = false;
11770 
11771 	if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) {
11772 		struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
11773 
11774 		/*
11775 		 * All paths that lead to INIT are required to load the guest's
11776 		 * FPU state (because most paths are buried in KVM_RUN).
11777 		 */
11778 		if (init_event)
11779 			kvm_put_guest_fpu(vcpu);
11780 
11781 		fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS);
11782 		fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR);
11783 
11784 		if (init_event)
11785 			kvm_load_guest_fpu(vcpu);
11786 	}
11787 
11788 	if (!init_event) {
11789 		kvm_pmu_reset(vcpu);
11790 		vcpu->arch.smbase = 0x30000;
11791 
11792 		vcpu->arch.msr_misc_features_enables = 0;
11793 		vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
11794 						  MSR_IA32_MISC_ENABLE_BTS_UNAVAIL;
11795 
11796 		__kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP);
11797 		__kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true);
11798 	}
11799 
11800 	/* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */
11801 	memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
11802 	kvm_register_mark_dirty(vcpu, VCPU_REGS_RSP);
11803 
11804 	/*
11805 	 * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon)
11806 	 * if no CPUID match is found.  Note, it's impossible to get a match at
11807 	 * RESET since KVM emulates RESET before exposing the vCPU to userspace,
11808 	 * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry
11809 	 * on RESET.  But, go through the motions in case that's ever remedied.
11810 	 */
11811 	cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1);
11812 	kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
11813 
11814 	static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
11815 
11816 	kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
11817 	kvm_rip_write(vcpu, 0xfff0);
11818 
11819 	vcpu->arch.cr3 = 0;
11820 	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
11821 
11822 	/*
11823 	 * CR0.CD/NW are set on RESET, preserved on INIT.  Note, some versions
11824 	 * of Intel's SDM list CD/NW as being set on INIT, but they contradict
11825 	 * (or qualify) that with a footnote stating that CD/NW are preserved.
11826 	 */
11827 	new_cr0 = X86_CR0_ET;
11828 	if (init_event)
11829 		new_cr0 |= (old_cr0 & (X86_CR0_NW | X86_CR0_CD));
11830 	else
11831 		new_cr0 |= X86_CR0_NW | X86_CR0_CD;
11832 
11833 	static_call(kvm_x86_set_cr0)(vcpu, new_cr0);
11834 	static_call(kvm_x86_set_cr4)(vcpu, 0);
11835 	static_call(kvm_x86_set_efer)(vcpu, 0);
11836 	static_call(kvm_x86_update_exception_bitmap)(vcpu);
11837 
11838 	/*
11839 	 * On the standard CR0/CR4/EFER modification paths, there are several
11840 	 * complex conditions determining whether the MMU has to be reset and/or
11841 	 * which PCIDs have to be flushed.  However, CR0.WP and the paging-related
11842 	 * bits in CR4 and EFER are irrelevant if CR0.PG was '0'; and a reset+flush
11843 	 * is needed anyway if CR0.PG was '1' (which can only happen for INIT, as
11844 	 * CR0 will be '0' prior to RESET).  So we only need to check CR0.PG here.
11845 	 */
11846 	if (old_cr0 & X86_CR0_PG) {
11847 		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
11848 		kvm_mmu_reset_context(vcpu);
11849 	}
11850 
11851 	/*
11852 	 * Intel's SDM states that all TLB entries are flushed on INIT.  AMD's
11853 	 * APM states the TLBs are untouched by INIT, but it also states that
11854 	 * the TLBs are flushed on "External initialization of the processor."
11855 	 * Flush the guest TLB regardless of vendor, there is no meaningful
11856 	 * benefit in relying on the guest to flush the TLB immediately after
11857 	 * INIT.  A spurious TLB flush is benign and likely negligible from a
11858 	 * performance perspective.
11859 	 */
11860 	if (init_event)
11861 		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
11862 }
11863 EXPORT_SYMBOL_GPL(kvm_vcpu_reset);
11864 
11865 void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
11866 {
11867 	struct kvm_segment cs;
11868 
11869 	kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
11870 	cs.selector = vector << 8;
11871 	cs.base = vector << 12;
11872 	kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
11873 	kvm_rip_write(vcpu, 0);
11874 }
11875 EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector);
11876 
11877 int kvm_arch_hardware_enable(void)
11878 {
11879 	struct kvm *kvm;
11880 	struct kvm_vcpu *vcpu;
11881 	unsigned long i;
11882 	int ret;
11883 	u64 local_tsc;
11884 	u64 max_tsc = 0;
11885 	bool stable, backwards_tsc = false;
11886 
11887 	kvm_user_return_msr_cpu_online();
11888 	ret = static_call(kvm_x86_hardware_enable)();
11889 	if (ret != 0)
11890 		return ret;
11891 
11892 	local_tsc = rdtsc();
11893 	stable = !kvm_check_tsc_unstable();
11894 	list_for_each_entry(kvm, &vm_list, vm_list) {
11895 		kvm_for_each_vcpu(i, vcpu, kvm) {
11896 			if (!stable && vcpu->cpu == smp_processor_id())
11897 				kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
11898 			if (stable && vcpu->arch.last_host_tsc > local_tsc) {
11899 				backwards_tsc = true;
11900 				if (vcpu->arch.last_host_tsc > max_tsc)
11901 					max_tsc = vcpu->arch.last_host_tsc;
11902 			}
11903 		}
11904 	}
11905 
11906 	/*
11907 	 * Sometimes, even reliable TSCs go backwards.  This happens on
11908 	 * platforms that reset TSC during suspend or hibernate actions, but
11909 	 * maintain synchronization.  We must compensate.  Fortunately, we can
11910 	 * detect that condition here, which happens early in CPU bringup,
11911 	 * before any KVM threads can be running.  Unfortunately, we can't
11912 	 * bring the TSCs fully up to date with real time, as we aren't yet far
11913 	 * enough into CPU bringup that we know how much real time has actually
11914 	 * elapsed; our helper function, ktime_get_boottime_ns() will be using boot
11915 	 * variables that haven't been updated yet.
11916 	 *
11917 	 * So we simply find the maximum observed TSC above, then record the
11918 	 * adjustment to TSC in each VCPU.  When the VCPU later gets loaded,
11919 	 * the adjustment will be applied.  Note that we accumulate
11920 	 * adjustments, in case multiple suspend cycles happen before some VCPU
11921 	 * gets a chance to run again.  In the event that no KVM threads get a
11922 	 * chance to run, we will miss the entire elapsed period, as we'll have
11923 	 * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
11924 	 * loose cycle time.  This isn't too big a deal, since the loss will be
11925 	 * uniform across all VCPUs (not to mention the scenario is extremely
11926 	 * unlikely). It is possible that a second hibernate recovery happens
11927 	 * much faster than a first, causing the observed TSC here to be
11928 	 * smaller; this would require additional padding adjustment, which is
11929 	 * why we set last_host_tsc to the local tsc observed here.
11930 	 *
11931 	 * N.B. - this code below runs only on platforms with reliable TSC,
11932 	 * as that is the only way backwards_tsc is set above.  Also note
11933 	 * that this runs for ALL vcpus, which is not a bug; all VCPUs should
11934 	 * have the same delta_cyc adjustment applied if backwards_tsc
11935 	 * is detected.  Note further, this adjustment is only done once,
11936 	 * as we reset last_host_tsc on all VCPUs to stop this from being
11937 	 * called multiple times (one for each physical CPU bringup).
11938 	 *
11939 	 * Platforms with unreliable TSCs don't have to deal with this, they
11940 	 * will be compensated by the logic in vcpu_load, which sets the TSC to
11941 	 * catchup mode.  This will catchup all VCPUs to real time, but cannot
11942 	 * guarantee that they stay in perfect synchronization.
11943 	 */
11944 	if (backwards_tsc) {
11945 		u64 delta_cyc = max_tsc - local_tsc;
11946 		list_for_each_entry(kvm, &vm_list, vm_list) {
11947 			kvm->arch.backwards_tsc_observed = true;
11948 			kvm_for_each_vcpu(i, vcpu, kvm) {
11949 				vcpu->arch.tsc_offset_adjustment += delta_cyc;
11950 				vcpu->arch.last_host_tsc = local_tsc;
11951 				kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
11952 			}
11953 
11954 			/*
11955 			 * We have to disable TSC offset matching.. if you were
11956 			 * booting a VM while issuing an S4 host suspend....
11957 			 * you may have some problem.  Solving this issue is
11958 			 * left as an exercise to the reader.
11959 			 */
11960 			kvm->arch.last_tsc_nsec = 0;
11961 			kvm->arch.last_tsc_write = 0;
11962 		}
11963 
11964 	}
11965 	return 0;
11966 }
11967 
11968 void kvm_arch_hardware_disable(void)
11969 {
11970 	static_call(kvm_x86_hardware_disable)();
11971 	drop_user_return_notifiers();
11972 }
11973 
11974 static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
11975 {
11976 	memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
11977 
11978 #define __KVM_X86_OP(func) \
11979 	static_call_update(kvm_x86_##func, kvm_x86_ops.func);
11980 #define KVM_X86_OP(func) \
11981 	WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
11982 #define KVM_X86_OP_OPTIONAL __KVM_X86_OP
11983 #define KVM_X86_OP_OPTIONAL_RET0(func) \
11984 	static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
11985 					   (void *)__static_call_return0);
11986 #include <asm/kvm-x86-ops.h>
11987 #undef __KVM_X86_OP
11988 
11989 	kvm_pmu_ops_update(ops->pmu_ops);
11990 }
11991 
11992 int kvm_arch_hardware_setup(void *opaque)
11993 {
11994 	struct kvm_x86_init_ops *ops = opaque;
11995 	int r;
11996 
11997 	rdmsrl_safe(MSR_EFER, &host_efer);
11998 
11999 	if (boot_cpu_has(X86_FEATURE_XSAVES))
12000 		rdmsrl(MSR_IA32_XSS, host_xss);
12001 
12002 	kvm_init_pmu_capability();
12003 
12004 	r = ops->hardware_setup();
12005 	if (r != 0)
12006 		return r;
12007 
12008 	kvm_ops_update(ops);
12009 
12010 	kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
12011 
12012 	if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
12013 		kvm_caps.supported_xss = 0;
12014 
12015 #define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
12016 	cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
12017 #undef __kvm_cpu_cap_has
12018 
12019 	if (kvm_caps.has_tsc_control) {
12020 		/*
12021 		 * Make sure the user can only configure tsc_khz values that
12022 		 * fit into a signed integer.
12023 		 * A min value is not calculated because it will always
12024 		 * be 1 on all machines.
12025 		 */
12026 		u64 max = min(0x7fffffffULL,
12027 			      __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz));
12028 		kvm_caps.max_guest_tsc_khz = max;
12029 	}
12030 	kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits;
12031 	kvm_init_msr_list();
12032 	return 0;
12033 }
12034 
12035 void kvm_arch_hardware_unsetup(void)
12036 {
12037 	kvm_unregister_perf_callbacks();
12038 
12039 	static_call(kvm_x86_hardware_unsetup)();
12040 }
12041 
12042 int kvm_arch_check_processor_compat(void *opaque)
12043 {
12044 	struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
12045 	struct kvm_x86_init_ops *ops = opaque;
12046 
12047 	WARN_ON(!irqs_disabled());
12048 
12049 	if (__cr4_reserved_bits(cpu_has, c) !=
12050 	    __cr4_reserved_bits(cpu_has, &boot_cpu_data))
12051 		return -EIO;
12052 
12053 	return ops->check_processor_compatibility();
12054 }
12055 
12056 bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
12057 {
12058 	return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
12059 }
12060 
12061 bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
12062 {
12063 	return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
12064 }
12065 
12066 __read_mostly DEFINE_STATIC_KEY_FALSE(kvm_has_noapic_vcpu);
12067 EXPORT_SYMBOL_GPL(kvm_has_noapic_vcpu);
12068 
12069 void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
12070 {
12071 	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
12072 
12073 	vcpu->arch.l1tf_flush_l1d = true;
12074 	if (pmu->version && unlikely(pmu->event_count)) {
12075 		pmu->need_cleanup = true;
12076 		kvm_make_request(KVM_REQ_PMU, vcpu);
12077 	}
12078 	static_call(kvm_x86_sched_in)(vcpu, cpu);
12079 }
12080 
12081 void kvm_arch_free_vm(struct kvm *kvm)
12082 {
12083 	kfree(to_kvm_hv(kvm)->hv_pa_pg);
12084 	__kvm_arch_free_vm(kvm);
12085 }
12086 
12087 
12088 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
12089 {
12090 	int ret;
12091 	unsigned long flags;
12092 
12093 	if (type)
12094 		return -EINVAL;
12095 
12096 	ret = kvm_page_track_init(kvm);
12097 	if (ret)
12098 		goto out;
12099 
12100 	ret = kvm_mmu_init_vm(kvm);
12101 	if (ret)
12102 		goto out_page_track;
12103 
12104 	ret = static_call(kvm_x86_vm_init)(kvm);
12105 	if (ret)
12106 		goto out_uninit_mmu;
12107 
12108 	INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
12109 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
12110 	atomic_set(&kvm->arch.noncoherent_dma_count, 0);
12111 
12112 	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
12113 	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
12114 	/* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
12115 	set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
12116 		&kvm->arch.irq_sources_bitmap);
12117 
12118 	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
12119 	mutex_init(&kvm->arch.apic_map_lock);
12120 	seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock);
12121 	kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
12122 
12123 	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
12124 	pvclock_update_vm_gtod_copy(kvm);
12125 	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
12126 
12127 	kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
12128 	kvm->arch.guest_can_read_msr_platform_info = true;
12129 	kvm->arch.enable_pmu = enable_pmu;
12130 
12131 #if IS_ENABLED(CONFIG_HYPERV)
12132 	spin_lock_init(&kvm->arch.hv_root_tdp_lock);
12133 	kvm->arch.hv_root_tdp = INVALID_PAGE;
12134 #endif
12135 
12136 	INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
12137 	INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
12138 
12139 	kvm_apicv_init(kvm);
12140 	kvm_hv_init_vm(kvm);
12141 	kvm_xen_init_vm(kvm);
12142 
12143 	return 0;
12144 
12145 out_uninit_mmu:
12146 	kvm_mmu_uninit_vm(kvm);
12147 out_page_track:
12148 	kvm_page_track_cleanup(kvm);
12149 out:
12150 	return ret;
12151 }
12152 
12153 int kvm_arch_post_init_vm(struct kvm *kvm)
12154 {
12155 	return kvm_mmu_post_init_vm(kvm);
12156 }
12157 
12158 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
12159 {
12160 	vcpu_load(vcpu);
12161 	kvm_mmu_unload(vcpu);
12162 	vcpu_put(vcpu);
12163 }
12164 
12165 static void kvm_unload_vcpu_mmus(struct kvm *kvm)
12166 {
12167 	unsigned long i;
12168 	struct kvm_vcpu *vcpu;
12169 
12170 	kvm_for_each_vcpu(i, vcpu, kvm) {
12171 		kvm_clear_async_pf_completion_queue(vcpu);
12172 		kvm_unload_vcpu_mmu(vcpu);
12173 	}
12174 }
12175 
12176 void kvm_arch_sync_events(struct kvm *kvm)
12177 {
12178 	cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
12179 	cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
12180 	kvm_free_pit(kvm);
12181 }
12182 
12183 /**
12184  * __x86_set_memory_region: Setup KVM internal memory slot
12185  *
12186  * @kvm: the kvm pointer to the VM.
12187  * @id: the slot ID to setup.
12188  * @gpa: the GPA to install the slot (unused when @size == 0).
12189  * @size: the size of the slot. Set to zero to uninstall a slot.
12190  *
12191  * This function helps to setup a KVM internal memory slot.  Specify
12192  * @size > 0 to install a new slot, while @size == 0 to uninstall a
12193  * slot.  The return code can be one of the following:
12194  *
12195  *   HVA:           on success (uninstall will return a bogus HVA)
12196  *   -errno:        on error
12197  *
12198  * The caller should always use IS_ERR() to check the return value
12199  * before use.  Note, the KVM internal memory slots are guaranteed to
12200  * remain valid and unchanged until the VM is destroyed, i.e., the
12201  * GPA->HVA translation will not change.  However, the HVA is a user
12202  * address, i.e. its accessibility is not guaranteed, and must be
12203  * accessed via __copy_{to,from}_user().
12204  */
12205 void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa,
12206 				      u32 size)
12207 {
12208 	int i, r;
12209 	unsigned long hva, old_npages;
12210 	struct kvm_memslots *slots = kvm_memslots(kvm);
12211 	struct kvm_memory_slot *slot;
12212 
12213 	/* Called with kvm->slots_lock held.  */
12214 	if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
12215 		return ERR_PTR_USR(-EINVAL);
12216 
12217 	slot = id_to_memslot(slots, id);
12218 	if (size) {
12219 		if (slot && slot->npages)
12220 			return ERR_PTR_USR(-EEXIST);
12221 
12222 		/*
12223 		 * MAP_SHARED to prevent internal slot pages from being moved
12224 		 * by fork()/COW.
12225 		 */
12226 		hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE,
12227 			      MAP_SHARED | MAP_ANONYMOUS, 0);
12228 		if (IS_ERR((void *)hva))
12229 			return (void __user *)hva;
12230 	} else {
12231 		if (!slot || !slot->npages)
12232 			return NULL;
12233 
12234 		old_npages = slot->npages;
12235 		hva = slot->userspace_addr;
12236 	}
12237 
12238 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
12239 		struct kvm_userspace_memory_region m;
12240 
12241 		m.slot = id | (i << 16);
12242 		m.flags = 0;
12243 		m.guest_phys_addr = gpa;
12244 		m.userspace_addr = hva;
12245 		m.memory_size = size;
12246 		r = __kvm_set_memory_region(kvm, &m);
12247 		if (r < 0)
12248 			return ERR_PTR_USR(r);
12249 	}
12250 
12251 	if (!size)
12252 		vm_munmap(hva, old_npages * PAGE_SIZE);
12253 
12254 	return (void __user *)hva;
12255 }
12256 EXPORT_SYMBOL_GPL(__x86_set_memory_region);
12257 
12258 void kvm_arch_pre_destroy_vm(struct kvm *kvm)
12259 {
12260 	kvm_mmu_pre_destroy_vm(kvm);
12261 }
12262 
12263 void kvm_arch_destroy_vm(struct kvm *kvm)
12264 {
12265 	if (current->mm == kvm->mm) {
12266 		/*
12267 		 * Free memory regions allocated on behalf of userspace,
12268 		 * unless the memory map has changed due to process exit
12269 		 * or fd copying.
12270 		 */
12271 		mutex_lock(&kvm->slots_lock);
12272 		__x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
12273 					0, 0);
12274 		__x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
12275 					0, 0);
12276 		__x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
12277 		mutex_unlock(&kvm->slots_lock);
12278 	}
12279 	kvm_unload_vcpu_mmus(kvm);
12280 	static_call_cond(kvm_x86_vm_destroy)(kvm);
12281 	kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
12282 	kvm_pic_destroy(kvm);
12283 	kvm_ioapic_destroy(kvm);
12284 	kvm_destroy_vcpus(kvm);
12285 	kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
12286 	kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
12287 	kvm_mmu_uninit_vm(kvm);
12288 	kvm_page_track_cleanup(kvm);
12289 	kvm_xen_destroy_vm(kvm);
12290 	kvm_hv_destroy_vm(kvm);
12291 }
12292 
12293 static void memslot_rmap_free(struct kvm_memory_slot *slot)
12294 {
12295 	int i;
12296 
12297 	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
12298 		kvfree(slot->arch.rmap[i]);
12299 		slot->arch.rmap[i] = NULL;
12300 	}
12301 }
12302 
12303 void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
12304 {
12305 	int i;
12306 
12307 	memslot_rmap_free(slot);
12308 
12309 	for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
12310 		kvfree(slot->arch.lpage_info[i - 1]);
12311 		slot->arch.lpage_info[i - 1] = NULL;
12312 	}
12313 
12314 	kvm_page_track_free_memslot(slot);
12315 }
12316 
12317 int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages)
12318 {
12319 	const int sz = sizeof(*slot->arch.rmap[0]);
12320 	int i;
12321 
12322 	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
12323 		int level = i + 1;
12324 		int lpages = __kvm_mmu_slot_lpages(slot, npages, level);
12325 
12326 		if (slot->arch.rmap[i])
12327 			continue;
12328 
12329 		slot->arch.rmap[i] = __vcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
12330 		if (!slot->arch.rmap[i]) {
12331 			memslot_rmap_free(slot);
12332 			return -ENOMEM;
12333 		}
12334 	}
12335 
12336 	return 0;
12337 }
12338 
12339 static int kvm_alloc_memslot_metadata(struct kvm *kvm,
12340 				      struct kvm_memory_slot *slot)
12341 {
12342 	unsigned long npages = slot->npages;
12343 	int i, r;
12344 
12345 	/*
12346 	 * Clear out the previous array pointers for the KVM_MR_MOVE case.  The
12347 	 * old arrays will be freed by __kvm_set_memory_region() if installing
12348 	 * the new memslot is successful.
12349 	 */
12350 	memset(&slot->arch, 0, sizeof(slot->arch));
12351 
12352 	if (kvm_memslots_have_rmaps(kvm)) {
12353 		r = memslot_rmap_alloc(slot, npages);
12354 		if (r)
12355 			return r;
12356 	}
12357 
12358 	for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
12359 		struct kvm_lpage_info *linfo;
12360 		unsigned long ugfn;
12361 		int lpages;
12362 		int level = i + 1;
12363 
12364 		lpages = __kvm_mmu_slot_lpages(slot, npages, level);
12365 
12366 		linfo = __vcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
12367 		if (!linfo)
12368 			goto out_free;
12369 
12370 		slot->arch.lpage_info[i - 1] = linfo;
12371 
12372 		if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
12373 			linfo[0].disallow_lpage = 1;
12374 		if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
12375 			linfo[lpages - 1].disallow_lpage = 1;
12376 		ugfn = slot->userspace_addr >> PAGE_SHIFT;
12377 		/*
12378 		 * If the gfn and userspace address are not aligned wrt each
12379 		 * other, disable large page support for this slot.
12380 		 */
12381 		if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
12382 			unsigned long j;
12383 
12384 			for (j = 0; j < lpages; ++j)
12385 				linfo[j].disallow_lpage = 1;
12386 		}
12387 	}
12388 
12389 	if (kvm_page_track_create_memslot(kvm, slot, npages))
12390 		goto out_free;
12391 
12392 	return 0;
12393 
12394 out_free:
12395 	memslot_rmap_free(slot);
12396 
12397 	for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
12398 		kvfree(slot->arch.lpage_info[i - 1]);
12399 		slot->arch.lpage_info[i - 1] = NULL;
12400 	}
12401 	return -ENOMEM;
12402 }
12403 
12404 void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
12405 {
12406 	struct kvm_vcpu *vcpu;
12407 	unsigned long i;
12408 
12409 	/*
12410 	 * memslots->generation has been incremented.
12411 	 * mmio generation may have reached its maximum value.
12412 	 */
12413 	kvm_mmu_invalidate_mmio_sptes(kvm, gen);
12414 
12415 	/* Force re-initialization of steal_time cache */
12416 	kvm_for_each_vcpu(i, vcpu, kvm)
12417 		kvm_vcpu_kick(vcpu);
12418 }
12419 
12420 int kvm_arch_prepare_memory_region(struct kvm *kvm,
12421 				   const struct kvm_memory_slot *old,
12422 				   struct kvm_memory_slot *new,
12423 				   enum kvm_mr_change change)
12424 {
12425 	if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) {
12426 		if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
12427 			return -EINVAL;
12428 
12429 		return kvm_alloc_memslot_metadata(kvm, new);
12430 	}
12431 
12432 	if (change == KVM_MR_FLAGS_ONLY)
12433 		memcpy(&new->arch, &old->arch, sizeof(old->arch));
12434 	else if (WARN_ON_ONCE(change != KVM_MR_DELETE))
12435 		return -EIO;
12436 
12437 	return 0;
12438 }
12439 
12440 
12441 static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable)
12442 {
12443 	struct kvm_arch *ka = &kvm->arch;
12444 
12445 	if (!kvm_x86_ops.cpu_dirty_log_size)
12446 		return;
12447 
12448 	if ((enable && ++ka->cpu_dirty_logging_count == 1) ||
12449 	    (!enable && --ka->cpu_dirty_logging_count == 0))
12450 		kvm_make_all_cpus_request(kvm, KVM_REQ_UPDATE_CPU_DIRTY_LOGGING);
12451 
12452 	WARN_ON_ONCE(ka->cpu_dirty_logging_count < 0);
12453 }
12454 
12455 static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
12456 				     struct kvm_memory_slot *old,
12457 				     const struct kvm_memory_slot *new,
12458 				     enum kvm_mr_change change)
12459 {
12460 	u32 old_flags = old ? old->flags : 0;
12461 	u32 new_flags = new ? new->flags : 0;
12462 	bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES;
12463 
12464 	/*
12465 	 * Update CPU dirty logging if dirty logging is being toggled.  This
12466 	 * applies to all operations.
12467 	 */
12468 	if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)
12469 		kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages);
12470 
12471 	/*
12472 	 * Nothing more to do for RO slots (which can't be dirtied and can't be
12473 	 * made writable) or CREATE/MOVE/DELETE of a slot.
12474 	 *
12475 	 * For a memslot with dirty logging disabled:
12476 	 * CREATE:      No dirty mappings will already exist.
12477 	 * MOVE/DELETE: The old mappings will already have been cleaned up by
12478 	 *		kvm_arch_flush_shadow_memslot()
12479 	 *
12480 	 * For a memslot with dirty logging enabled:
12481 	 * CREATE:      No shadow pages exist, thus nothing to write-protect
12482 	 *		and no dirty bits to clear.
12483 	 * MOVE/DELETE: The old mappings will already have been cleaned up by
12484 	 *		kvm_arch_flush_shadow_memslot().
12485 	 */
12486 	if ((change != KVM_MR_FLAGS_ONLY) || (new_flags & KVM_MEM_READONLY))
12487 		return;
12488 
12489 	/*
12490 	 * READONLY and non-flags changes were filtered out above, and the only
12491 	 * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty
12492 	 * logging isn't being toggled on or off.
12493 	 */
12494 	if (WARN_ON_ONCE(!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)))
12495 		return;
12496 
12497 	if (!log_dirty_pages) {
12498 		/*
12499 		 * Dirty logging tracks sptes in 4k granularity, meaning that
12500 		 * large sptes have to be split.  If live migration succeeds,
12501 		 * the guest in the source machine will be destroyed and large
12502 		 * sptes will be created in the destination.  However, if the
12503 		 * guest continues to run in the source machine (for example if
12504 		 * live migration fails), small sptes will remain around and
12505 		 * cause bad performance.
12506 		 *
12507 		 * Scan sptes if dirty logging has been stopped, dropping those
12508 		 * which can be collapsed into a single large-page spte.  Later
12509 		 * page faults will create the large-page sptes.
12510 		 */
12511 		kvm_mmu_zap_collapsible_sptes(kvm, new);
12512 	} else {
12513 		/*
12514 		 * Initially-all-set does not require write protecting any page,
12515 		 * because they're all assumed to be dirty.
12516 		 */
12517 		if (kvm_dirty_log_manual_protect_and_init_set(kvm))
12518 			return;
12519 
12520 		if (READ_ONCE(eager_page_split))
12521 			kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K);
12522 
12523 		if (kvm_x86_ops.cpu_dirty_log_size) {
12524 			kvm_mmu_slot_leaf_clear_dirty(kvm, new);
12525 			kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
12526 		} else {
12527 			kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K);
12528 		}
12529 
12530 		/*
12531 		 * Unconditionally flush the TLBs after enabling dirty logging.
12532 		 * A flush is almost always going to be necessary (see below),
12533 		 * and unconditionally flushing allows the helpers to omit
12534 		 * the subtly complex checks when removing write access.
12535 		 *
12536 		 * Do the flush outside of mmu_lock to reduce the amount of
12537 		 * time mmu_lock is held.  Flushing after dropping mmu_lock is
12538 		 * safe as KVM only needs to guarantee the slot is fully
12539 		 * write-protected before returning to userspace, i.e. before
12540 		 * userspace can consume the dirty status.
12541 		 *
12542 		 * Flushing outside of mmu_lock requires KVM to be careful when
12543 		 * making decisions based on writable status of an SPTE, e.g. a
12544 		 * !writable SPTE doesn't guarantee a CPU can't perform writes.
12545 		 *
12546 		 * Specifically, KVM also write-protects guest page tables to
12547 		 * monitor changes when using shadow paging, and must guarantee
12548 		 * no CPUs can write to those page before mmu_lock is dropped.
12549 		 * Because CPUs may have stale TLB entries at this point, a
12550 		 * !writable SPTE doesn't guarantee CPUs can't perform writes.
12551 		 *
12552 		 * KVM also allows making SPTES writable outside of mmu_lock,
12553 		 * e.g. to allow dirty logging without taking mmu_lock.
12554 		 *
12555 		 * To handle these scenarios, KVM uses a separate software-only
12556 		 * bit (MMU-writable) to track if a SPTE is !writable due to
12557 		 * a guest page table being write-protected (KVM clears the
12558 		 * MMU-writable flag when write-protecting for shadow paging).
12559 		 *
12560 		 * The use of MMU-writable is also the primary motivation for
12561 		 * the unconditional flush.  Because KVM must guarantee that a
12562 		 * CPU doesn't contain stale, writable TLB entries for a
12563 		 * !MMU-writable SPTE, KVM must flush if it encounters any
12564 		 * MMU-writable SPTE regardless of whether the actual hardware
12565 		 * writable bit was set.  I.e. KVM is almost guaranteed to need
12566 		 * to flush, while unconditionally flushing allows the "remove
12567 		 * write access" helpers to ignore MMU-writable entirely.
12568 		 *
12569 		 * See is_writable_pte() for more details (the case involving
12570 		 * access-tracked SPTEs is particularly relevant).
12571 		 */
12572 		kvm_arch_flush_remote_tlbs_memslot(kvm, new);
12573 	}
12574 }
12575 
12576 void kvm_arch_commit_memory_region(struct kvm *kvm,
12577 				struct kvm_memory_slot *old,
12578 				const struct kvm_memory_slot *new,
12579 				enum kvm_mr_change change)
12580 {
12581 	if (!kvm->arch.n_requested_mmu_pages &&
12582 	    (change == KVM_MR_CREATE || change == KVM_MR_DELETE)) {
12583 		unsigned long nr_mmu_pages;
12584 
12585 		nr_mmu_pages = kvm->nr_memslot_pages / KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO;
12586 		nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
12587 		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
12588 	}
12589 
12590 	kvm_mmu_slot_apply_flags(kvm, old, new, change);
12591 
12592 	/* Free the arrays associated with the old memslot. */
12593 	if (change == KVM_MR_MOVE)
12594 		kvm_arch_free_memslot(kvm, old);
12595 }
12596 
12597 void kvm_arch_flush_shadow_all(struct kvm *kvm)
12598 {
12599 	kvm_mmu_zap_all(kvm);
12600 }
12601 
12602 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
12603 				   struct kvm_memory_slot *slot)
12604 {
12605 	kvm_page_track_flush_slot(kvm, slot);
12606 }
12607 
12608 static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
12609 {
12610 	return (is_guest_mode(vcpu) &&
12611 		static_call(kvm_x86_guest_apic_has_interrupt)(vcpu));
12612 }
12613 
12614 static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
12615 {
12616 	if (!list_empty_careful(&vcpu->async_pf.done))
12617 		return true;
12618 
12619 	if (kvm_apic_has_pending_init_or_sipi(vcpu) &&
12620 	    kvm_apic_init_sipi_allowed(vcpu))
12621 		return true;
12622 
12623 	if (vcpu->arch.pv.pv_unhalted)
12624 		return true;
12625 
12626 	if (kvm_is_exception_pending(vcpu))
12627 		return true;
12628 
12629 	if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
12630 	    (vcpu->arch.nmi_pending &&
12631 	     static_call(kvm_x86_nmi_allowed)(vcpu, false)))
12632 		return true;
12633 
12634 #ifdef CONFIG_KVM_SMM
12635 	if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
12636 	    (vcpu->arch.smi_pending &&
12637 	     static_call(kvm_x86_smi_allowed)(vcpu, false)))
12638 		return true;
12639 #endif
12640 
12641 	if (kvm_arch_interrupt_allowed(vcpu) &&
12642 	    (kvm_cpu_has_interrupt(vcpu) ||
12643 	    kvm_guest_apic_has_interrupt(vcpu)))
12644 		return true;
12645 
12646 	if (kvm_hv_has_stimer_pending(vcpu))
12647 		return true;
12648 
12649 	if (is_guest_mode(vcpu) &&
12650 	    kvm_x86_ops.nested_ops->has_events &&
12651 	    kvm_x86_ops.nested_ops->has_events(vcpu))
12652 		return true;
12653 
12654 	if (kvm_xen_has_pending_events(vcpu))
12655 		return true;
12656 
12657 	return false;
12658 }
12659 
12660 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
12661 {
12662 	return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
12663 }
12664 
12665 bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
12666 {
12667 	if (kvm_vcpu_apicv_active(vcpu) &&
12668 	    static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu))
12669 		return true;
12670 
12671 	return false;
12672 }
12673 
12674 bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
12675 {
12676 	if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
12677 		return true;
12678 
12679 	if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
12680 #ifdef CONFIG_KVM_SMM
12681 		kvm_test_request(KVM_REQ_SMI, vcpu) ||
12682 #endif
12683 		 kvm_test_request(KVM_REQ_EVENT, vcpu))
12684 		return true;
12685 
12686 	return kvm_arch_dy_has_pending_interrupt(vcpu);
12687 }
12688 
12689 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
12690 {
12691 	if (vcpu->arch.guest_state_protected)
12692 		return true;
12693 
12694 	return vcpu->arch.preempted_in_kernel;
12695 }
12696 
12697 unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
12698 {
12699 	return kvm_rip_read(vcpu);
12700 }
12701 
12702 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
12703 {
12704 	return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
12705 }
12706 
12707 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
12708 {
12709 	return static_call(kvm_x86_interrupt_allowed)(vcpu, false);
12710 }
12711 
12712 unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
12713 {
12714 	/* Can't read the RIP when guest state is protected, just return 0 */
12715 	if (vcpu->arch.guest_state_protected)
12716 		return 0;
12717 
12718 	if (is_64_bit_mode(vcpu))
12719 		return kvm_rip_read(vcpu);
12720 	return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) +
12721 		     kvm_rip_read(vcpu));
12722 }
12723 EXPORT_SYMBOL_GPL(kvm_get_linear_rip);
12724 
12725 bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
12726 {
12727 	return kvm_get_linear_rip(vcpu) == linear_rip;
12728 }
12729 EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
12730 
12731 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
12732 {
12733 	unsigned long rflags;
12734 
12735 	rflags = static_call(kvm_x86_get_rflags)(vcpu);
12736 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
12737 		rflags &= ~X86_EFLAGS_TF;
12738 	return rflags;
12739 }
12740 EXPORT_SYMBOL_GPL(kvm_get_rflags);
12741 
12742 static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
12743 {
12744 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
12745 	    kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
12746 		rflags |= X86_EFLAGS_TF;
12747 	static_call(kvm_x86_set_rflags)(vcpu, rflags);
12748 }
12749 
12750 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
12751 {
12752 	__kvm_set_rflags(vcpu, rflags);
12753 	kvm_make_request(KVM_REQ_EVENT, vcpu);
12754 }
12755 EXPORT_SYMBOL_GPL(kvm_set_rflags);
12756 
12757 static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
12758 {
12759 	BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU));
12760 
12761 	return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
12762 }
12763 
12764 static inline u32 kvm_async_pf_next_probe(u32 key)
12765 {
12766 	return (key + 1) & (ASYNC_PF_PER_VCPU - 1);
12767 }
12768 
12769 static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
12770 {
12771 	u32 key = kvm_async_pf_hash_fn(gfn);
12772 
12773 	while (vcpu->arch.apf.gfns[key] != ~0)
12774 		key = kvm_async_pf_next_probe(key);
12775 
12776 	vcpu->arch.apf.gfns[key] = gfn;
12777 }
12778 
12779 static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
12780 {
12781 	int i;
12782 	u32 key = kvm_async_pf_hash_fn(gfn);
12783 
12784 	for (i = 0; i < ASYNC_PF_PER_VCPU &&
12785 		     (vcpu->arch.apf.gfns[key] != gfn &&
12786 		      vcpu->arch.apf.gfns[key] != ~0); i++)
12787 		key = kvm_async_pf_next_probe(key);
12788 
12789 	return key;
12790 }
12791 
12792 bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
12793 {
12794 	return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
12795 }
12796 
12797 static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
12798 {
12799 	u32 i, j, k;
12800 
12801 	i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
12802 
12803 	if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn))
12804 		return;
12805 
12806 	while (true) {
12807 		vcpu->arch.apf.gfns[i] = ~0;
12808 		do {
12809 			j = kvm_async_pf_next_probe(j);
12810 			if (vcpu->arch.apf.gfns[j] == ~0)
12811 				return;
12812 			k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
12813 			/*
12814 			 * k lies cyclically in ]i,j]
12815 			 * |    i.k.j |
12816 			 * |....j i.k.| or  |.k..j i...|
12817 			 */
12818 		} while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
12819 		vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
12820 		i = j;
12821 	}
12822 }
12823 
12824 static inline int apf_put_user_notpresent(struct kvm_vcpu *vcpu)
12825 {
12826 	u32 reason = KVM_PV_REASON_PAGE_NOT_PRESENT;
12827 
12828 	return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason,
12829 				      sizeof(reason));
12830 }
12831 
12832 static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token)
12833 {
12834 	unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
12835 
12836 	return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
12837 					     &token, offset, sizeof(token));
12838 }
12839 
12840 static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu)
12841 {
12842 	unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
12843 	u32 val;
12844 
12845 	if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
12846 					 &val, offset, sizeof(val)))
12847 		return false;
12848 
12849 	return !val;
12850 }
12851 
12852 static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
12853 {
12854 
12855 	if (!kvm_pv_async_pf_enabled(vcpu))
12856 		return false;
12857 
12858 	if (vcpu->arch.apf.send_user_only &&
12859 	    static_call(kvm_x86_get_cpl)(vcpu) == 0)
12860 		return false;
12861 
12862 	if (is_guest_mode(vcpu)) {
12863 		/*
12864 		 * L1 needs to opt into the special #PF vmexits that are
12865 		 * used to deliver async page faults.
12866 		 */
12867 		return vcpu->arch.apf.delivery_as_pf_vmexit;
12868 	} else {
12869 		/*
12870 		 * Play it safe in case the guest temporarily disables paging.
12871 		 * The real mode IDT in particular is unlikely to have a #PF
12872 		 * exception setup.
12873 		 */
12874 		return is_paging(vcpu);
12875 	}
12876 }
12877 
12878 bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
12879 {
12880 	if (unlikely(!lapic_in_kernel(vcpu) ||
12881 		     kvm_event_needs_reinjection(vcpu) ||
12882 		     kvm_is_exception_pending(vcpu)))
12883 		return false;
12884 
12885 	if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
12886 		return false;
12887 
12888 	/*
12889 	 * If interrupts are off we cannot even use an artificial
12890 	 * halt state.
12891 	 */
12892 	return kvm_arch_interrupt_allowed(vcpu);
12893 }
12894 
12895 bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
12896 				     struct kvm_async_pf *work)
12897 {
12898 	struct x86_exception fault;
12899 
12900 	trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
12901 	kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
12902 
12903 	if (kvm_can_deliver_async_pf(vcpu) &&
12904 	    !apf_put_user_notpresent(vcpu)) {
12905 		fault.vector = PF_VECTOR;
12906 		fault.error_code_valid = true;
12907 		fault.error_code = 0;
12908 		fault.nested_page_fault = false;
12909 		fault.address = work->arch.token;
12910 		fault.async_page_fault = true;
12911 		kvm_inject_page_fault(vcpu, &fault);
12912 		return true;
12913 	} else {
12914 		/*
12915 		 * It is not possible to deliver a paravirtualized asynchronous
12916 		 * page fault, but putting the guest in an artificial halt state
12917 		 * can be beneficial nevertheless: if an interrupt arrives, we
12918 		 * can deliver it timely and perhaps the guest will schedule
12919 		 * another process.  When the instruction that triggered a page
12920 		 * fault is retried, hopefully the page will be ready in the host.
12921 		 */
12922 		kvm_make_request(KVM_REQ_APF_HALT, vcpu);
12923 		return false;
12924 	}
12925 }
12926 
12927 void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
12928 				 struct kvm_async_pf *work)
12929 {
12930 	struct kvm_lapic_irq irq = {
12931 		.delivery_mode = APIC_DM_FIXED,
12932 		.vector = vcpu->arch.apf.vec
12933 	};
12934 
12935 	if (work->wakeup_all)
12936 		work->arch.token = ~0; /* broadcast wakeup */
12937 	else
12938 		kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
12939 	trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
12940 
12941 	if ((work->wakeup_all || work->notpresent_injected) &&
12942 	    kvm_pv_async_pf_enabled(vcpu) &&
12943 	    !apf_put_user_ready(vcpu, work->arch.token)) {
12944 		vcpu->arch.apf.pageready_pending = true;
12945 		kvm_apic_set_irq(vcpu, &irq, NULL);
12946 	}
12947 
12948 	vcpu->arch.apf.halted = false;
12949 	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
12950 }
12951 
12952 void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu)
12953 {
12954 	kvm_make_request(KVM_REQ_APF_READY, vcpu);
12955 	if (!vcpu->arch.apf.pageready_pending)
12956 		kvm_vcpu_kick(vcpu);
12957 }
12958 
12959 bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
12960 {
12961 	if (!kvm_pv_async_pf_enabled(vcpu))
12962 		return true;
12963 	else
12964 		return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu);
12965 }
12966 
12967 void kvm_arch_start_assignment(struct kvm *kvm)
12968 {
12969 	if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
12970 		static_call_cond(kvm_x86_pi_start_assignment)(kvm);
12971 }
12972 EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
12973 
12974 void kvm_arch_end_assignment(struct kvm *kvm)
12975 {
12976 	atomic_dec(&kvm->arch.assigned_device_count);
12977 }
12978 EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
12979 
12980 bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
12981 {
12982 	return arch_atomic_read(&kvm->arch.assigned_device_count);
12983 }
12984 EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
12985 
12986 void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
12987 {
12988 	atomic_inc(&kvm->arch.noncoherent_dma_count);
12989 }
12990 EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
12991 
12992 void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
12993 {
12994 	atomic_dec(&kvm->arch.noncoherent_dma_count);
12995 }
12996 EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
12997 
12998 bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
12999 {
13000 	return atomic_read(&kvm->arch.noncoherent_dma_count);
13001 }
13002 EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
13003 
13004 bool kvm_arch_has_irq_bypass(void)
13005 {
13006 	return true;
13007 }
13008 
13009 int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
13010 				      struct irq_bypass_producer *prod)
13011 {
13012 	struct kvm_kernel_irqfd *irqfd =
13013 		container_of(cons, struct kvm_kernel_irqfd, consumer);
13014 	int ret;
13015 
13016 	irqfd->producer = prod;
13017 	kvm_arch_start_assignment(irqfd->kvm);
13018 	ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm,
13019 					 prod->irq, irqfd->gsi, 1);
13020 
13021 	if (ret)
13022 		kvm_arch_end_assignment(irqfd->kvm);
13023 
13024 	return ret;
13025 }
13026 
13027 void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
13028 				      struct irq_bypass_producer *prod)
13029 {
13030 	int ret;
13031 	struct kvm_kernel_irqfd *irqfd =
13032 		container_of(cons, struct kvm_kernel_irqfd, consumer);
13033 
13034 	WARN_ON(irqfd->producer != prod);
13035 	irqfd->producer = NULL;
13036 
13037 	/*
13038 	 * When producer of consumer is unregistered, we change back to
13039 	 * remapped mode, so we can re-use the current implementation
13040 	 * when the irq is masked/disabled or the consumer side (KVM
13041 	 * int this case doesn't want to receive the interrupts.
13042 	*/
13043 	ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
13044 	if (ret)
13045 		printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
13046 		       " fails: %d\n", irqfd->consumer.token, ret);
13047 
13048 	kvm_arch_end_assignment(irqfd->kvm);
13049 }
13050 
13051 int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
13052 				   uint32_t guest_irq, bool set)
13053 {
13054 	return static_call(kvm_x86_pi_update_irte)(kvm, host_irq, guest_irq, set);
13055 }
13056 
13057 bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
13058 				  struct kvm_kernel_irq_routing_entry *new)
13059 {
13060 	if (new->type != KVM_IRQ_ROUTING_MSI)
13061 		return true;
13062 
13063 	return !!memcmp(&old->msi, &new->msi, sizeof(new->msi));
13064 }
13065 
13066 bool kvm_vector_hashing_enabled(void)
13067 {
13068 	return vector_hashing;
13069 }
13070 
13071 bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
13072 {
13073 	return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
13074 }
13075 EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
13076 
13077 
13078 int kvm_spec_ctrl_test_value(u64 value)
13079 {
13080 	/*
13081 	 * test that setting IA32_SPEC_CTRL to given value
13082 	 * is allowed by the host processor
13083 	 */
13084 
13085 	u64 saved_value;
13086 	unsigned long flags;
13087 	int ret = 0;
13088 
13089 	local_irq_save(flags);
13090 
13091 	if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value))
13092 		ret = 1;
13093 	else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value))
13094 		ret = 1;
13095 	else
13096 		wrmsrl(MSR_IA32_SPEC_CTRL, saved_value);
13097 
13098 	local_irq_restore(flags);
13099 
13100 	return ret;
13101 }
13102 EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
13103 
13104 void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
13105 {
13106 	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
13107 	struct x86_exception fault;
13108 	u64 access = error_code &
13109 		(PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
13110 
13111 	if (!(error_code & PFERR_PRESENT_MASK) ||
13112 	    mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != INVALID_GPA) {
13113 		/*
13114 		 * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
13115 		 * tables probably do not match the TLB.  Just proceed
13116 		 * with the error code that the processor gave.
13117 		 */
13118 		fault.vector = PF_VECTOR;
13119 		fault.error_code_valid = true;
13120 		fault.error_code = error_code;
13121 		fault.nested_page_fault = false;
13122 		fault.address = gva;
13123 		fault.async_page_fault = false;
13124 	}
13125 	vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
13126 }
13127 EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
13128 
13129 /*
13130  * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
13131  * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value
13132  * indicates whether exit to userspace is needed.
13133  */
13134 int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
13135 			      struct x86_exception *e)
13136 {
13137 	if (r == X86EMUL_PROPAGATE_FAULT) {
13138 		if (KVM_BUG_ON(!e, vcpu->kvm))
13139 			return -EIO;
13140 
13141 		kvm_inject_emulated_page_fault(vcpu, e);
13142 		return 1;
13143 	}
13144 
13145 	/*
13146 	 * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
13147 	 * while handling a VMX instruction KVM could've handled the request
13148 	 * correctly by exiting to userspace and performing I/O but there
13149 	 * doesn't seem to be a real use-case behind such requests, just return
13150 	 * KVM_EXIT_INTERNAL_ERROR for now.
13151 	 */
13152 	kvm_prepare_emulation_failure_exit(vcpu);
13153 
13154 	return 0;
13155 }
13156 EXPORT_SYMBOL_GPL(kvm_handle_memory_failure);
13157 
13158 int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
13159 {
13160 	bool pcid_enabled;
13161 	struct x86_exception e;
13162 	struct {
13163 		u64 pcid;
13164 		u64 gla;
13165 	} operand;
13166 	int r;
13167 
13168 	r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
13169 	if (r != X86EMUL_CONTINUE)
13170 		return kvm_handle_memory_failure(vcpu, r, &e);
13171 
13172 	if (operand.pcid >> 12 != 0) {
13173 		kvm_inject_gp(vcpu, 0);
13174 		return 1;
13175 	}
13176 
13177 	pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
13178 
13179 	switch (type) {
13180 	case INVPCID_TYPE_INDIV_ADDR:
13181 		if ((!pcid_enabled && (operand.pcid != 0)) ||
13182 		    is_noncanonical_address(operand.gla, vcpu)) {
13183 			kvm_inject_gp(vcpu, 0);
13184 			return 1;
13185 		}
13186 		kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
13187 		return kvm_skip_emulated_instruction(vcpu);
13188 
13189 	case INVPCID_TYPE_SINGLE_CTXT:
13190 		if (!pcid_enabled && (operand.pcid != 0)) {
13191 			kvm_inject_gp(vcpu, 0);
13192 			return 1;
13193 		}
13194 
13195 		kvm_invalidate_pcid(vcpu, operand.pcid);
13196 		return kvm_skip_emulated_instruction(vcpu);
13197 
13198 	case INVPCID_TYPE_ALL_NON_GLOBAL:
13199 		/*
13200 		 * Currently, KVM doesn't mark global entries in the shadow
13201 		 * page tables, so a non-global flush just degenerates to a
13202 		 * global flush. If needed, we could optimize this later by
13203 		 * keeping track of global entries in shadow page tables.
13204 		 */
13205 
13206 		fallthrough;
13207 	case INVPCID_TYPE_ALL_INCL_GLOBAL:
13208 		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
13209 		return kvm_skip_emulated_instruction(vcpu);
13210 
13211 	default:
13212 		kvm_inject_gp(vcpu, 0);
13213 		return 1;
13214 	}
13215 }
13216 EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
13217 
13218 static int complete_sev_es_emulated_mmio(struct kvm_vcpu *vcpu)
13219 {
13220 	struct kvm_run *run = vcpu->run;
13221 	struct kvm_mmio_fragment *frag;
13222 	unsigned int len;
13223 
13224 	BUG_ON(!vcpu->mmio_needed);
13225 
13226 	/* Complete previous fragment */
13227 	frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
13228 	len = min(8u, frag->len);
13229 	if (!vcpu->mmio_is_write)
13230 		memcpy(frag->data, run->mmio.data, len);
13231 
13232 	if (frag->len <= 8) {
13233 		/* Switch to the next fragment. */
13234 		frag++;
13235 		vcpu->mmio_cur_fragment++;
13236 	} else {
13237 		/* Go forward to the next mmio piece. */
13238 		frag->data += len;
13239 		frag->gpa += len;
13240 		frag->len -= len;
13241 	}
13242 
13243 	if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
13244 		vcpu->mmio_needed = 0;
13245 
13246 		// VMG change, at this point, we're always done
13247 		// RIP has already been advanced
13248 		return 1;
13249 	}
13250 
13251 	// More MMIO is needed
13252 	run->mmio.phys_addr = frag->gpa;
13253 	run->mmio.len = min(8u, frag->len);
13254 	run->mmio.is_write = vcpu->mmio_is_write;
13255 	if (run->mmio.is_write)
13256 		memcpy(run->mmio.data, frag->data, min(8u, frag->len));
13257 	run->exit_reason = KVM_EXIT_MMIO;
13258 
13259 	vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
13260 
13261 	return 0;
13262 }
13263 
13264 int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
13265 			  void *data)
13266 {
13267 	int handled;
13268 	struct kvm_mmio_fragment *frag;
13269 
13270 	if (!data)
13271 		return -EINVAL;
13272 
13273 	handled = write_emultor.read_write_mmio(vcpu, gpa, bytes, data);
13274 	if (handled == bytes)
13275 		return 1;
13276 
13277 	bytes -= handled;
13278 	gpa += handled;
13279 	data += handled;
13280 
13281 	/*TODO: Check if need to increment number of frags */
13282 	frag = vcpu->mmio_fragments;
13283 	vcpu->mmio_nr_fragments = 1;
13284 	frag->len = bytes;
13285 	frag->gpa = gpa;
13286 	frag->data = data;
13287 
13288 	vcpu->mmio_needed = 1;
13289 	vcpu->mmio_cur_fragment = 0;
13290 
13291 	vcpu->run->mmio.phys_addr = gpa;
13292 	vcpu->run->mmio.len = min(8u, frag->len);
13293 	vcpu->run->mmio.is_write = 1;
13294 	memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
13295 	vcpu->run->exit_reason = KVM_EXIT_MMIO;
13296 
13297 	vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
13298 
13299 	return 0;
13300 }
13301 EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_write);
13302 
13303 int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
13304 			 void *data)
13305 {
13306 	int handled;
13307 	struct kvm_mmio_fragment *frag;
13308 
13309 	if (!data)
13310 		return -EINVAL;
13311 
13312 	handled = read_emultor.read_write_mmio(vcpu, gpa, bytes, data);
13313 	if (handled == bytes)
13314 		return 1;
13315 
13316 	bytes -= handled;
13317 	gpa += handled;
13318 	data += handled;
13319 
13320 	/*TODO: Check if need to increment number of frags */
13321 	frag = vcpu->mmio_fragments;
13322 	vcpu->mmio_nr_fragments = 1;
13323 	frag->len = bytes;
13324 	frag->gpa = gpa;
13325 	frag->data = data;
13326 
13327 	vcpu->mmio_needed = 1;
13328 	vcpu->mmio_cur_fragment = 0;
13329 
13330 	vcpu->run->mmio.phys_addr = gpa;
13331 	vcpu->run->mmio.len = min(8u, frag->len);
13332 	vcpu->run->mmio.is_write = 0;
13333 	vcpu->run->exit_reason = KVM_EXIT_MMIO;
13334 
13335 	vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;
13336 
13337 	return 0;
13338 }
13339 EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read);
13340 
13341 static void advance_sev_es_emulated_pio(struct kvm_vcpu *vcpu, unsigned count, int size)
13342 {
13343 	vcpu->arch.sev_pio_count -= count;
13344 	vcpu->arch.sev_pio_data += count * size;
13345 }
13346 
13347 static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
13348 			   unsigned int port);
13349 
13350 static int complete_sev_es_emulated_outs(struct kvm_vcpu *vcpu)
13351 {
13352 	int size = vcpu->arch.pio.size;
13353 	int port = vcpu->arch.pio.port;
13354 
13355 	vcpu->arch.pio.count = 0;
13356 	if (vcpu->arch.sev_pio_count)
13357 		return kvm_sev_es_outs(vcpu, size, port);
13358 	return 1;
13359 }
13360 
13361 static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
13362 			   unsigned int port)
13363 {
13364 	for (;;) {
13365 		unsigned int count =
13366 			min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
13367 		int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count);
13368 
13369 		/* memcpy done already by emulator_pio_out.  */
13370 		advance_sev_es_emulated_pio(vcpu, count, size);
13371 		if (!ret)
13372 			break;
13373 
13374 		/* Emulation done by the kernel.  */
13375 		if (!vcpu->arch.sev_pio_count)
13376 			return 1;
13377 	}
13378 
13379 	vcpu->arch.complete_userspace_io = complete_sev_es_emulated_outs;
13380 	return 0;
13381 }
13382 
13383 static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
13384 			  unsigned int port);
13385 
13386 static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
13387 {
13388 	unsigned count = vcpu->arch.pio.count;
13389 	int size = vcpu->arch.pio.size;
13390 	int port = vcpu->arch.pio.port;
13391 
13392 	complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data);
13393 	advance_sev_es_emulated_pio(vcpu, count, size);
13394 	if (vcpu->arch.sev_pio_count)
13395 		return kvm_sev_es_ins(vcpu, size, port);
13396 	return 1;
13397 }
13398 
13399 static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
13400 			  unsigned int port)
13401 {
13402 	for (;;) {
13403 		unsigned int count =
13404 			min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
13405 		if (!emulator_pio_in(vcpu, size, port, vcpu->arch.sev_pio_data, count))
13406 			break;
13407 
13408 		/* Emulation done by the kernel.  */
13409 		advance_sev_es_emulated_pio(vcpu, count, size);
13410 		if (!vcpu->arch.sev_pio_count)
13411 			return 1;
13412 	}
13413 
13414 	vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins;
13415 	return 0;
13416 }
13417 
13418 int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
13419 			 unsigned int port, void *data,  unsigned int count,
13420 			 int in)
13421 {
13422 	vcpu->arch.sev_pio_data = data;
13423 	vcpu->arch.sev_pio_count = count;
13424 	return in ? kvm_sev_es_ins(vcpu, size, port)
13425 		  : kvm_sev_es_outs(vcpu, size, port);
13426 }
13427 EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);
13428 
13429 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry);
13430 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
13431 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
13432 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
13433 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
13434 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
13435 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
13436 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter);
13437 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
13438 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
13439 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
13440 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed);
13441 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
13442 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
13443 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
13444 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
13445 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
13446 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
13447 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
13448 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
13449 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
13450 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
13451 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_kick_vcpu_slowpath);
13452 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_doorbell);
13453 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq);
13454 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
13455 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
13456 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
13457 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);
13458 
13459 static int __init kvm_x86_init(void)
13460 {
13461 	kvm_mmu_x86_module_init();
13462 	return 0;
13463 }
13464 module_init(kvm_x86_init);
13465 
13466 static void __exit kvm_x86_exit(void)
13467 {
13468 	/*
13469 	 * If module_init() is implemented, module_exit() must also be
13470 	 * implemented to allow module unload.
13471 	 */
13472 }
13473 module_exit(kvm_x86_exit);
13474