xref: /openbmc/linux/arch/x86/kvm/vmx/nested.c (revision e7253313)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include <linux/frame.h>
4 #include <linux/percpu.h>
5 
6 #include <asm/debugreg.h>
7 #include <asm/mmu_context.h>
8 
9 #include "cpuid.h"
10 #include "hyperv.h"
11 #include "mmu.h"
12 #include "nested.h"
13 #include "pmu.h"
14 #include "trace.h"
15 #include "x86.h"
16 
/*
 * Module parameter, enabled by default and exported with read-only
 * permissions (S_IRUGO).  Within this file it gates the shadow-VMCS
 * teardown performed by free_nested().
 */
static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

/*
 * Module parameter, disabled by default and exported with read-only
 * permissions (S_IRUGO).  Its consumers are outside this part of the file.
 */
static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);
22 
/*
 * CC() wraps a nested VM-entry consistency check.  The expression is
 * evaluated exactly once; when it is true (i.e. the check failed) the
 * kvm_nested_vmenter_failed tracepoint is emitted with the stringified
 * expression.  The macro yields the boolean result, so it can be used
 * directly inside an if-condition.
 */
#define CC(consistency_check)						\
({									\
	bool failed = (consistency_check);				\
	if (failed)							\
		trace_kvm_nested_vmenter_failed(#consistency_check, 0);	\
	failed;								\
})
30 
/*
 * Write an MSR through kvm_set_msr() and emit a rate-limited warning naming
 * the calling function, the MSR index and the value if the write is
 * rejected.  Yields true on failure, false on success.
 *
 * Note: idx and data are expanded a second time on the failure path, so
 * callers should not pass expressions with side effects.
 */
#define SET_MSR_OR_WARN(vcpu, idx, data)				\
({									\
	bool failed = kvm_set_msr(vcpu, idx, data);			\
	if (failed)							\
		pr_warn_ratelimited(					\
				"%s cannot write MSR (0x%x, 0x%llx)\n",	\
				__func__, idx, data);			\
	failed;								\
})
40 
/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

/*
 * NOTE(review): presumably the preemption-timer rate (as a shift relative to
 * the TSC) advertised to L1 for the emulated timer; the consumers are
 * outside this part of the file, confirm there.
 */
#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
52 
/* Indices into vmx_bitmap[]; VMX_BITMAP_NR is the number of bitmaps. */
enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};
/*
 * One bitmap pointer per index above.  init_vmcs_shadow_fields() treats each
 * bitmap as PAGE_SIZE bytes; the allocation is outside this part of the file.
 */
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

/* Convenience aliases for the two entries of vmx_bitmap[]. */
#define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])
62 
/* One shadowed VMCS field: its encoding and its location in struct vmcs12. */
struct shadow_vmcs_field {
	u16	encoding;	/* VMCS field encoding */
	u16	offset;		/* byte offset of the field in struct vmcs12 */
};

/*
 * Table of read-only shadowed fields.  The entries are generated by
 * defining SHADOW_FIELD_RO() and including vmcs_shadow_fields.h, which is
 * expected to expand the macro once per field (and presumably to supply an
 * empty default for, and undefine, the macros it uses - verify in the
 * header).
 */
static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
/* Number of valid entries; shrunk by init_vmcs_shadow_fields(). */
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

/* Table of read/write shadowed fields, generated the same way. */
static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
/* Number of valid entries; shrunk by init_vmcs_shadow_fields(). */
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);
80 
/*
 * Build the VMREAD/VMWRITE bitmaps from the shadow field tables and compact
 * the tables in place.
 *
 * Both bitmaps start out with every bit set; the bit of each shadowed field
 * is then cleared (read-only fields in the VMREAD bitmap only, read/write
 * fields in both).  Presumably a set bit makes the corresponding
 * VMREAD/VMWRITE exit to L0 - confirm against the SDM.
 *
 * While walking the tables, entries are rewritten through index 'j' so that
 * skipped entries are dropped, and max_shadow_*_fields is updated to the new
 * length.
 */
static void init_vmcs_shadow_fields(void)
{
	int i, j;

	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
		u16 field = entry.encoding;

		/*
		 * A 64-bit wide field must be followed by the entry for its
		 * companion encoding (field + 1); complain if it is missing.
		 */
		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_only_fields ||
		     shadow_read_only_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_only_field %x\n",
			       field + 1);

		clear_bit(field, vmx_vmread_bitmap);
		/*
		 * Odd encodings: on 64-bit builds the entry is dropped from
		 * the table (its bitmap bit is still cleared above); on
		 * 32-bit builds it is kept, with the offset advanced by
		 * sizeof(u32).
		 */
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_only_fields[j++] = entry;
	}
	max_shadow_read_only_fields = j;

	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
		u16 field = entry.encoding;

		/* Same companion-entry sanity check as for the RO table. */
		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_write_fields ||
		     shadow_read_write_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_write_field %x\n",
			       field + 1);

		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

		/*
		 * PML and the preemption timer can be emulated, but the
		 * processor cannot vmwrite to fields that don't exist
		 * on bare metal.
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		case GUEST_INTR_STATUS:
			if (!cpu_has_vmx_apicv())
				continue;
			break;
		default:
			break;
		}

		/* Read/write fields are shadowed for both VMREAD and VMWRITE. */
		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		/* Odd encodings are handled exactly as in the RO loop. */
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}
157 
158 /*
159  * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
160  * set the success or error code of an emulated VMX instruction (as specified
161  * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
162  * instruction.
163  */
164 static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
165 {
166 	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
167 			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
168 			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
169 	return kvm_skip_emulated_instruction(vcpu);
170 }
171 
172 static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
173 {
174 	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
175 			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
176 			    X86_EFLAGS_SF | X86_EFLAGS_OF))
177 			| X86_EFLAGS_CF);
178 	return kvm_skip_emulated_instruction(vcpu);
179 }
180 
181 static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
182 				u32 vm_instruction_error)
183 {
184 	struct vcpu_vmx *vmx = to_vmx(vcpu);
185 
186 	/*
187 	 * failValid writes the error number to the current VMCS, which
188 	 * can't be done if there isn't a current VMCS.
189 	 */
190 	if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
191 		return nested_vmx_failInvalid(vcpu);
192 
193 	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
194 			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
195 			    X86_EFLAGS_SF | X86_EFLAGS_OF))
196 			| X86_EFLAGS_ZF);
197 	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
198 	/*
199 	 * We don't need to force a shadow sync because
200 	 * VM_INSTRUCTION_ERROR is not shadowed
201 	 */
202 	return kvm_skip_emulated_instruction(vcpu);
203 }
204 
/*
 * Emulate a VMX abort.  The abort is currently modelled by requesting a
 * triple fault on the vCPU; the abort indicator is only reported through a
 * rate-limited debug message and is not stored anywhere by this function.
 */
static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: not to reset guest simply here. */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
}
211 
212 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
213 {
214 	return fixed_bits_valid(control, low, high);
215 }
216 
217 static inline u64 vmx_control_msr(u32 low, u32 high)
218 {
219 	return low | ((u64)high << 32);
220 }
221 
222 static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
223 {
224 	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
225 	vmcs_write64(VMCS_LINK_POINTER, -1ull);
226 	vmx->nested.need_vmcs12_to_shadow_sync = false;
227 }
228 
229 static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
230 {
231 	struct vcpu_vmx *vmx = to_vmx(vcpu);
232 
233 	if (!vmx->nested.hv_evmcs)
234 		return;
235 
236 	kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
237 	vmx->nested.hv_evmcs_vmptr = -1ull;
238 	vmx->nested.hv_evmcs = NULL;
239 }
240 
/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 *
 * Does nothing unless VMXON is recorded either directly or in the saved SMM
 * state.
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
		return;

	/* A pending request to fetch vmcs12 pages is moot from here on. */
	kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);

	/* Reset the VMX-operation state and release the vmcs02 VPID. */
	vmx->nested.vmxon = false;
	vmx->nested.smm.vmxon = false;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = -1ull;
	/* Stop shadowing first, then clear and free vmcs01's shadow VMCS. */
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	/* Drop the cached copies of vmcs12 and of the shadow vmcs12. */
	kfree(vmx->nested.cached_vmcs12);
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
	vmx->nested.cached_shadow_vmcs12 = NULL;
	/* Unpin physical memory we referred to in the vmcs02 */
	if (vmx->nested.apic_access_page) {
		kvm_release_page_clean(vmx->nested.apic_access_page);
		vmx->nested.apic_access_page = NULL;
	}
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
	vmx->nested.pi_desc = NULL;

	/* Release all roots of the MMU context used for nested EPT. */
	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	nested_release_evmcs(vcpu);

	free_loaded_vmcs(&vmx->nested.vmcs02);
}
284 
285 static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
286 				     struct loaded_vmcs *prev)
287 {
288 	struct vmcs_host_state *dest, *src;
289 
290 	if (unlikely(!vmx->guest_state_loaded))
291 		return;
292 
293 	src = &prev->host_state;
294 	dest = &vmx->loaded_vmcs->host_state;
295 
296 	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
297 	dest->ldt_sel = src->ldt_sel;
298 #ifdef CONFIG_X86_64
299 	dest->ds_sel = src->ds_sel;
300 	dest->es_sel = src->es_sel;
301 #endif
302 }
303 
/*
 * Make @vmcs the loaded VMCS of @vcpu.  No-op if it already is.
 *
 * The switch itself runs between get_cpu() and put_cpu(), so the CPU number
 * handed to vmx_vcpu_load_vmcs() is the one the code is executing on.
 */
static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev;
	int cpu;

	if (vmx->loaded_vmcs == vmcs)
		return;

	cpu = get_cpu();
	prev = vmx->loaded_vmcs;
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu);
	/* Propagate the cached host state from the VMCS just replaced. */
	vmx_sync_vmcs_host_state(vmx, prev);
	put_cpu();

	/* Cached segment values belong to the previous VMCS; discard them. */
	vmx_segment_cache_clear(vmx);
}
322 
/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
 *
 * The whole sequence runs between vcpu_load() and vcpu_put(): leave nested
 * operation, switch back to vmcs01, then release the nested state.
 */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	vmx_leave_nested(vcpu);
	vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
	free_nested(vcpu);
	vcpu_put(vcpu);
}
335 
336 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
337 		struct x86_exception *fault)
338 {
339 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
340 	struct vcpu_vmx *vmx = to_vmx(vcpu);
341 	u32 exit_reason;
342 	unsigned long exit_qualification = vcpu->arch.exit_qualification;
343 
344 	if (vmx->nested.pml_full) {
345 		exit_reason = EXIT_REASON_PML_FULL;
346 		vmx->nested.pml_full = false;
347 		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
348 	} else if (fault->error_code & PFERR_RSVD_MASK)
349 		exit_reason = EXIT_REASON_EPT_MISCONFIG;
350 	else
351 		exit_reason = EXIT_REASON_EPT_VIOLATION;
352 
353 	nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
354 	vmcs12->guest_physical_address = fault->address;
355 }
356 
/*
 * Switch the vCPU to the MMU context used for nested EPT.
 *
 * vcpu->arch.mmu is pointed at guest_mmu and initialised as a shadow EPT
 * MMU, its callbacks are overridden with the nested-EPT variants, and
 * walk_mmu is pointed at nested_mmu.  Warns if the vCPU already reports a
 * nested MMU.
 */
static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	/*
	 * Execute-only support is taken from the EPT capabilities exposed to
	 * L1; the remaining arguments come from the nested_ept_*() helpers.
	 */
	kvm_init_shadow_ept_mmu(vcpu,
			to_vmx(vcpu)->nested.msrs.ept_caps &
			VMX_EPT_EXECUTE_ONLY_BIT,
			nested_ept_ad_enabled(vcpu),
			nested_ept_get_cr3(vcpu));
	vcpu->arch.mmu->set_cr3           = vmx_set_cr3;
	vcpu->arch.mmu->get_cr3           = nested_ept_get_cr3;
	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
	vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;

	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
}
374 
375 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
376 {
377 	vcpu->arch.mmu = &vcpu->arch.root_mmu;
378 	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
379 }
380 
381 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
382 					    u16 error_code)
383 {
384 	bool inequality, bit;
385 
386 	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
387 	inequality =
388 		(error_code & vmcs12->page_fault_error_code_mask) !=
389 		 vmcs12->page_fault_error_code_match;
390 	return inequality ^ bit;
391 }
392 
393 
394 /*
395  * KVM wants to inject page-faults which it got to the guest. This function
396  * checks whether in a nested guest, we need to inject them to L1 or L2.
397  */
398 static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
399 {
400 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
401 	unsigned int nr = vcpu->arch.exception.nr;
402 	bool has_payload = vcpu->arch.exception.has_payload;
403 	unsigned long payload = vcpu->arch.exception.payload;
404 
405 	if (nr == PF_VECTOR) {
406 		if (vcpu->arch.exception.nested_apf) {
407 			*exit_qual = vcpu->arch.apf.nested_apf_token;
408 			return 1;
409 		}
410 		if (nested_vmx_is_page_fault_vmexit(vmcs12,
411 						    vcpu->arch.exception.error_code)) {
412 			*exit_qual = has_payload ? payload : vcpu->arch.cr2;
413 			return 1;
414 		}
415 	} else if (vmcs12->exception_bitmap & (1u << nr)) {
416 		if (nr == DB_VECTOR) {
417 			if (!has_payload) {
418 				payload = vcpu->arch.dr6;
419 				payload &= ~(DR6_FIXED_1 | DR6_BT);
420 				payload ^= DR6_RTM;
421 			}
422 			*exit_qual = payload;
423 		} else
424 			*exit_qual = 0;
425 		return 1;
426 	}
427 
428 	return 0;
429 }
430 
431 
432 static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
433 		struct x86_exception *fault)
434 {
435 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
436 
437 	WARN_ON(!is_guest_mode(vcpu));
438 
439 	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
440 		!to_vmx(vcpu)->nested.nested_run_pending) {
441 		vmcs12->vm_exit_intr_error_code = fault->error_code;
442 		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
443 				  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
444 				  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
445 				  fault->address);
446 	} else {
447 		kvm_inject_page_fault(vcpu, fault);
448 	}
449 }
450 
451 static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
452 {
453 	return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
454 }
455 
456 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
457 					       struct vmcs12 *vmcs12)
458 {
459 	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
460 		return 0;
461 
462 	if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
463 	    CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
464 		return -EINVAL;
465 
466 	return 0;
467 }
468 
469 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
470 						struct vmcs12 *vmcs12)
471 {
472 	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
473 		return 0;
474 
475 	if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
476 		return -EINVAL;
477 
478 	return 0;
479 }
480 
481 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
482 						struct vmcs12 *vmcs12)
483 {
484 	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
485 		return 0;
486 
487 	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
488 		return -EINVAL;
489 
490 	return 0;
491 }
492 
493 /*
494  * Check if MSR is intercepted for L01 MSR bitmap.
495  */
496 static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
497 {
498 	unsigned long *msr_bitmap;
499 	int f = sizeof(unsigned long);
500 
501 	if (!cpu_has_vmx_msr_bitmap())
502 		return true;
503 
504 	msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
505 
506 	if (msr <= 0x1fff) {
507 		return !!test_bit(msr, msr_bitmap + 0x800 / f);
508 	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
509 		msr &= 0x1fff;
510 		return !!test_bit(msr, msr_bitmap + 0xc00 / f);
511 	}
512 
513 	return true;
514 }
515 
516 /*
517  * If a msr is allowed by L0, we should check whether it is allowed by L1.
518  * The corresponding bit will be cleared unless both of L0 and L1 allow it.
519  */
520 static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
521 					       unsigned long *msr_bitmap_nested,
522 					       u32 msr, int type)
523 {
524 	int f = sizeof(unsigned long);
525 
526 	/*
527 	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
528 	 * have the write-low and read-high bitmap offsets the wrong way round.
529 	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
530 	 */
531 	if (msr <= 0x1fff) {
532 		if (type & MSR_TYPE_R &&
533 		   !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
534 			/* read-low */
535 			__clear_bit(msr, msr_bitmap_nested + 0x000 / f);
536 
537 		if (type & MSR_TYPE_W &&
538 		   !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
539 			/* write-low */
540 			__clear_bit(msr, msr_bitmap_nested + 0x800 / f);
541 
542 	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
543 		msr &= 0x1fff;
544 		if (type & MSR_TYPE_R &&
545 		   !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
546 			/* read-high */
547 			__clear_bit(msr, msr_bitmap_nested + 0x400 / f);
548 
549 		if (type & MSR_TYPE_W &&
550 		   !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
551 			/* write-high */
552 			__clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
553 
554 	}
555 }
556 
557 static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) {
558 	int msr;
559 
560 	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
561 		unsigned word = msr / BITS_PER_LONG;
562 
563 		msr_bitmap[word] = ~0;
564 		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
565 	}
566 }
567 
568 /*
569  * Merge L0's and L1's MSR bitmap, return false to indicate that
570  * we do not use the hardware.
571  */
572 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
573 						 struct vmcs12 *vmcs12)
574 {
575 	int msr;
576 	unsigned long *msr_bitmap_l1;
577 	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
578 	struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;
579 
580 	/* Nothing to do if the MSR bitmap is not in use.  */
581 	if (!cpu_has_vmx_msr_bitmap() ||
582 	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
583 		return false;
584 
585 	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
586 		return false;
587 
588 	msr_bitmap_l1 = (unsigned long *)map->hva;
589 
590 	/*
591 	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
592 	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
593 	 * the x2APIC MSR range and selectively disable them below.
594 	 */
595 	enable_x2apic_msr_intercepts(msr_bitmap_l0);
596 
597 	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
598 		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
599 			/*
600 			 * L0 need not intercept reads for MSRs between 0x800
601 			 * and 0x8ff, it just lets the processor take the value
602 			 * from the virtual-APIC page; take those 256 bits
603 			 * directly from the L1 bitmap.
604 			 */
605 			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
606 				unsigned word = msr / BITS_PER_LONG;
607 
608 				msr_bitmap_l0[word] = msr_bitmap_l1[word];
609 			}
610 		}
611 
612 		nested_vmx_disable_intercept_for_msr(
613 			msr_bitmap_l1, msr_bitmap_l0,
614 			X2APIC_MSR(APIC_TASKPRI),
615 			MSR_TYPE_R | MSR_TYPE_W);
616 
617 		if (nested_cpu_has_vid(vmcs12)) {
618 			nested_vmx_disable_intercept_for_msr(
619 				msr_bitmap_l1, msr_bitmap_l0,
620 				X2APIC_MSR(APIC_EOI),
621 				MSR_TYPE_W);
622 			nested_vmx_disable_intercept_for_msr(
623 				msr_bitmap_l1, msr_bitmap_l0,
624 				X2APIC_MSR(APIC_SELF_IPI),
625 				MSR_TYPE_W);
626 		}
627 	}
628 
629 	/* KVM unconditionally exposes the FS/GS base MSRs to L1. */
630 	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
631 					     MSR_FS_BASE, MSR_TYPE_RW);
632 
633 	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
634 					     MSR_GS_BASE, MSR_TYPE_RW);
635 
636 	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
637 					     MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
638 
639 	/*
640 	 * Checking the L0->L1 bitmap is trying to verify two things:
641 	 *
642 	 * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
643 	 *    ensures that we do not accidentally generate an L02 MSR bitmap
644 	 *    from the L12 MSR bitmap that is too permissive.
645 	 * 2. That L1 or L2s have actually used the MSR. This avoids
646 	 *    unnecessarily merging of the bitmap if the MSR is unused. This
647 	 *    works properly because we only update the L01 MSR bitmap lazily.
648 	 *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
649 	 *    updated to reflect this when L1 (or its L2s) actually write to
650 	 *    the MSR.
651 	 */
652 	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
653 		nested_vmx_disable_intercept_for_msr(
654 					msr_bitmap_l1, msr_bitmap_l0,
655 					MSR_IA32_SPEC_CTRL,
656 					MSR_TYPE_R | MSR_TYPE_W);
657 
658 	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
659 		nested_vmx_disable_intercept_for_msr(
660 					msr_bitmap_l1, msr_bitmap_l0,
661 					MSR_IA32_PRED_CMD,
662 					MSR_TYPE_W);
663 
664 	kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);
665 
666 	return true;
667 }
668 
669 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
670 				       struct vmcs12 *vmcs12)
671 {
672 	struct kvm_host_map map;
673 	struct vmcs12 *shadow;
674 
675 	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
676 	    vmcs12->vmcs_link_pointer == -1ull)
677 		return;
678 
679 	shadow = get_shadow_vmcs12(vcpu);
680 
681 	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
682 		return;
683 
684 	memcpy(shadow, map.hva, VMCS12_SIZE);
685 	kvm_vcpu_unmap(vcpu, &map, false);
686 }
687 
688 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
689 					      struct vmcs12 *vmcs12)
690 {
691 	struct vcpu_vmx *vmx = to_vmx(vcpu);
692 
693 	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
694 	    vmcs12->vmcs_link_pointer == -1ull)
695 		return;
696 
697 	kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
698 			get_shadow_vmcs12(vcpu), VMCS12_SIZE);
699 }
700 
701 /*
702  * In nested virtualization, check if L1 has set
703  * VM_EXIT_ACK_INTR_ON_EXIT
704  */
705 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
706 {
707 	return get_vmcs12(vcpu)->vm_exit_controls &
708 		VM_EXIT_ACK_INTR_ON_EXIT;
709 }
710 
711 static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
712 {
713 	return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
714 }
715 
716 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
717 					  struct vmcs12 *vmcs12)
718 {
719 	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
720 	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
721 		return -EINVAL;
722 	else
723 		return 0;
724 }
725 
726 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
727 					   struct vmcs12 *vmcs12)
728 {
729 	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
730 	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
731 	    !nested_cpu_has_vid(vmcs12) &&
732 	    !nested_cpu_has_posted_intr(vmcs12))
733 		return 0;
734 
735 	/*
736 	 * If virtualize x2apic mode is enabled,
737 	 * virtualize apic access must be disabled.
738 	 */
739 	if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
740 	       nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
741 		return -EINVAL;
742 
743 	/*
744 	 * If virtual interrupt delivery is enabled,
745 	 * we must exit on external interrupts.
746 	 */
747 	if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
748 		return -EINVAL;
749 
750 	/*
751 	 * bits 15:8 should be zero in posted_intr_nv,
752 	 * the descriptor address has been already checked
753 	 * in nested_get_vmcs12_pages.
754 	 *
755 	 * bits 5:0 of posted_intr_desc_addr should be zero.
756 	 */
757 	if (nested_cpu_has_posted_intr(vmcs12) &&
758 	   (CC(!nested_cpu_has_vid(vmcs12)) ||
759 	    CC(!nested_exit_intr_ack_set(vcpu)) ||
760 	    CC((vmcs12->posted_intr_nv & 0xff00)) ||
761 	    CC((vmcs12->posted_intr_desc_addr & 0x3f)) ||
762 	    CC((vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu)))))
763 		return -EINVAL;
764 
765 	/* tpr shadow is needed by all apicv features. */
766 	if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
767 		return -EINVAL;
768 
769 	return 0;
770 }
771 
772 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
773 				       u32 count, u64 addr)
774 {
775 	int maxphyaddr;
776 
777 	if (count == 0)
778 		return 0;
779 	maxphyaddr = cpuid_maxphyaddr(vcpu);
780 	if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
781 	    (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr)
782 		return -EINVAL;
783 
784 	return 0;
785 }
786 
787 static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
788 						     struct vmcs12 *vmcs12)
789 {
790 	if (CC(nested_vmx_check_msr_switch(vcpu,
791 					   vmcs12->vm_exit_msr_load_count,
792 					   vmcs12->vm_exit_msr_load_addr)) ||
793 	    CC(nested_vmx_check_msr_switch(vcpu,
794 					   vmcs12->vm_exit_msr_store_count,
795 					   vmcs12->vm_exit_msr_store_addr)))
796 		return -EINVAL;
797 
798 	return 0;
799 }
800 
801 static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
802                                                       struct vmcs12 *vmcs12)
803 {
804 	if (CC(nested_vmx_check_msr_switch(vcpu,
805 					   vmcs12->vm_entry_msr_load_count,
806 					   vmcs12->vm_entry_msr_load_addr)))
807                 return -EINVAL;
808 
809 	return 0;
810 }
811 
812 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
813 					 struct vmcs12 *vmcs12)
814 {
815 	if (!nested_cpu_has_pml(vmcs12))
816 		return 0;
817 
818 	if (CC(!nested_cpu_has_ept(vmcs12)) ||
819 	    CC(!page_address_valid(vcpu, vmcs12->pml_address)))
820 		return -EINVAL;
821 
822 	return 0;
823 }
824 
825 static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
826 							struct vmcs12 *vmcs12)
827 {
828 	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
829 	       !nested_cpu_has_ept(vmcs12)))
830 		return -EINVAL;
831 	return 0;
832 }
833 
834 static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
835 							 struct vmcs12 *vmcs12)
836 {
837 	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
838 	       !nested_cpu_has_ept(vmcs12)))
839 		return -EINVAL;
840 	return 0;
841 }
842 
843 static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
844 						 struct vmcs12 *vmcs12)
845 {
846 	if (!nested_cpu_has_shadow_vmcs(vmcs12))
847 		return 0;
848 
849 	if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
850 	    CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
851 		return -EINVAL;
852 
853 	return 0;
854 }
855 
856 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
857 				       struct vmx_msr_entry *e)
858 {
859 	/* x2APIC MSR accesses are not allowed */
860 	if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
861 		return -EINVAL;
862 	if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
863 	    CC(e->index == MSR_IA32_UCODE_REV))
864 		return -EINVAL;
865 	if (CC(e->reserved != 0))
866 		return -EINVAL;
867 	return 0;
868 }
869 
870 static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
871 				     struct vmx_msr_entry *e)
872 {
873 	if (CC(e->index == MSR_FS_BASE) ||
874 	    CC(e->index == MSR_GS_BASE) ||
875 	    CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
876 	    nested_vmx_msr_check_common(vcpu, e))
877 		return -EINVAL;
878 	return 0;
879 }
880 
881 static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
882 				      struct vmx_msr_entry *e)
883 {
884 	if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
885 	    nested_vmx_msr_check_common(vcpu, e))
886 		return -EINVAL;
887 	return 0;
888 }
889 
890 static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
891 {
892 	struct vcpu_vmx *vmx = to_vmx(vcpu);
893 	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
894 				       vmx->nested.msrs.misc_high);
895 
896 	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
897 }
898 
899 /*
900  * Load guest's/host's msr at nested entry/exit.
901  * return 0 for success, entry index for failure.
902  *
903  * One of the failure modes for MSR load/store is when a list exceeds the
904  * virtual hardware's capacity. To maintain compatibility with hardware inasmuch
905  * as possible, process all valid entries before failing rather than precheck
906  * for a capacity violation.
907  */
908 static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
909 {
910 	u32 i;
911 	struct vmx_msr_entry e;
912 	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
913 
914 	for (i = 0; i < count; i++) {
915 		if (unlikely(i >= max_msr_list_size))
916 			goto fail;
917 
918 		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
919 					&e, sizeof(e))) {
920 			pr_debug_ratelimited(
921 				"%s cannot read MSR entry (%u, 0x%08llx)\n",
922 				__func__, i, gpa + i * sizeof(e));
923 			goto fail;
924 		}
925 		if (nested_vmx_load_msr_check(vcpu, &e)) {
926 			pr_debug_ratelimited(
927 				"%s check failed (%u, 0x%x, 0x%x)\n",
928 				__func__, i, e.index, e.reserved);
929 			goto fail;
930 		}
931 		if (kvm_set_msr(vcpu, e.index, e.value)) {
932 			pr_debug_ratelimited(
933 				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
934 				__func__, i, e.index, e.value);
935 			goto fail;
936 		}
937 	}
938 	return 0;
939 fail:
940 	return i + 1;
941 }
942 
943 static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
944 					    u32 msr_index,
945 					    u64 *data)
946 {
947 	struct vcpu_vmx *vmx = to_vmx(vcpu);
948 
949 	/*
950 	 * If the L0 hypervisor stored a more accurate value for the TSC that
951 	 * does not include the time taken for emulation of the L2->L1
952 	 * VM-exit in L0, use the more accurate value.
953 	 */
954 	if (msr_index == MSR_IA32_TSC) {
955 		int index = vmx_find_msr_index(&vmx->msr_autostore.guest,
956 					       MSR_IA32_TSC);
957 
958 		if (index >= 0) {
959 			u64 val = vmx->msr_autostore.guest.val[index].value;
960 
961 			*data = kvm_read_l1_tsc(vcpu, val);
962 			return true;
963 		}
964 	}
965 
966 	if (kvm_get_msr(vcpu, msr_index, data)) {
967 		pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
968 			msr_index);
969 		return false;
970 	}
971 	return true;
972 }
973 
974 static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
975 				     struct vmx_msr_entry *e)
976 {
977 	if (kvm_vcpu_read_guest(vcpu,
978 				gpa + i * sizeof(*e),
979 				e, 2 * sizeof(u32))) {
980 		pr_debug_ratelimited(
981 			"%s cannot read MSR entry (%u, 0x%08llx)\n",
982 			__func__, i, gpa + i * sizeof(*e));
983 		return false;
984 	}
985 	if (nested_vmx_store_msr_check(vcpu, e)) {
986 		pr_debug_ratelimited(
987 			"%s check failed (%u, 0x%x, 0x%x)\n",
988 			__func__, i, e->index, e->reserved);
989 		return false;
990 	}
991 	return true;
992 }
993 
994 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
995 {
996 	u64 data;
997 	u32 i;
998 	struct vmx_msr_entry e;
999 	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);
1000 
1001 	for (i = 0; i < count; i++) {
1002 		if (unlikely(i >= max_msr_list_size))
1003 			return -EINVAL;
1004 
1005 		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
1006 			return -EINVAL;
1007 
1008 		if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
1009 			return -EINVAL;
1010 
1011 		if (kvm_vcpu_write_guest(vcpu,
1012 					 gpa + i * sizeof(e) +
1013 					     offsetof(struct vmx_msr_entry, value),
1014 					 &data, sizeof(data))) {
1015 			pr_debug_ratelimited(
1016 				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
1017 				__func__, i, e.index, data);
1018 			return -EINVAL;
1019 		}
1020 	}
1021 	return 0;
1022 }
1023 
1024 static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
1025 {
1026 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1027 	u32 count = vmcs12->vm_exit_msr_store_count;
1028 	u64 gpa = vmcs12->vm_exit_msr_store_addr;
1029 	struct vmx_msr_entry e;
1030 	u32 i;
1031 
1032 	for (i = 0; i < count; i++) {
1033 		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
1034 			return false;
1035 
1036 		if (e.index == msr_index)
1037 			return true;
1038 	}
1039 	return false;
1040 }
1041 
1042 static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
1043 					   u32 msr_index)
1044 {
1045 	struct vcpu_vmx *vmx = to_vmx(vcpu);
1046 	struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
1047 	bool in_vmcs12_store_list;
1048 	int msr_autostore_index;
1049 	bool in_autostore_list;
1050 	int last;
1051 
1052 	msr_autostore_index = vmx_find_msr_index(autostore, msr_index);
1053 	in_autostore_list = msr_autostore_index >= 0;
1054 	in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);
1055 
1056 	if (in_vmcs12_store_list && !in_autostore_list) {
1057 		if (autostore->nr == NR_LOADSTORE_MSRS) {
1058 			/*
1059 			 * Emulated VMEntry does not fail here.  Instead a less
1060 			 * accurate value will be returned by
1061 			 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
1062 			 * instead of reading the value from the vmcs02 VMExit
1063 			 * MSR-store area.
1064 			 */
1065 			pr_warn_ratelimited(
1066 				"Not enough msr entries in msr_autostore.  Can't add msr %x\n",
1067 				msr_index);
1068 			return;
1069 		}
1070 		last = autostore->nr++;
1071 		autostore->val[last].index = msr_index;
1072 	} else if (!in_vmcs12_store_list && in_autostore_list) {
1073 		last = --autostore->nr;
1074 		autostore->val[msr_autostore_index] = autostore->val[last];
1075 	}
1076 }
1077 
1078 static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
1079 {
1080 	unsigned long invalid_mask;
1081 
1082 	invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
1083 	return (val & invalid_mask) == 0;
1084 }
1085 
1086 /*
1087  * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are
1088  * emulating VM entry into a guest with EPT enabled.
1089  * Returns 0 on success, 1 on failure. Invalid state exit qualification code
1090  * is assigned to entry_failure_code on failure.
1091  */
1092 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
1093 			       u32 *entry_failure_code)
1094 {
1095 	if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
1096 		if (CC(!nested_cr3_valid(vcpu, cr3))) {
1097 			*entry_failure_code = ENTRY_FAIL_DEFAULT;
1098 			return -EINVAL;
1099 		}
1100 
1101 		/*
1102 		 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
1103 		 * must not be dereferenced.
1104 		 */
1105 		if (is_pae_paging(vcpu) && !nested_ept) {
1106 			if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
1107 				*entry_failure_code = ENTRY_FAIL_PDPTE;
1108 				return -EINVAL;
1109 			}
1110 		}
1111 	}
1112 
1113 	if (!nested_ept)
1114 		kvm_mmu_new_cr3(vcpu, cr3, false);
1115 
1116 	vcpu->arch.cr3 = cr3;
1117 	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
1118 
1119 	kvm_init_mmu(vcpu, false);
1120 
1121 	return 0;
1122 }
1123 
1124 /*
1125  * Returns if KVM is able to config CPU to tag TLB entries
1126  * populated by L2 differently than TLB entries populated
1127  * by L1.
1128  *
1129  * If L0 uses EPT, L1 and L2 run with different EPTP because
1130  * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
1131  * are tagged with different EPTP.
1132  *
1133  * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
1134  * with different VPID (L1 entries are tagged with vmx->vpid
1135  * while L2 entries are tagged with vmx->nested.vpid02).
1136  */
1137 static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
1138 {
1139 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1140 
1141 	return enable_ept ||
1142 	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
1143 }
1144 
1145 static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
1146 {
1147 	struct vcpu_vmx *vmx = to_vmx(vcpu);
1148 
1149 	return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
1150 }
1151 
/*
 * Return true if, within @mask, every bit set in @subset is also set in
 * @superset.  Bits outside @mask are ignored.
 */
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	/* Masked bits present in subset but missing from superset. */
	const u64 stray = subset & ~superset & mask;

	return stray == 0;
}
1159 
1160 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
1161 {
1162 	const u64 feature_and_reserved =
1163 		/* feature (except bit 48; see below) */
1164 		BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
1165 		/* reserved */
1166 		BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
1167 	u64 vmx_basic = vmx->nested.msrs.basic;
1168 
1169 	if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
1170 		return -EINVAL;
1171 
1172 	/*
1173 	 * KVM does not emulate a version of VMX that constrains physical
1174 	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
1175 	 */
1176 	if (data & BIT_ULL(48))
1177 		return -EINVAL;
1178 
1179 	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
1180 	    vmx_basic_vmcs_revision_id(data))
1181 		return -EINVAL;
1182 
1183 	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
1184 		return -EINVAL;
1185 
1186 	vmx->nested.msrs.basic = data;
1187 	return 0;
1188 }
1189 
1190 static int
1191 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1192 {
1193 	u64 supported;
1194 	u32 *lowp, *highp;
1195 
1196 	switch (msr_index) {
1197 	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1198 		lowp = &vmx->nested.msrs.pinbased_ctls_low;
1199 		highp = &vmx->nested.msrs.pinbased_ctls_high;
1200 		break;
1201 	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1202 		lowp = &vmx->nested.msrs.procbased_ctls_low;
1203 		highp = &vmx->nested.msrs.procbased_ctls_high;
1204 		break;
1205 	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1206 		lowp = &vmx->nested.msrs.exit_ctls_low;
1207 		highp = &vmx->nested.msrs.exit_ctls_high;
1208 		break;
1209 	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1210 		lowp = &vmx->nested.msrs.entry_ctls_low;
1211 		highp = &vmx->nested.msrs.entry_ctls_high;
1212 		break;
1213 	case MSR_IA32_VMX_PROCBASED_CTLS2:
1214 		lowp = &vmx->nested.msrs.secondary_ctls_low;
1215 		highp = &vmx->nested.msrs.secondary_ctls_high;
1216 		break;
1217 	default:
1218 		BUG();
1219 	}
1220 
1221 	supported = vmx_control_msr(*lowp, *highp);
1222 
1223 	/* Check must-be-1 bits are still 1. */
1224 	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
1225 		return -EINVAL;
1226 
1227 	/* Check must-be-0 bits are still 0. */
1228 	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
1229 		return -EINVAL;
1230 
1231 	*lowp = data;
1232 	*highp = data >> 32;
1233 	return 0;
1234 }
1235 
1236 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
1237 {
1238 	const u64 feature_and_reserved_bits =
1239 		/* feature */
1240 		BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
1241 		BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
1242 		/* reserved */
1243 		GENMASK_ULL(13, 9) | BIT_ULL(31);
1244 	u64 vmx_misc;
1245 
1246 	vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
1247 				   vmx->nested.msrs.misc_high);
1248 
1249 	if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
1250 		return -EINVAL;
1251 
1252 	if ((vmx->nested.msrs.pinbased_ctls_high &
1253 	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
1254 	    vmx_misc_preemption_timer_rate(data) !=
1255 	    vmx_misc_preemption_timer_rate(vmx_misc))
1256 		return -EINVAL;
1257 
1258 	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
1259 		return -EINVAL;
1260 
1261 	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
1262 		return -EINVAL;
1263 
1264 	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
1265 		return -EINVAL;
1266 
1267 	vmx->nested.msrs.misc_low = data;
1268 	vmx->nested.msrs.misc_high = data >> 32;
1269 
1270 	return 0;
1271 }
1272 
1273 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
1274 {
1275 	u64 vmx_ept_vpid_cap;
1276 
1277 	vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
1278 					   vmx->nested.msrs.vpid_caps);
1279 
1280 	/* Every bit is either reserved or a feature bit. */
1281 	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
1282 		return -EINVAL;
1283 
1284 	vmx->nested.msrs.ept_caps = data;
1285 	vmx->nested.msrs.vpid_caps = data >> 32;
1286 	return 0;
1287 }
1288 
1289 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1290 {
1291 	u64 *msr;
1292 
1293 	switch (msr_index) {
1294 	case MSR_IA32_VMX_CR0_FIXED0:
1295 		msr = &vmx->nested.msrs.cr0_fixed0;
1296 		break;
1297 	case MSR_IA32_VMX_CR4_FIXED0:
1298 		msr = &vmx->nested.msrs.cr4_fixed0;
1299 		break;
1300 	default:
1301 		BUG();
1302 	}
1303 
1304 	/*
1305 	 * 1 bits (which indicates bits which "must-be-1" during VMX operation)
1306 	 * must be 1 in the restored value.
1307 	 */
1308 	if (!is_bitwise_subset(data, *msr, -1ULL))
1309 		return -EINVAL;
1310 
1311 	*msr = data;
1312 	return 0;
1313 }
1314 
1315 /*
1316  * Called when userspace is restoring VMX MSRs.
1317  *
1318  * Returns 0 on success, non-0 otherwise.
1319  */
1320 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1321 {
1322 	struct vcpu_vmx *vmx = to_vmx(vcpu);
1323 
1324 	/*
1325 	 * Don't allow changes to the VMX capability MSRs while the vCPU
1326 	 * is in VMX operation.
1327 	 */
1328 	if (vmx->nested.vmxon)
1329 		return -EBUSY;
1330 
1331 	switch (msr_index) {
1332 	case MSR_IA32_VMX_BASIC:
1333 		return vmx_restore_vmx_basic(vmx, data);
1334 	case MSR_IA32_VMX_PINBASED_CTLS:
1335 	case MSR_IA32_VMX_PROCBASED_CTLS:
1336 	case MSR_IA32_VMX_EXIT_CTLS:
1337 	case MSR_IA32_VMX_ENTRY_CTLS:
1338 		/*
1339 		 * The "non-true" VMX capability MSRs are generated from the
1340 		 * "true" MSRs, so we do not support restoring them directly.
1341 		 *
1342 		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
1343 		 * should restore the "true" MSRs with the must-be-1 bits
1344 		 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
1345 		 * DEFAULT SETTINGS".
1346 		 */
1347 		return -EINVAL;
1348 	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1349 	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1350 	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1351 	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1352 	case MSR_IA32_VMX_PROCBASED_CTLS2:
1353 		return vmx_restore_control_msr(vmx, msr_index, data);
1354 	case MSR_IA32_VMX_MISC:
1355 		return vmx_restore_vmx_misc(vmx, data);
1356 	case MSR_IA32_VMX_CR0_FIXED0:
1357 	case MSR_IA32_VMX_CR4_FIXED0:
1358 		return vmx_restore_fixed0_msr(vmx, msr_index, data);
1359 	case MSR_IA32_VMX_CR0_FIXED1:
1360 	case MSR_IA32_VMX_CR4_FIXED1:
1361 		/*
1362 		 * These MSRs are generated based on the vCPU's CPUID, so we
1363 		 * do not support restoring them directly.
1364 		 */
1365 		return -EINVAL;
1366 	case MSR_IA32_VMX_EPT_VPID_CAP:
1367 		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
1368 	case MSR_IA32_VMX_VMCS_ENUM:
1369 		vmx->nested.msrs.vmcs_enum = data;
1370 		return 0;
1371 	case MSR_IA32_VMX_VMFUNC:
1372 		if (data & ~vmx->nested.msrs.vmfunc_controls)
1373 			return -EINVAL;
1374 		vmx->nested.msrs.vmfunc_controls = data;
1375 		return 0;
1376 	default:
1377 		/*
1378 		 * The rest of the VMX capability MSRs do not support restore.
1379 		 */
1380 		return -EINVAL;
1381 	}
1382 }
1383 
1384 /* Returns 0 on success, non-0 otherwise. */
1385 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
1386 {
1387 	switch (msr_index) {
1388 	case MSR_IA32_VMX_BASIC:
1389 		*pdata = msrs->basic;
1390 		break;
1391 	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1392 	case MSR_IA32_VMX_PINBASED_CTLS:
1393 		*pdata = vmx_control_msr(
1394 			msrs->pinbased_ctls_low,
1395 			msrs->pinbased_ctls_high);
1396 		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
1397 			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1398 		break;
1399 	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1400 	case MSR_IA32_VMX_PROCBASED_CTLS:
1401 		*pdata = vmx_control_msr(
1402 			msrs->procbased_ctls_low,
1403 			msrs->procbased_ctls_high);
1404 		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
1405 			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1406 		break;
1407 	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1408 	case MSR_IA32_VMX_EXIT_CTLS:
1409 		*pdata = vmx_control_msr(
1410 			msrs->exit_ctls_low,
1411 			msrs->exit_ctls_high);
1412 		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
1413 			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
1414 		break;
1415 	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1416 	case MSR_IA32_VMX_ENTRY_CTLS:
1417 		*pdata = vmx_control_msr(
1418 			msrs->entry_ctls_low,
1419 			msrs->entry_ctls_high);
1420 		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
1421 			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
1422 		break;
1423 	case MSR_IA32_VMX_MISC:
1424 		*pdata = vmx_control_msr(
1425 			msrs->misc_low,
1426 			msrs->misc_high);
1427 		break;
1428 	case MSR_IA32_VMX_CR0_FIXED0:
1429 		*pdata = msrs->cr0_fixed0;
1430 		break;
1431 	case MSR_IA32_VMX_CR0_FIXED1:
1432 		*pdata = msrs->cr0_fixed1;
1433 		break;
1434 	case MSR_IA32_VMX_CR4_FIXED0:
1435 		*pdata = msrs->cr4_fixed0;
1436 		break;
1437 	case MSR_IA32_VMX_CR4_FIXED1:
1438 		*pdata = msrs->cr4_fixed1;
1439 		break;
1440 	case MSR_IA32_VMX_VMCS_ENUM:
1441 		*pdata = msrs->vmcs_enum;
1442 		break;
1443 	case MSR_IA32_VMX_PROCBASED_CTLS2:
1444 		*pdata = vmx_control_msr(
1445 			msrs->secondary_ctls_low,
1446 			msrs->secondary_ctls_high);
1447 		break;
1448 	case MSR_IA32_VMX_EPT_VPID_CAP:
1449 		*pdata = msrs->ept_caps |
1450 			((u64)msrs->vpid_caps << 32);
1451 		break;
1452 	case MSR_IA32_VMX_VMFUNC:
1453 		*pdata = msrs->vmfunc_controls;
1454 		break;
1455 	default:
1456 		return 1;
1457 	}
1458 
1459 	return 0;
1460 }
1461 
1462 /*
1463  * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
1464  * been modified by the L1 guest.  Note, "writable" in this context means
1465  * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
1466  * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
1467  * VM-exit information fields (which are actually writable if the vCPU is
1468  * configured to support "VMWRITE to any supported field in the VMCS").
1469  */
1470 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
1471 {
1472 	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1473 	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1474 	struct shadow_vmcs_field field;
1475 	unsigned long val;
1476 	int i;
1477 
1478 	if (WARN_ON(!shadow_vmcs))
1479 		return;
1480 
1481 	preempt_disable();
1482 
1483 	vmcs_load(shadow_vmcs);
1484 
1485 	for (i = 0; i < max_shadow_read_write_fields; i++) {
1486 		field = shadow_read_write_fields[i];
1487 		val = __vmcs_readl(field.encoding);
1488 		vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
1489 	}
1490 
1491 	vmcs_clear(shadow_vmcs);
1492 	vmcs_load(vmx->loaded_vmcs->vmcs);
1493 
1494 	preempt_enable();
1495 }
1496 
1497 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
1498 {
1499 	const struct shadow_vmcs_field *fields[] = {
1500 		shadow_read_write_fields,
1501 		shadow_read_only_fields
1502 	};
1503 	const int max_fields[] = {
1504 		max_shadow_read_write_fields,
1505 		max_shadow_read_only_fields
1506 	};
1507 	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1508 	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
1509 	struct shadow_vmcs_field field;
1510 	unsigned long val;
1511 	int i, q;
1512 
1513 	if (WARN_ON(!shadow_vmcs))
1514 		return;
1515 
1516 	vmcs_load(shadow_vmcs);
1517 
1518 	for (q = 0; q < ARRAY_SIZE(fields); q++) {
1519 		for (i = 0; i < max_fields[q]; i++) {
1520 			field = fields[q][i];
1521 			val = vmcs12_read_any(vmcs12, field.encoding,
1522 					      field.offset);
1523 			__vmcs_writel(field.encoding, val);
1524 		}
1525 	}
1526 
1527 	vmcs_clear(shadow_vmcs);
1528 	vmcs_load(vmx->loaded_vmcs->vmcs);
1529 }
1530 
1531 static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
1532 {
1533 	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1534 	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1535 
1536 	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
1537 	vmcs12->tpr_threshold = evmcs->tpr_threshold;
1538 	vmcs12->guest_rip = evmcs->guest_rip;
1539 
1540 	if (unlikely(!(evmcs->hv_clean_fields &
1541 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
1542 		vmcs12->guest_rsp = evmcs->guest_rsp;
1543 		vmcs12->guest_rflags = evmcs->guest_rflags;
1544 		vmcs12->guest_interruptibility_info =
1545 			evmcs->guest_interruptibility_info;
1546 	}
1547 
1548 	if (unlikely(!(evmcs->hv_clean_fields &
1549 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
1550 		vmcs12->cpu_based_vm_exec_control =
1551 			evmcs->cpu_based_vm_exec_control;
1552 	}
1553 
1554 	if (unlikely(!(evmcs->hv_clean_fields &
1555 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
1556 		vmcs12->exception_bitmap = evmcs->exception_bitmap;
1557 	}
1558 
1559 	if (unlikely(!(evmcs->hv_clean_fields &
1560 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
1561 		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
1562 	}
1563 
1564 	if (unlikely(!(evmcs->hv_clean_fields &
1565 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
1566 		vmcs12->vm_entry_intr_info_field =
1567 			evmcs->vm_entry_intr_info_field;
1568 		vmcs12->vm_entry_exception_error_code =
1569 			evmcs->vm_entry_exception_error_code;
1570 		vmcs12->vm_entry_instruction_len =
1571 			evmcs->vm_entry_instruction_len;
1572 	}
1573 
1574 	if (unlikely(!(evmcs->hv_clean_fields &
1575 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
1576 		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
1577 		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
1578 		vmcs12->host_cr0 = evmcs->host_cr0;
1579 		vmcs12->host_cr3 = evmcs->host_cr3;
1580 		vmcs12->host_cr4 = evmcs->host_cr4;
1581 		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
1582 		vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
1583 		vmcs12->host_rip = evmcs->host_rip;
1584 		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
1585 		vmcs12->host_es_selector = evmcs->host_es_selector;
1586 		vmcs12->host_cs_selector = evmcs->host_cs_selector;
1587 		vmcs12->host_ss_selector = evmcs->host_ss_selector;
1588 		vmcs12->host_ds_selector = evmcs->host_ds_selector;
1589 		vmcs12->host_fs_selector = evmcs->host_fs_selector;
1590 		vmcs12->host_gs_selector = evmcs->host_gs_selector;
1591 		vmcs12->host_tr_selector = evmcs->host_tr_selector;
1592 	}
1593 
1594 	if (unlikely(!(evmcs->hv_clean_fields &
1595 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
1596 		vmcs12->pin_based_vm_exec_control =
1597 			evmcs->pin_based_vm_exec_control;
1598 		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
1599 		vmcs12->secondary_vm_exec_control =
1600 			evmcs->secondary_vm_exec_control;
1601 	}
1602 
1603 	if (unlikely(!(evmcs->hv_clean_fields &
1604 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
1605 		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
1606 		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
1607 	}
1608 
1609 	if (unlikely(!(evmcs->hv_clean_fields &
1610 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
1611 		vmcs12->msr_bitmap = evmcs->msr_bitmap;
1612 	}
1613 
1614 	if (unlikely(!(evmcs->hv_clean_fields &
1615 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
1616 		vmcs12->guest_es_base = evmcs->guest_es_base;
1617 		vmcs12->guest_cs_base = evmcs->guest_cs_base;
1618 		vmcs12->guest_ss_base = evmcs->guest_ss_base;
1619 		vmcs12->guest_ds_base = evmcs->guest_ds_base;
1620 		vmcs12->guest_fs_base = evmcs->guest_fs_base;
1621 		vmcs12->guest_gs_base = evmcs->guest_gs_base;
1622 		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
1623 		vmcs12->guest_tr_base = evmcs->guest_tr_base;
1624 		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
1625 		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
1626 		vmcs12->guest_es_limit = evmcs->guest_es_limit;
1627 		vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
1628 		vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
1629 		vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
1630 		vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
1631 		vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
1632 		vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
1633 		vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
1634 		vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
1635 		vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
1636 		vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
1637 		vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
1638 		vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
1639 		vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
1640 		vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
1641 		vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
1642 		vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
1643 		vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
1644 		vmcs12->guest_es_selector = evmcs->guest_es_selector;
1645 		vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
1646 		vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
1647 		vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
1648 		vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
1649 		vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
1650 		vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
1651 		vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
1652 	}
1653 
1654 	if (unlikely(!(evmcs->hv_clean_fields &
1655 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
1656 		vmcs12->tsc_offset = evmcs->tsc_offset;
1657 		vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
1658 		vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
1659 	}
1660 
1661 	if (unlikely(!(evmcs->hv_clean_fields &
1662 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
1663 		vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
1664 		vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
1665 		vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
1666 		vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
1667 		vmcs12->guest_cr0 = evmcs->guest_cr0;
1668 		vmcs12->guest_cr3 = evmcs->guest_cr3;
1669 		vmcs12->guest_cr4 = evmcs->guest_cr4;
1670 		vmcs12->guest_dr7 = evmcs->guest_dr7;
1671 	}
1672 
1673 	if (unlikely(!(evmcs->hv_clean_fields &
1674 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
1675 		vmcs12->host_fs_base = evmcs->host_fs_base;
1676 		vmcs12->host_gs_base = evmcs->host_gs_base;
1677 		vmcs12->host_tr_base = evmcs->host_tr_base;
1678 		vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
1679 		vmcs12->host_idtr_base = evmcs->host_idtr_base;
1680 		vmcs12->host_rsp = evmcs->host_rsp;
1681 	}
1682 
1683 	if (unlikely(!(evmcs->hv_clean_fields &
1684 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
1685 		vmcs12->ept_pointer = evmcs->ept_pointer;
1686 		vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
1687 	}
1688 
1689 	if (unlikely(!(evmcs->hv_clean_fields &
1690 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
1691 		vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
1692 		vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
1693 		vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
1694 		vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
1695 		vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
1696 		vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
1697 		vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
1698 		vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
1699 		vmcs12->guest_pending_dbg_exceptions =
1700 			evmcs->guest_pending_dbg_exceptions;
1701 		vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
1702 		vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
1703 		vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
1704 		vmcs12->guest_activity_state = evmcs->guest_activity_state;
1705 		vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
1706 	}
1707 
1708 	/*
1709 	 * Not used?
1710 	 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
1711 	 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
1712 	 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
1713 	 * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
1714 	 * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
1715 	 * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
1716 	 * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
1717 	 * vmcs12->page_fault_error_code_mask =
1718 	 *		evmcs->page_fault_error_code_mask;
1719 	 * vmcs12->page_fault_error_code_match =
1720 	 *		evmcs->page_fault_error_code_match;
1721 	 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
1722 	 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
1723 	 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
1724 	 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
1725 	 */
1726 
1727 	/*
1728 	 * Read only fields:
1729 	 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
1730 	 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
1731 	 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
1732 	 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
1733 	 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
1734 	 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
1735 	 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
1736 	 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
1737 	 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
1738 	 * vmcs12->exit_qualification = evmcs->exit_qualification;
1739 	 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
1740 	 *
1741 	 * Not present in struct vmcs12:
1742 	 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
1743 	 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
1744 	 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
1745 	 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
1746 	 */
1747 
1748 	return 0;
1749 }
1750 
1751 static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
1752 {
1753 	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1754 	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1755 
1756 	/*
1757 	 * Should not be changed by KVM:
1758 	 *
1759 	 * evmcs->host_es_selector = vmcs12->host_es_selector;
1760 	 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
1761 	 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
1762 	 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
1763 	 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
1764 	 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
1765 	 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
1766 	 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
1767 	 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
1768 	 * evmcs->host_cr0 = vmcs12->host_cr0;
1769 	 * evmcs->host_cr3 = vmcs12->host_cr3;
1770 	 * evmcs->host_cr4 = vmcs12->host_cr4;
1771 	 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
1772 	 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
1773 	 * evmcs->host_rip = vmcs12->host_rip;
1774 	 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
1775 	 * evmcs->host_fs_base = vmcs12->host_fs_base;
1776 	 * evmcs->host_gs_base = vmcs12->host_gs_base;
1777 	 * evmcs->host_tr_base = vmcs12->host_tr_base;
1778 	 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
1779 	 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
1780 	 * evmcs->host_rsp = vmcs12->host_rsp;
1781 	 * sync_vmcs02_to_vmcs12() doesn't read these:
1782 	 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
1783 	 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
1784 	 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
1785 	 * evmcs->ept_pointer = vmcs12->ept_pointer;
1786 	 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
1787 	 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
1788 	 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
1789 	 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
1790 	 * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
1791 	 * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
1792 	 * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
1793 	 * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
1794 	 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
1795 	 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
1796 	 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
1797 	 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
1798 	 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
1799 	 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
1800 	 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
1801 	 * evmcs->page_fault_error_code_mask =
1802 	 *		vmcs12->page_fault_error_code_mask;
1803 	 * evmcs->page_fault_error_code_match =
1804 	 *		vmcs12->page_fault_error_code_match;
1805 	 * evmcs->cr3_target_count = vmcs12->cr3_target_count;
1806 	 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
1807 	 * evmcs->tsc_offset = vmcs12->tsc_offset;
1808 	 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
1809 	 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
1810 	 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
1811 	 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
1812 	 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
1813 	 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
1814 	 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
1815 	 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
1816 	 *
1817 	 * Not present in struct vmcs12:
1818 	 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
1819 	 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
1820 	 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
1821 	 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
1822 	 */
1823 
1824 	evmcs->guest_es_selector = vmcs12->guest_es_selector;
1825 	evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
1826 	evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
1827 	evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
1828 	evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
1829 	evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
1830 	evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
1831 	evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
1832 
1833 	evmcs->guest_es_limit = vmcs12->guest_es_limit;
1834 	evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
1835 	evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
1836 	evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
1837 	evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
1838 	evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
1839 	evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
1840 	evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
1841 	evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
1842 	evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
1843 
1844 	evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
1845 	evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
1846 	evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
1847 	evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
1848 	evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
1849 	evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
1850 	evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
1851 	evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
1852 
1853 	evmcs->guest_es_base = vmcs12->guest_es_base;
1854 	evmcs->guest_cs_base = vmcs12->guest_cs_base;
1855 	evmcs->guest_ss_base = vmcs12->guest_ss_base;
1856 	evmcs->guest_ds_base = vmcs12->guest_ds_base;
1857 	evmcs->guest_fs_base = vmcs12->guest_fs_base;
1858 	evmcs->guest_gs_base = vmcs12->guest_gs_base;
1859 	evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
1860 	evmcs->guest_tr_base = vmcs12->guest_tr_base;
1861 	evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
1862 	evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
1863 
1864 	evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
1865 	evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
1866 
1867 	evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
1868 	evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
1869 	evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
1870 	evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
1871 
1872 	evmcs->guest_pending_dbg_exceptions =
1873 		vmcs12->guest_pending_dbg_exceptions;
1874 	evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
1875 	evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
1876 
1877 	evmcs->guest_activity_state = vmcs12->guest_activity_state;
1878 	evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
1879 
1880 	evmcs->guest_cr0 = vmcs12->guest_cr0;
1881 	evmcs->guest_cr3 = vmcs12->guest_cr3;
1882 	evmcs->guest_cr4 = vmcs12->guest_cr4;
1883 	evmcs->guest_dr7 = vmcs12->guest_dr7;
1884 
1885 	evmcs->guest_physical_address = vmcs12->guest_physical_address;
1886 
1887 	evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
1888 	evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
1889 	evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
1890 	evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
1891 	evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
1892 	evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
1893 	evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
1894 	evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
1895 
1896 	evmcs->exit_qualification = vmcs12->exit_qualification;
1897 
1898 	evmcs->guest_linear_address = vmcs12->guest_linear_address;
1899 	evmcs->guest_rsp = vmcs12->guest_rsp;
1900 	evmcs->guest_rflags = vmcs12->guest_rflags;
1901 
1902 	evmcs->guest_interruptibility_info =
1903 		vmcs12->guest_interruptibility_info;
1904 	evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
1905 	evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
1906 	evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
1907 	evmcs->vm_entry_exception_error_code =
1908 		vmcs12->vm_entry_exception_error_code;
1909 	evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
1910 
1911 	evmcs->guest_rip = vmcs12->guest_rip;
1912 
1913 	evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
1914 
1915 	return 0;
1916 }
1917 
1918 /*
1919  * This is an equivalent of the nested hypervisor executing the vmptrld
1920  * instruction.
1921  */
1922 static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
1923 						 bool from_launch)
1924 {
1925 	struct vcpu_vmx *vmx = to_vmx(vcpu);
1926 	bool evmcs_gpa_changed = false;
1927 	u64 evmcs_gpa;
1928 
1929 	if (likely(!vmx->nested.enlightened_vmcs_enabled))
1930 		return 1;
1931 
1932 	if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
1933 		return 1;
1934 
1935 	if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
1936 		if (!vmx->nested.hv_evmcs)
1937 			vmx->nested.current_vmptr = -1ull;
1938 
1939 		nested_release_evmcs(vcpu);
1940 
1941 		if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
1942 				 &vmx->nested.hv_evmcs_map))
1943 			return 0;
1944 
1945 		vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
1946 
1947 		/*
1948 		 * Currently, KVM only supports eVMCS version 1
1949 		 * (== KVM_EVMCS_VERSION) and thus we expect guest to set this
1950 		 * value to first u32 field of eVMCS which should specify eVMCS
1951 		 * VersionNumber.
1952 		 *
1953 		 * Guest should be aware of supported eVMCS versions by host by
1954 		 * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is
1955 		 * expected to set this CPUID leaf according to the value
1956 		 * returned in vmcs_version from nested_enable_evmcs().
1957 		 *
1958 		 * However, it turns out that Microsoft Hyper-V fails to comply
1959 		 * to their own invented interface: When Hyper-V use eVMCS, it
1960 		 * just sets first u32 field of eVMCS to revision_id specified
1961 		 * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number
1962 		 * which is one of the supported versions specified in
1963 		 * CPUID.0x4000000A.EAX[0:15].
1964 		 *
1965 		 * To overcome Hyper-V bug, we accept here either a supported
1966 		 * eVMCS version or VMCS12 revision_id as valid values for first
1967 		 * u32 field of eVMCS.
1968 		 */
1969 		if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
1970 		    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
1971 			nested_release_evmcs(vcpu);
1972 			return 0;
1973 		}
1974 
1975 		vmx->nested.dirty_vmcs12 = true;
1976 		vmx->nested.hv_evmcs_vmptr = evmcs_gpa;
1977 
1978 		evmcs_gpa_changed = true;
1979 		/*
1980 		 * Unlike normal vmcs12, enlightened vmcs12 is not fully
1981 		 * reloaded from guest's memory (read only fields, fields not
1982 		 * present in struct hv_enlightened_vmcs, ...). Make sure there
1983 		 * are no leftovers.
1984 		 */
1985 		if (from_launch) {
1986 			struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1987 			memset(vmcs12, 0, sizeof(*vmcs12));
1988 			vmcs12->hdr.revision_id = VMCS12_REVISION;
1989 		}
1990 
1991 	}
1992 
1993 	/*
1994 	 * Clean fields data can't de used on VMLAUNCH and when we switch
1995 	 * between different L2 guests as KVM keeps a single VMCS12 per L1.
1996 	 */
1997 	if (from_launch || evmcs_gpa_changed)
1998 		vmx->nested.hv_evmcs->hv_clean_fields &=
1999 			~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2000 
2001 	return 1;
2002 }
2003 
2004 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
2005 {
2006 	struct vcpu_vmx *vmx = to_vmx(vcpu);
2007 
2008 	/*
2009 	 * hv_evmcs may end up being not mapped after migration (when
2010 	 * L2 was running), map it here to make sure vmcs12 changes are
2011 	 * properly reflected.
2012 	 */
2013 	if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs)
2014 		nested_vmx_handle_enlightened_vmptrld(vcpu, false);
2015 
2016 	if (vmx->nested.hv_evmcs) {
2017 		copy_vmcs12_to_enlightened(vmx);
2018 		/* All fields are clean */
2019 		vmx->nested.hv_evmcs->hv_clean_fields |=
2020 			HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
2021 	} else {
2022 		copy_vmcs12_to_shadow(vmx);
2023 	}
2024 
2025 	vmx->nested.need_vmcs12_to_shadow_sync = false;
2026 }
2027 
2028 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
2029 {
2030 	struct vcpu_vmx *vmx =
2031 		container_of(timer, struct vcpu_vmx, nested.preemption_timer);
2032 
2033 	vmx->nested.preemption_timer_expired = true;
2034 	kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
2035 	kvm_vcpu_kick(&vmx->vcpu);
2036 
2037 	return HRTIMER_NORESTART;
2038 }
2039 
2040 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
2041 {
2042 	u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
2043 	struct vcpu_vmx *vmx = to_vmx(vcpu);
2044 
2045 	/*
2046 	 * A timer value of zero is architecturally guaranteed to cause
2047 	 * a VMExit prior to executing any instructions in the guest.
2048 	 */
2049 	if (preemption_timeout == 0) {
2050 		vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
2051 		return;
2052 	}
2053 
2054 	if (vcpu->arch.virtual_tsc_khz == 0)
2055 		return;
2056 
2057 	preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
2058 	preemption_timeout *= 1000000;
2059 	do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
2060 	hrtimer_start(&vmx->nested.preemption_timer,
2061 		      ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
2062 }
2063 
2064 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2065 {
2066 	if (vmx->nested.nested_run_pending &&
2067 	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
2068 		return vmcs12->guest_ia32_efer;
2069 	else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
2070 		return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
2071 	else
2072 		return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
2073 }
2074 
2075 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
2076 {
2077 	/*
2078 	 * If vmcs02 hasn't been initialized, set the constant vmcs02 state
2079 	 * according to L0's settings (vmcs12 is irrelevant here).  Host
2080 	 * fields that come from L0 and are not constant, e.g. HOST_CR3,
2081 	 * will be set as needed prior to VMLAUNCH/VMRESUME.
2082 	 */
2083 	if (vmx->nested.vmcs02_initialized)
2084 		return;
2085 	vmx->nested.vmcs02_initialized = true;
2086 
2087 	/*
2088 	 * We don't care what the EPTP value is we just need to guarantee
2089 	 * it's valid so we don't get a false positive when doing early
2090 	 * consistency checks.
2091 	 */
2092 	if (enable_ept && nested_early_check)
2093 		vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0));
2094 
2095 	/* All VMFUNCs are currently emulated through L0 vmexits.  */
2096 	if (cpu_has_vmx_vmfunc())
2097 		vmcs_write64(VM_FUNCTION_CONTROL, 0);
2098 
2099 	if (cpu_has_vmx_posted_intr())
2100 		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
2101 
2102 	if (cpu_has_vmx_msr_bitmap())
2103 		vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
2104 
2105 	/*
2106 	 * The PML address never changes, so it is constant in vmcs02.
2107 	 * Conceptually we want to copy the PML index from vmcs01 here,
2108 	 * and then back to vmcs01 on nested vmexit.  But since we flush
2109 	 * the log and reset GUEST_PML_INDEX on each vmexit, the PML
2110 	 * index is also effectively constant in vmcs02.
2111 	 */
2112 	if (enable_pml) {
2113 		vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
2114 		vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
2115 	}
2116 
2117 	if (cpu_has_vmx_encls_vmexit())
2118 		vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
2119 
2120 	/*
2121 	 * Set the MSR load/store lists to match L0's settings.  Only the
2122 	 * addresses are constant (for vmcs02), the counts can change based
2123 	 * on L2's behavior, e.g. switching to/from long mode.
2124 	 */
2125 	vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
2126 	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
2127 	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
2128 
2129 	vmx_set_constant_host_state(vmx);
2130 }
2131 
2132 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx,
2133 				      struct vmcs12 *vmcs12)
2134 {
2135 	prepare_vmcs02_constant_state(vmx);
2136 
2137 	vmcs_write64(VMCS_LINK_POINTER, -1ull);
2138 
2139 	if (enable_vpid) {
2140 		if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
2141 			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
2142 		else
2143 			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2144 	}
2145 }
2146 
2147 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2148 {
2149 	u32 exec_control, vmcs12_exec_ctrl;
2150 	u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
2151 
2152 	if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
2153 		prepare_vmcs02_early_rare(vmx, vmcs12);
2154 
2155 	/*
2156 	 * PIN CONTROLS
2157 	 */
2158 	exec_control = vmx_pin_based_exec_ctrl(vmx);
2159 	exec_control |= (vmcs12->pin_based_vm_exec_control &
2160 			 ~PIN_BASED_VMX_PREEMPTION_TIMER);
2161 
2162 	/* Posted interrupts setting is only taken from vmcs12.  */
2163 	if (nested_cpu_has_posted_intr(vmcs12)) {
2164 		vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
2165 		vmx->nested.pi_pending = false;
2166 	} else {
2167 		exec_control &= ~PIN_BASED_POSTED_INTR;
2168 	}
2169 	pin_controls_set(vmx, exec_control);
2170 
2171 	/*
2172 	 * EXEC CONTROLS
2173 	 */
2174 	exec_control = vmx_exec_control(vmx); /* L0's desires */
2175 	exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2176 	exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2177 	exec_control &= ~CPU_BASED_TPR_SHADOW;
2178 	exec_control |= vmcs12->cpu_based_vm_exec_control;
2179 
2180 	vmx->nested.l1_tpr_threshold = -1;
2181 	if (exec_control & CPU_BASED_TPR_SHADOW)
2182 		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
2183 #ifdef CONFIG_X86_64
2184 	else
2185 		exec_control |= CPU_BASED_CR8_LOAD_EXITING |
2186 				CPU_BASED_CR8_STORE_EXITING;
2187 #endif
2188 
2189 	/*
2190 	 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
2191 	 * for I/O port accesses.
2192 	 */
2193 	exec_control |= CPU_BASED_UNCOND_IO_EXITING;
2194 	exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
2195 
2196 	/*
2197 	 * This bit will be computed in nested_get_vmcs12_pages, because
2198 	 * we do not have access to L1's MSR bitmap yet.  For now, keep
2199 	 * the same bit as before, hoping to avoid multiple VMWRITEs that
2200 	 * only set/clear this bit.
2201 	 */
2202 	exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
2203 	exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS;
2204 
2205 	exec_controls_set(vmx, exec_control);
2206 
2207 	/*
2208 	 * SECONDARY EXEC CONTROLS
2209 	 */
2210 	if (cpu_has_secondary_exec_ctrls()) {
2211 		exec_control = vmx->secondary_exec_control;
2212 
2213 		/* Take the following fields only from vmcs12 */
2214 		exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2215 				  SECONDARY_EXEC_ENABLE_INVPCID |
2216 				  SECONDARY_EXEC_RDTSCP |
2217 				  SECONDARY_EXEC_XSAVES |
2218 				  SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
2219 				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2220 				  SECONDARY_EXEC_APIC_REGISTER_VIRT |
2221 				  SECONDARY_EXEC_ENABLE_VMFUNC);
2222 		if (nested_cpu_has(vmcs12,
2223 				   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
2224 			vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
2225 				~SECONDARY_EXEC_ENABLE_PML;
2226 			exec_control |= vmcs12_exec_ctrl;
2227 		}
2228 
2229 		/* VMCS shadowing for L2 is emulated for now */
2230 		exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
2231 
2232 		/*
2233 		 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4()
2234 		 * will not have to rewrite the controls just for this bit.
2235 		 */
2236 		if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() &&
2237 		    (vmcs12->guest_cr4 & X86_CR4_UMIP))
2238 			exec_control |= SECONDARY_EXEC_DESC;
2239 
2240 		if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
2241 			vmcs_write16(GUEST_INTR_STATUS,
2242 				vmcs12->guest_intr_status);
2243 
2244 		secondary_exec_controls_set(vmx, exec_control);
2245 	}
2246 
2247 	/*
2248 	 * ENTRY CONTROLS
2249 	 *
2250 	 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
2251 	 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
2252 	 * on the related bits (if supported by the CPU) in the hope that
2253 	 * we can avoid VMWrites during vmx_set_efer().
2254 	 */
2255 	exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) &
2256 			~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
2257 	if (cpu_has_load_ia32_efer()) {
2258 		if (guest_efer & EFER_LMA)
2259 			exec_control |= VM_ENTRY_IA32E_MODE;
2260 		if (guest_efer != host_efer)
2261 			exec_control |= VM_ENTRY_LOAD_IA32_EFER;
2262 	}
2263 	vm_entry_controls_set(vmx, exec_control);
2264 
2265 	/*
2266 	 * EXIT CONTROLS
2267 	 *
2268 	 * L2->L1 exit controls are emulated - the hardware exit is to L0 so
2269 	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
2270 	 * bits may be modified by vmx_set_efer() in prepare_vmcs02().
2271 	 */
2272 	exec_control = vmx_vmexit_ctrl();
2273 	if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
2274 		exec_control |= VM_EXIT_LOAD_IA32_EFER;
2275 	vm_exit_controls_set(vmx, exec_control);
2276 
2277 	/*
2278 	 * Interrupt/Exception Fields
2279 	 */
2280 	if (vmx->nested.nested_run_pending) {
2281 		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2282 			     vmcs12->vm_entry_intr_info_field);
2283 		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2284 			     vmcs12->vm_entry_exception_error_code);
2285 		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2286 			     vmcs12->vm_entry_instruction_len);
2287 		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2288 			     vmcs12->guest_interruptibility_info);
2289 		vmx->loaded_vmcs->nmi_known_unmasked =
2290 			!(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
2291 	} else {
2292 		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
2293 	}
2294 }
2295 
2296 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2297 {
2298 	struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2299 
2300 	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2301 			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
2302 		vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
2303 		vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
2304 		vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
2305 		vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
2306 		vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
2307 		vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
2308 		vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
2309 		vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
2310 		vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
2311 		vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
2312 		vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
2313 		vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
2314 		vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
2315 		vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
2316 		vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
2317 		vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
2318 		vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
2319 		vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
2320 		vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
2321 		vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
2322 		vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
2323 		vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
2324 		vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
2325 		vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
2326 		vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
2327 		vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
2328 		vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
2329 		vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
2330 		vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
2331 		vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
2332 		vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
2333 		vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
2334 		vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
2335 		vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
2336 		vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
2337 		vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
2338 	}
2339 
2340 	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2341 			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
2342 		vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
2343 		vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
2344 			    vmcs12->guest_pending_dbg_exceptions);
2345 		vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
2346 		vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
2347 
2348 		/*
2349 		 * L1 may access the L2's PDPTR, so save them to construct
2350 		 * vmcs12
2351 		 */
2352 		if (enable_ept) {
2353 			vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2354 			vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2355 			vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2356 			vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2357 		}
2358 
2359 		if (kvm_mpx_supported() && vmx->nested.nested_run_pending &&
2360 		    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
2361 			vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
2362 	}
2363 
2364 	if (nested_cpu_has_xsaves(vmcs12))
2365 		vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
2366 
2367 	/*
2368 	 * Whether page-faults are trapped is determined by a combination of
2369 	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
2370 	 * If enable_ept, L0 doesn't care about page faults and we should
2371 	 * set all of these to L1's desires. However, if !enable_ept, L0 does
2372 	 * care about (at least some) page faults, and because it is not easy
2373 	 * (if at all possible?) to merge L0 and L1's desires, we simply ask
2374 	 * to exit on each and every L2 page fault. This is done by setting
2375 	 * MASK=MATCH=0 and (see below) EB.PF=1.
2376 	 * Note that below we don't need special code to set EB.PF beyond the
2377 	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
2378 	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
2379 	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
2380 	 */
2381 	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
2382 		enable_ept ? vmcs12->page_fault_error_code_mask : 0);
2383 	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
2384 		enable_ept ? vmcs12->page_fault_error_code_match : 0);
2385 
2386 	if (cpu_has_vmx_apicv()) {
2387 		vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
2388 		vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
2389 		vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
2390 		vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
2391 	}
2392 
2393 	/*
2394 	 * Make sure the msr_autostore list is up to date before we set the
2395 	 * count in the vmcs02.
2396 	 */
2397 	prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
2398 
2399 	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
2400 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2401 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2402 
2403 	set_cr4_guest_host_mask(vmx);
2404 }
2405 
2406 /*
2407  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
2408  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
2409  * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
2410  * guest in a way that will both be appropriate to L1's requests, and our
2411  * needs. In addition to modifying the active vmcs (which is vmcs02), this
2412  * function also has additional necessary side-effects, like setting various
2413  * vcpu->arch fields.
2414  * Returns 0 on success, 1 on failure. Invalid state exit qualification code
2415  * is assigned to entry_failure_code on failure.
2416  */
2417 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2418 			  u32 *entry_failure_code)
2419 {
2420 	struct vcpu_vmx *vmx = to_vmx(vcpu);
2421 	struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2422 	bool load_guest_pdptrs_vmcs12 = false;
2423 
2424 	if (vmx->nested.dirty_vmcs12 || hv_evmcs) {
2425 		prepare_vmcs02_rare(vmx, vmcs12);
2426 		vmx->nested.dirty_vmcs12 = false;
2427 
2428 		load_guest_pdptrs_vmcs12 = !hv_evmcs ||
2429 			!(hv_evmcs->hv_clean_fields &
2430 			  HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1);
2431 	}
2432 
2433 	if (vmx->nested.nested_run_pending &&
2434 	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
2435 		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
2436 		vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
2437 	} else {
2438 		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
2439 		vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
2440 	}
2441 	if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
2442 	    !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
2443 		vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
2444 	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
2445 
2446 	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
2447 	 * bitwise-or of what L1 wants to trap for L2, and what we want to
2448 	 * trap. Note that CR0.TS also needs updating - we do this later.
2449 	 */
2450 	update_exception_bitmap(vcpu);
2451 	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
2452 	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2453 
2454 	if (vmx->nested.nested_run_pending &&
2455 	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
2456 		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
2457 		vcpu->arch.pat = vmcs12->guest_ia32_pat;
2458 	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2459 		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
2460 	}
2461 
2462 	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
2463 
2464 	if (kvm_has_tsc_control)
2465 		decache_tsc_multiplier(vmx);
2466 
2467 	if (enable_vpid) {
2468 		/*
2469 		 * There is no direct mapping between vpid02 and vpid12, the
2470 		 * vpid02 is per-vCPU for L0 and reused while the value of
2471 		 * vpid12 is changed w/ one invvpid during nested vmentry.
2472 		 * The vpid12 is allocated by L1 for L2, so it will not
2473 		 * influence global bitmap(for vpid01 and vpid02 allocation)
2474 		 * even if spawn a lot of nested vCPUs.
2475 		 */
2476 		if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
2477 			if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
2478 				vmx->nested.last_vpid = vmcs12->virtual_processor_id;
2479 				__vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
2480 			}
2481 		} else {
2482 			/*
2483 			 * If L1 use EPT, then L0 needs to execute INVEPT on
2484 			 * EPTP02 instead of EPTP01. Therefore, delay TLB
2485 			 * flush until vmcs02->eptp is fully updated by
2486 			 * KVM_REQ_LOAD_CR3. Note that this assumes
2487 			 * KVM_REQ_TLB_FLUSH is evaluated after
2488 			 * KVM_REQ_LOAD_CR3 in vcpu_enter_guest().
2489 			 */
2490 			kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2491 		}
2492 	}
2493 
2494 	if (nested_cpu_has_ept(vmcs12))
2495 		nested_ept_init_mmu_context(vcpu);
2496 
2497 	/*
2498 	 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
2499 	 * bits which we consider mandatory enabled.
2500 	 * The CR0_READ_SHADOW is what L2 should have expected to read given
2501 	 * the specifications by L1; It's not enough to take
2502 	 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we
2503 	 * have more bits than L1 expected.
2504 	 */
2505 	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
2506 	vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2507 
2508 	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
2509 	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
2510 
2511 	vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
2512 	/* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
2513 	vmx_set_efer(vcpu, vcpu->arch.efer);
2514 
2515 	/*
2516 	 * Guest state is invalid and unrestricted guest is disabled,
2517 	 * which means L1 attempted VMEntry to L2 with invalid state.
2518 	 * Fail the VMEntry.
2519 	 */
2520 	if (vmx->emulation_required) {
2521 		*entry_failure_code = ENTRY_FAIL_DEFAULT;
2522 		return -EINVAL;
2523 	}
2524 
2525 	/* Shadow page tables on either EPT or shadow page tables. */
2526 	if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
2527 				entry_failure_code))
2528 		return -EINVAL;
2529 
2530 	/*
2531 	 * Immediately write vmcs02.GUEST_CR3.  It will be propagated to vmcs12
2532 	 * on nested VM-Exit, which can occur without actually running L2 and
2533 	 * thus without hitting vmx_set_cr3(), e.g. if L1 is entering L2 with
2534 	 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
2535 	 * transition to HLT instead of running L2.
2536 	 */
2537 	if (enable_ept)
2538 		vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
2539 
2540 	/* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
2541 	if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
2542 	    is_pae_paging(vcpu)) {
2543 		vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2544 		vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2545 		vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2546 		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2547 	}
2548 
2549 	if (!enable_ept)
2550 		vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
2551 
2552 	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2553 	    SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
2554 			    vmcs12->guest_ia32_perf_global_ctrl))
2555 		return -EINVAL;
2556 
2557 	kvm_rsp_write(vcpu, vmcs12->guest_rsp);
2558 	kvm_rip_write(vcpu, vmcs12->guest_rip);
2559 	return 0;
2560 }
2561 
2562 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
2563 {
2564 	if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
2565 	       nested_cpu_has_virtual_nmis(vmcs12)))
2566 		return -EINVAL;
2567 
2568 	if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
2569 	       nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING)))
2570 		return -EINVAL;
2571 
2572 	return 0;
2573 }
2574 
2575 static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
2576 {
2577 	struct vcpu_vmx *vmx = to_vmx(vcpu);
2578 	int maxphyaddr = cpuid_maxphyaddr(vcpu);
2579 
2580 	/* Check for memory type validity */
2581 	switch (address & VMX_EPTP_MT_MASK) {
2582 	case VMX_EPTP_MT_UC:
2583 		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
2584 			return false;
2585 		break;
2586 	case VMX_EPTP_MT_WB:
2587 		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
2588 			return false;
2589 		break;
2590 	default:
2591 		return false;
2592 	}
2593 
2594 	/* only 4 levels page-walk length are valid */
2595 	if (CC((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4))
2596 		return false;
2597 
2598 	/* Reserved bits should not be set */
2599 	if (CC(address >> maxphyaddr || ((address >> 7) & 0x1f)))
2600 		return false;
2601 
2602 	/* AD, if set, should be supported */
2603 	if (address & VMX_EPTP_AD_ENABLE_BIT) {
2604 		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
2605 			return false;
2606 	}
2607 
2608 	return true;
2609 }
2610 
2611 /*
2612  * Checks related to VM-Execution Control Fields
2613  */
2614 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
2615                                               struct vmcs12 *vmcs12)
2616 {
2617 	struct vcpu_vmx *vmx = to_vmx(vcpu);
2618 
2619 	if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
2620 				   vmx->nested.msrs.pinbased_ctls_low,
2621 				   vmx->nested.msrs.pinbased_ctls_high)) ||
2622 	    CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
2623 				   vmx->nested.msrs.procbased_ctls_low,
2624 				   vmx->nested.msrs.procbased_ctls_high)))
2625 		return -EINVAL;
2626 
2627 	if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
2628 	    CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control,
2629 				   vmx->nested.msrs.secondary_ctls_low,
2630 				   vmx->nested.msrs.secondary_ctls_high)))
2631 		return -EINVAL;
2632 
2633 	if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) ||
2634 	    nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
2635 	    nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
2636 	    nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
2637 	    nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
2638 	    nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
2639 	    nested_vmx_check_nmi_controls(vmcs12) ||
2640 	    nested_vmx_check_pml_controls(vcpu, vmcs12) ||
2641 	    nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
2642 	    nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
2643 	    nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
2644 	    CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
2645 		return -EINVAL;
2646 
2647 	if (!nested_cpu_has_preemption_timer(vmcs12) &&
2648 	    nested_cpu_has_save_preemption_timer(vmcs12))
2649 		return -EINVAL;
2650 
2651 	if (nested_cpu_has_ept(vmcs12) &&
2652 	    CC(!valid_ept_address(vcpu, vmcs12->ept_pointer)))
2653 		return -EINVAL;
2654 
2655 	if (nested_cpu_has_vmfunc(vmcs12)) {
2656 		if (CC(vmcs12->vm_function_control &
2657 		       ~vmx->nested.msrs.vmfunc_controls))
2658 			return -EINVAL;
2659 
2660 		if (nested_cpu_has_eptp_switching(vmcs12)) {
2661 			if (CC(!nested_cpu_has_ept(vmcs12)) ||
2662 			    CC(!page_address_valid(vcpu, vmcs12->eptp_list_address)))
2663 				return -EINVAL;
2664 		}
2665 	}
2666 
2667 	return 0;
2668 }
2669 
2670 /*
2671  * Checks related to VM-Exit Control Fields
2672  */
2673 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
2674                                          struct vmcs12 *vmcs12)
2675 {
2676 	struct vcpu_vmx *vmx = to_vmx(vcpu);
2677 
2678 	if (CC(!vmx_control_verify(vmcs12->vm_exit_controls,
2679 				    vmx->nested.msrs.exit_ctls_low,
2680 				    vmx->nested.msrs.exit_ctls_high)) ||
2681 	    CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12)))
2682 		return -EINVAL;
2683 
2684 	return 0;
2685 }
2686 
2687 /*
2688  * Checks related to VM-Entry Control Fields
2689  */
2690 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
2691 					  struct vmcs12 *vmcs12)
2692 {
2693 	struct vcpu_vmx *vmx = to_vmx(vcpu);
2694 
2695 	if (CC(!vmx_control_verify(vmcs12->vm_entry_controls,
2696 				    vmx->nested.msrs.entry_ctls_low,
2697 				    vmx->nested.msrs.entry_ctls_high)))
2698 		return -EINVAL;
2699 
2700 	/*
2701 	 * From the Intel SDM, volume 3:
2702 	 * Fields relevant to VM-entry event injection must be set properly.
2703 	 * These fields are the VM-entry interruption-information field, the
2704 	 * VM-entry exception error code, and the VM-entry instruction length.
2705 	 */
2706 	if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
2707 		u32 intr_info = vmcs12->vm_entry_intr_info_field;
2708 		u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
2709 		u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
2710 		bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
2711 		bool should_have_error_code;
2712 		bool urg = nested_cpu_has2(vmcs12,
2713 					   SECONDARY_EXEC_UNRESTRICTED_GUEST);
2714 		bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
2715 
2716 		/* VM-entry interruption-info field: interruption type */
2717 		if (CC(intr_type == INTR_TYPE_RESERVED) ||
2718 		    CC(intr_type == INTR_TYPE_OTHER_EVENT &&
2719 		       !nested_cpu_supports_monitor_trap_flag(vcpu)))
2720 			return -EINVAL;
2721 
2722 		/* VM-entry interruption-info field: vector */
2723 		if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
2724 		    CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
2725 		    CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
2726 			return -EINVAL;
2727 
2728 		/* VM-entry interruption-info field: deliver error code */
2729 		should_have_error_code =
2730 			intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
2731 			x86_exception_has_error_code(vector);
2732 		if (CC(has_error_code != should_have_error_code))
2733 			return -EINVAL;
2734 
2735 		/* VM-entry exception error code */
2736 		if (CC(has_error_code &&
2737 		       vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
2738 			return -EINVAL;
2739 
2740 		/* VM-entry interruption-info field: reserved bits */
2741 		if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK))
2742 			return -EINVAL;
2743 
2744 		/* VM-entry instruction length */
2745 		switch (intr_type) {
2746 		case INTR_TYPE_SOFT_EXCEPTION:
2747 		case INTR_TYPE_SOFT_INTR:
2748 		case INTR_TYPE_PRIV_SW_EXCEPTION:
2749 			if (CC(vmcs12->vm_entry_instruction_len > 15) ||
2750 			    CC(vmcs12->vm_entry_instruction_len == 0 &&
2751 			    CC(!nested_cpu_has_zero_length_injection(vcpu))))
2752 				return -EINVAL;
2753 		}
2754 	}
2755 
2756 	if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
2757 		return -EINVAL;
2758 
2759 	return 0;
2760 }
2761 
2762 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
2763 				     struct vmcs12 *vmcs12)
2764 {
2765 	if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
2766 	    nested_check_vm_exit_controls(vcpu, vmcs12) ||
2767 	    nested_check_vm_entry_controls(vcpu, vmcs12))
2768 		return -EINVAL;
2769 
2770 	return 0;
2771 }
2772 
2773 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
2774 				       struct vmcs12 *vmcs12)
2775 {
2776 	bool ia32e;
2777 
2778 	if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) ||
2779 	    CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) ||
2780 	    CC(!nested_cr3_valid(vcpu, vmcs12->host_cr3)))
2781 		return -EINVAL;
2782 
2783 	if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) ||
2784 	    CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu)))
2785 		return -EINVAL;
2786 
2787 	if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
2788 	    CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
2789 		return -EINVAL;
2790 
2791 	if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2792 	    CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
2793 					   vmcs12->host_ia32_perf_global_ctrl)))
2794 		return -EINVAL;
2795 
2796 #ifdef CONFIG_X86_64
2797 	ia32e = !!(vcpu->arch.efer & EFER_LMA);
2798 #else
2799 	ia32e = false;
2800 #endif
2801 
2802 	if (ia32e) {
2803 		if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) ||
2804 		    CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
2805 			return -EINVAL;
2806 	} else {
2807 		if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) ||
2808 		    CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
2809 		    CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
2810 		    CC((vmcs12->host_rip) >> 32))
2811 			return -EINVAL;
2812 	}
2813 
2814 	if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2815 	    CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2816 	    CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2817 	    CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2818 	    CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2819 	    CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2820 	    CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) ||
2821 	    CC(vmcs12->host_cs_selector == 0) ||
2822 	    CC(vmcs12->host_tr_selector == 0) ||
2823 	    CC(vmcs12->host_ss_selector == 0 && !ia32e))
2824 		return -EINVAL;
2825 
2826 #ifdef CONFIG_X86_64
2827 	if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
2828 	    CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
2829 	    CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
2830 	    CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) ||
2831 	    CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
2832 	    CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
2833 		return -EINVAL;
2834 #endif
2835 
2836 	/*
2837 	 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
2838 	 * IA32_EFER MSR must be 0 in the field for that register. In addition,
2839 	 * the values of the LMA and LME bits in the field must each be that of
2840 	 * the host address-space size VM-exit control.
2841 	 */
2842 	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
2843 		if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) ||
2844 		    CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) ||
2845 		    CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)))
2846 			return -EINVAL;
2847 	}
2848 
2849 	return 0;
2850 }
2851 
2852 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
2853 					  struct vmcs12 *vmcs12)
2854 {
2855 	int r = 0;
2856 	struct vmcs12 *shadow;
2857 	struct kvm_host_map map;
2858 
2859 	if (vmcs12->vmcs_link_pointer == -1ull)
2860 		return 0;
2861 
2862 	if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer)))
2863 		return -EINVAL;
2864 
2865 	if (CC(kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map)))
2866 		return -EINVAL;
2867 
2868 	shadow = map.hva;
2869 
2870 	if (CC(shadow->hdr.revision_id != VMCS12_REVISION) ||
2871 	    CC(shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12)))
2872 		r = -EINVAL;
2873 
2874 	kvm_vcpu_unmap(vcpu, &map, false);
2875 	return r;
2876 }
2877 
2878 /*
2879  * Checks related to Guest Non-register State
2880  */
2881 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
2882 {
2883 	if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
2884 	       vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT))
2885 		return -EINVAL;
2886 
2887 	return 0;
2888 }
2889 
2890 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
2891 					struct vmcs12 *vmcs12,
2892 					u32 *exit_qual)
2893 {
2894 	bool ia32e;
2895 
2896 	*exit_qual = ENTRY_FAIL_DEFAULT;
2897 
2898 	if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
2899 	    CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
2900 		return -EINVAL;
2901 
2902 	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
2903 	    CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
2904 		return -EINVAL;
2905 
2906 	if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
2907 		*exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
2908 		return -EINVAL;
2909 	}
2910 
2911 	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
2912 	    CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
2913 					   vmcs12->guest_ia32_perf_global_ctrl)))
2914 		return -EINVAL;
2915 
2916 	/*
2917 	 * If the load IA32_EFER VM-entry control is 1, the following checks
2918 	 * are performed on the field for the IA32_EFER MSR:
2919 	 * - Bits reserved in the IA32_EFER MSR must be 0.
2920 	 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
2921 	 *   the IA-32e mode guest VM-exit control. It must also be identical
2922 	 *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
2923 	 *   CR0.PG) is 1.
2924 	 */
2925 	if (to_vmx(vcpu)->nested.nested_run_pending &&
2926 	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
2927 		ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
2928 		if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) ||
2929 		    CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) ||
2930 		    CC(((vmcs12->guest_cr0 & X86_CR0_PG) &&
2931 		     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))))
2932 			return -EINVAL;
2933 	}
2934 
2935 	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
2936 	    (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) ||
2937 	     CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))))
2938 		return -EINVAL;
2939 
2940 	if (nested_check_guest_non_reg_state(vmcs12))
2941 		return -EINVAL;
2942 
2943 	return 0;
2944 }
2945 
/*
 * Optional early consistency check performed by hardware: attempt a VM-Enter
 * with a deliberately invalid GUEST_RFLAGS and look at how it fails.
 *
 * Does nothing and returns 0 unless the nested_early_check module parameter
 * is set.
 *
 * Returns 1 if the VM-Enter attempt resulted in a VM-Fail, 0 otherwise.
 */
static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	unsigned long cr3, cr4;
	bool vm_fail;

	if (!nested_early_check)
		return 0;

	/*
	 * Zero the MSR autoload counts for the duration of the check; they are
	 * restored from msr_autoload right after the VM-Enter attempt below.
	 */
	if (vmx->msr_autoload.host.nr)
		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
	if (vmx->msr_autoload.guest.nr)
		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);

	preempt_disable();

	vmx_prepare_switch_to_guest(vcpu);

	/*
	 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
	 * which is reserved to '1' by hardware.  GUEST_RFLAGS is guaranteed to
	 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
	 * there is no need to preserve other bits or save/restore the field.
	 */
	vmcs_writel(GUEST_RFLAGS, 0);

	/* Refresh HOST_CR3/HOST_CR4 only if they differ from the cached copy. */
	cr3 = __get_current_cr3_fast();
	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
		vmcs_writel(HOST_CR3, cr3);
		vmx->loaded_vmcs->host_state.cr3 = cr3;
	}

	cr4 = cr4_read_shadow();
	if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
		vmcs_writel(HOST_CR4, cr4);
		vmx->loaded_vmcs->host_state.cr4 = cr4;
	}

	/*
	 * Update HOST_RSP (and its cached copy) if the stack pointer changed,
	 * then call vmx_vmenter with the flags from the 'launched' comparison.
	 */
	asm(
		"sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
		"cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
		"je 1f \n\t"
		__ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
		"mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
		"1: \n\t"
		"add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */

		/* Check if vmlaunch or vmresume is needed */
		"cmpb $0, %c[launched](%[loaded_vmcs])\n\t"

		/*
		 * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
		 * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
		 * Valid.  vmx_vmenter() directly "returns" RFLAGS, and so the
		 * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
		 */
		"call vmx_vmenter\n\t"

		CC_SET(be)
	      : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail)
	      :	[HOST_RSP]"r"((unsigned long)HOST_RSP),
		[loaded_vmcs]"r"(vmx->loaded_vmcs),
		[launched]"i"(offsetof(struct loaded_vmcs, launched)),
		[host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
		[wordsize]"i"(sizeof(ulong))
	      : "memory"
	);

	/* Restore the MSR autoload counts zeroed above. */
	if (vmx->msr_autoload.host.nr)
		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
	if (vmx->msr_autoload.guest.nr)
		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);

	if (vm_fail) {
		u32 error = vmcs_read32(VM_INSTRUCTION_ERROR);

		preempt_enable();

		/* Any VM-Fail other than an invalid control field is unexpected. */
		trace_kvm_nested_vmenter_failed(
			"early hardware check VM-instruction error: ", error);
		WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD);
		return 1;
	}

	/*
	 * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
	 */
	local_irq_enable();
	if (hw_breakpoint_active())
		set_debugreg(__this_cpu_read(cpu_dr7), 7);
	preempt_enable();

	/*
	 * A non-failing VMEntry means we somehow entered guest mode with
	 * an illegal RIP, and that's just the tip of the iceberg.  There
	 * is no telling what memory has been modified or what state has
	 * been exposed to unknown code.  Hitting this all but guarantees
	 * a (very critical) hardware issue.
	 */
	WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
		VMX_EXIT_REASONS_FAILED_VMENTRY));

	return 0;
}
3050 
3051 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
3052 						 struct vmcs12 *vmcs12);
3053 
3054 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
3055 {
3056 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3057 	struct vcpu_vmx *vmx = to_vmx(vcpu);
3058 	struct kvm_host_map *map;
3059 	struct page *page;
3060 	u64 hpa;
3061 
3062 	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3063 		/*
3064 		 * Translate L1 physical address to host physical
3065 		 * address for vmcs02. Keep the page pinned, so this
3066 		 * physical address remains valid. We keep a reference
3067 		 * to it so we can release it later.
3068 		 */
3069 		if (vmx->nested.apic_access_page) { /* shouldn't happen */
3070 			kvm_release_page_clean(vmx->nested.apic_access_page);
3071 			vmx->nested.apic_access_page = NULL;
3072 		}
3073 		page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
3074 		if (!is_error_page(page)) {
3075 			vmx->nested.apic_access_page = page;
3076 			hpa = page_to_phys(vmx->nested.apic_access_page);
3077 			vmcs_write64(APIC_ACCESS_ADDR, hpa);
3078 		} else {
3079 			pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n",
3080 					     __func__);
3081 			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3082 			vcpu->run->internal.suberror =
3083 				KVM_INTERNAL_ERROR_EMULATION;
3084 			vcpu->run->internal.ndata = 0;
3085 			return false;
3086 		}
3087 	}
3088 
3089 	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3090 		map = &vmx->nested.virtual_apic_map;
3091 
3092 		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
3093 			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
3094 		} else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
3095 		           nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
3096 			   !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3097 			/*
3098 			 * The processor will never use the TPR shadow, simply
3099 			 * clear the bit from the execution control.  Such a
3100 			 * configuration is useless, but it happens in tests.
3101 			 * For any other configuration, failing the vm entry is
3102 			 * _not_ what the processor does but it's basically the
3103 			 * only possibility we have.
3104 			 */
3105 			exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW);
3106 		} else {
3107 			/*
3108 			 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to
3109 			 * force VM-Entry to fail.
3110 			 */
3111 			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
3112 		}
3113 	}
3114 
3115 	if (nested_cpu_has_posted_intr(vmcs12)) {
3116 		map = &vmx->nested.pi_desc_map;
3117 
3118 		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
3119 			vmx->nested.pi_desc =
3120 				(struct pi_desc *)(((void *)map->hva) +
3121 				offset_in_page(vmcs12->posted_intr_desc_addr));
3122 			vmcs_write64(POSTED_INTR_DESC_ADDR,
3123 				     pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
3124 		}
3125 	}
3126 	if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
3127 		exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
3128 	else
3129 		exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
3130 	return true;
3131 }
3132 
3133 /*
3134  * Intel's VMX Instruction Reference specifies a common set of prerequisites
3135  * for running VMX instructions (except VMXON, whose prerequisites are
3136  * slightly different). It also specifies what exception to inject otherwise.
3137  * Note that many of these exceptions have priority over VM exits, so they
3138  * don't have to be checked again here.
3139  */
3140 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
3141 {
3142 	if (!to_vmx(vcpu)->nested.vmxon) {
3143 		kvm_queue_exception(vcpu, UD_VECTOR);
3144 		return 0;
3145 	}
3146 
3147 	if (vmx_get_cpl(vcpu)) {
3148 		kvm_inject_gp(vcpu, 0);
3149 		return 0;
3150 	}
3151 
3152 	return 1;
3153 }
3154 
3155 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
3156 {
3157 	u8 rvi = vmx_get_rvi();
3158 	u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
3159 
3160 	return ((rvi & 0xf0) > (vppr & 0xf0));
3161 }
3162 
3163 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3164 				   struct vmcs12 *vmcs12);
3165 
3166 /*
3167  * If from_vmentry is false, this is being called from state restore (either RSM
3168  * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
3169  *
3170  * Returns:
3171  *	NVMX_ENTRY_SUCCESS: Entered VMX non-root mode
3172  *	NVMX_ENTRY_VMFAIL:  Consistency check VMFail
3173  *	NVMX_ENTRY_VMEXIT:  Consistency check VMExit
3174  *	NVMX_ENTRY_KVM_INTERNAL_ERROR: KVM internal error
3175  */
3176 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
3177 							bool from_vmentry)
3178 {
3179 	struct vcpu_vmx *vmx = to_vmx(vcpu);
3180 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3181 	bool evaluate_pending_interrupts;
3182 	u32 exit_reason = EXIT_REASON_INVALID_STATE;
3183 	u32 exit_qual;
3184 
3185 	evaluate_pending_interrupts = exec_controls_get(vmx) &
3186 		(CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
3187 	if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
3188 		evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
3189 
3190 	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
3191 		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
3192 	if (kvm_mpx_supported() &&
3193 		!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
3194 		vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3195 
3196 	/*
3197 	 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and*
3198 	 * nested early checks are disabled.  In the event of a "late" VM-Fail,
3199 	 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its
3200 	 * software model to the pre-VMEntry host state.  When EPT is disabled,
3201 	 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes
3202 	 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3.  Stuffing
3203 	 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to
3204 	 * the correct value.  Smashing vmcs01.GUEST_CR3 is safe because nested
3205 	 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is
3206 	 * guaranteed to be overwritten with a shadow CR3 prior to re-entering
3207 	 * L1.  Don't stuff vmcs01.GUEST_CR3 when using nested early checks as
3208 	 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks
3209 	 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail
3210 	 * path would need to manually save/restore vmcs01.GUEST_CR3.
3211 	 */
3212 	if (!enable_ept && !nested_early_check)
3213 		vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3214 
3215 	vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
3216 
3217 	prepare_vmcs02_early(vmx, vmcs12);
3218 
3219 	if (from_vmentry) {
3220 		if (unlikely(!nested_get_vmcs12_pages(vcpu)))
3221 			return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
3222 
3223 		if (nested_vmx_check_vmentry_hw(vcpu)) {
3224 			vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3225 			return NVMX_VMENTRY_VMFAIL;
3226 		}
3227 
3228 		if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
3229 			goto vmentry_fail_vmexit;
3230 	}
3231 
3232 	enter_guest_mode(vcpu);
3233 	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
3234 		vcpu->arch.tsc_offset += vmcs12->tsc_offset;
3235 
3236 	if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
3237 		goto vmentry_fail_vmexit_guest_mode;
3238 
3239 	if (from_vmentry) {
3240 		exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
3241 		exit_qual = nested_vmx_load_msr(vcpu,
3242 						vmcs12->vm_entry_msr_load_addr,
3243 						vmcs12->vm_entry_msr_load_count);
3244 		if (exit_qual)
3245 			goto vmentry_fail_vmexit_guest_mode;
3246 	} else {
3247 		/*
3248 		 * The MMU is not initialized to point at the right entities yet and
3249 		 * "get pages" would need to read data from the guest (i.e. we will
3250 		 * need to perform gpa to hpa translation). Request a call
3251 		 * to nested_get_vmcs12_pages before the next VM-entry.  The MSRs
3252 		 * have already been set at vmentry time and should not be reset.
3253 		 */
3254 		kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
3255 	}
3256 
3257 	/*
3258 	 * If L1 had a pending IRQ/NMI until it executed
3259 	 * VMLAUNCH/VMRESUME which wasn't delivered because it was
3260 	 * disallowed (e.g. interrupts disabled), L0 needs to
3261 	 * evaluate if this pending event should cause an exit from L2
3262 	 * to L1 or delivered directly to L2 (e.g. In case L1 don't
3263 	 * intercept EXTERNAL_INTERRUPT).
3264 	 *
3265 	 * Usually this would be handled by the processor noticing an
3266 	 * IRQ/NMI window request, or checking RVI during evaluation of
3267 	 * pending virtual interrupts.  However, this setting was done
3268 	 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
3269 	 * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
3270 	 */
3271 	if (unlikely(evaluate_pending_interrupts))
3272 		kvm_make_request(KVM_REQ_EVENT, vcpu);
3273 
3274 	/*
3275 	 * Do not start the preemption timer hrtimer until after we know
3276 	 * we are successful, so that only nested_vmx_vmexit needs to cancel
3277 	 * the timer.
3278 	 */
3279 	vmx->nested.preemption_timer_expired = false;
3280 	if (nested_cpu_has_preemption_timer(vmcs12))
3281 		vmx_start_preemption_timer(vcpu);
3282 
3283 	/*
3284 	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
3285 	 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
3286 	 * returned as far as L1 is concerned. It will only return (and set
3287 	 * the success flag) when L2 exits (see nested_vmx_vmexit()).
3288 	 */
3289 	return NVMX_VMENTRY_SUCCESS;
3290 
3291 	/*
3292 	 * A failed consistency check that leads to a VMExit during L1's
3293 	 * VMEnter to L2 is a variation of a normal VMexit, as explained in
3294 	 * 26.7 "VM-entry failures during or after loading guest state".
3295 	 */
3296 vmentry_fail_vmexit_guest_mode:
3297 	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
3298 		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
3299 	leave_guest_mode(vcpu);
3300 
3301 vmentry_fail_vmexit:
3302 	vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3303 
3304 	if (!from_vmentry)
3305 		return NVMX_VMENTRY_VMEXIT;
3306 
3307 	load_vmcs12_host_state(vcpu, vmcs12);
3308 	vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
3309 	vmcs12->exit_qualification = exit_qual;
3310 	if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
3311 		vmx->nested.need_vmcs12_to_shadow_sync = true;
3312 	return NVMX_VMENTRY_VMEXIT;
3313 }
3314 
3315 /*
3316  * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
3317  * for running an L2 nested guest.
3318  */
3319 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
3320 {
3321 	struct vmcs12 *vmcs12;
3322 	enum nvmx_vmentry_status status;
3323 	struct vcpu_vmx *vmx = to_vmx(vcpu);
3324 	u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
3325 
3326 	if (!nested_vmx_check_permission(vcpu))
3327 		return 1;
3328 
3329 	if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch))
3330 		return 1;
3331 
3332 	if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
3333 		return nested_vmx_failInvalid(vcpu);
3334 
3335 	vmcs12 = get_vmcs12(vcpu);
3336 
3337 	/*
3338 	 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
3339 	 * that there *is* a valid VMCS pointer, RFLAGS.CF is set
3340 	 * rather than RFLAGS.ZF, and no error number is stored to the
3341 	 * VM-instruction error field.
3342 	 */
3343 	if (vmcs12->hdr.shadow_vmcs)
3344 		return nested_vmx_failInvalid(vcpu);
3345 
3346 	if (vmx->nested.hv_evmcs) {
3347 		copy_enlightened_to_vmcs12(vmx);
3348 		/* Enlightened VMCS doesn't have launch state */
3349 		vmcs12->launch_state = !launch;
3350 	} else if (enable_shadow_vmcs) {
3351 		copy_shadow_to_vmcs12(vmx);
3352 	}
3353 
3354 	/*
3355 	 * The nested entry process starts with enforcing various prerequisites
3356 	 * on vmcs12 as required by the Intel SDM, and act appropriately when
3357 	 * they fail: As the SDM explains, some conditions should cause the
3358 	 * instruction to fail, while others will cause the instruction to seem
3359 	 * to succeed, but return an EXIT_REASON_INVALID_STATE.
3360 	 * To speed up the normal (success) code path, we should avoid checking
3361 	 * for misconfigurations which will anyway be caught by the processor
3362 	 * when using the merged vmcs02.
3363 	 */
3364 	if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
3365 		return nested_vmx_failValid(vcpu,
3366 			VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
3367 
3368 	if (vmcs12->launch_state == launch)
3369 		return nested_vmx_failValid(vcpu,
3370 			launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
3371 			       : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
3372 
3373 	if (nested_vmx_check_controls(vcpu, vmcs12))
3374 		return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3375 
3376 	if (nested_vmx_check_host_state(vcpu, vmcs12))
3377 		return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3378 
3379 	/*
3380 	 * We're finally done with prerequisite checking, and can start with
3381 	 * the nested entry.
3382 	 */
3383 	vmx->nested.nested_run_pending = 1;
3384 	status = nested_vmx_enter_non_root_mode(vcpu, true);
3385 	if (unlikely(status != NVMX_VMENTRY_SUCCESS))
3386 		goto vmentry_failed;
3387 
3388 	/* Hide L1D cache contents from the nested guest.  */
3389 	vmx->vcpu.arch.l1tf_flush_l1d = true;
3390 
3391 	/*
3392 	 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
3393 	 * also be used as part of restoring nVMX state for
3394 	 * snapshot restore (migration).
3395 	 *
3396 	 * In this flow, it is assumed that vmcs12 cache was
3397 	 * trasferred as part of captured nVMX state and should
3398 	 * therefore not be read from guest memory (which may not
3399 	 * exist on destination host yet).
3400 	 */
3401 	nested_cache_shadow_vmcs12(vcpu, vmcs12);
3402 
3403 	/*
3404 	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be
3405 	 * awakened by event injection or by an NMI-window VM-exit or
3406 	 * by an interrupt-window VM-exit, halt the vcpu.
3407 	 */
3408 	if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
3409 	    !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
3410 	    !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) &&
3411 	    !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) &&
3412 	      (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
3413 		vmx->nested.nested_run_pending = 0;
3414 		return kvm_vcpu_halt(vcpu);
3415 	}
3416 	return 1;
3417 
3418 vmentry_failed:
3419 	vmx->nested.nested_run_pending = 0;
3420 	if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
3421 		return 0;
3422 	if (status == NVMX_VMENTRY_VMEXIT)
3423 		return 1;
3424 	WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
3425 	return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3426 }
3427 
3428 /*
3429  * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
3430  * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK).
3431  * This function returns the new value we should put in vmcs12.guest_cr0.
3432  * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
3433  *  1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
3434  *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
3435  *     didn't trap the bit, because if L1 did, so would L0).
3436  *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
3437  *     been modified by L2, and L1 knows it. So just leave the old value of
3438  *     the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
3439  *     isn't relevant, because if L0 traps this bit it can set it to anything.
3440  *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
3441  *     changed these bits, and therefore they need to be updated, but L0
3442  *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
3443  *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
3444  */
3445 static inline unsigned long
3446 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3447 {
3448 	return
3449 	/*1*/	(vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
3450 	/*2*/	(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
3451 	/*3*/	(vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
3452 			vcpu->arch.cr0_guest_owned_bits));
3453 }
3454 
3455 static inline unsigned long
3456 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3457 {
3458 	return
3459 	/*1*/	(vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
3460 	/*2*/	(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
3461 	/*3*/	(vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
3462 			vcpu->arch.cr4_guest_owned_bits));
3463 }
3464 
3465 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
3466 				      struct vmcs12 *vmcs12)
3467 {
3468 	u32 idt_vectoring;
3469 	unsigned int nr;
3470 
3471 	if (vcpu->arch.exception.injected) {
3472 		nr = vcpu->arch.exception.nr;
3473 		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3474 
3475 		if (kvm_exception_is_soft(nr)) {
3476 			vmcs12->vm_exit_instruction_len =
3477 				vcpu->arch.event_exit_inst_len;
3478 			idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
3479 		} else
3480 			idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
3481 
3482 		if (vcpu->arch.exception.has_error_code) {
3483 			idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
3484 			vmcs12->idt_vectoring_error_code =
3485 				vcpu->arch.exception.error_code;
3486 		}
3487 
3488 		vmcs12->idt_vectoring_info_field = idt_vectoring;
3489 	} else if (vcpu->arch.nmi_injected) {
3490 		vmcs12->idt_vectoring_info_field =
3491 			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
3492 	} else if (vcpu->arch.interrupt.injected) {
3493 		nr = vcpu->arch.interrupt.nr;
3494 		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3495 
3496 		if (vcpu->arch.interrupt.soft) {
3497 			idt_vectoring |= INTR_TYPE_SOFT_INTR;
3498 			vmcs12->vm_entry_instruction_len =
3499 				vcpu->arch.event_exit_inst_len;
3500 		} else
3501 			idt_vectoring |= INTR_TYPE_EXT_INTR;
3502 
3503 		vmcs12->idt_vectoring_info_field = idt_vectoring;
3504 	}
3505 }
3506 
3507 
3508 static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
3509 {
3510 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3511 	gfn_t gfn;
3512 
3513 	/*
3514 	 * Don't need to mark the APIC access page dirty; it is never
3515 	 * written to by the CPU during APIC virtualization.
3516 	 */
3517 
3518 	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3519 		gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
3520 		kvm_vcpu_mark_page_dirty(vcpu, gfn);
3521 	}
3522 
3523 	if (nested_cpu_has_posted_intr(vmcs12)) {
3524 		gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
3525 		kvm_vcpu_mark_page_dirty(vcpu, gfn);
3526 	}
3527 }
3528 
3529 static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
3530 {
3531 	struct vcpu_vmx *vmx = to_vmx(vcpu);
3532 	int max_irr;
3533 	void *vapic_page;
3534 	u16 status;
3535 
3536 	if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
3537 		return;
3538 
3539 	vmx->nested.pi_pending = false;
3540 	if (!pi_test_and_clear_on(vmx->nested.pi_desc))
3541 		return;
3542 
3543 	max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
3544 	if (max_irr != 256) {
3545 		vapic_page = vmx->nested.virtual_apic_map.hva;
3546 		if (!vapic_page)
3547 			return;
3548 
3549 		__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
3550 			vapic_page, &max_irr);
3551 		status = vmcs_read16(GUEST_INTR_STATUS);
3552 		if ((u8)max_irr > ((u8)status & 0xff)) {
3553 			status &= ~0xff;
3554 			status |= (u8)max_irr;
3555 			vmcs_write16(GUEST_INTR_STATUS, status);
3556 		}
3557 	}
3558 
3559 	nested_mark_vmcs12_pages_dirty(vcpu);
3560 }
3561 
3562 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
3563 					       unsigned long exit_qual)
3564 {
3565 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3566 	unsigned int nr = vcpu->arch.exception.nr;
3567 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
3568 
3569 	if (vcpu->arch.exception.has_error_code) {
3570 		vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
3571 		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3572 	}
3573 
3574 	if (kvm_exception_is_soft(nr))
3575 		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3576 	else
3577 		intr_info |= INTR_TYPE_HARD_EXCEPTION;
3578 
3579 	if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
3580 	    vmx_get_nmi_mask(vcpu))
3581 		intr_info |= INTR_INFO_UNBLOCK_NMI;
3582 
3583 	nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
3584 }
3585 
3586 static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
3587 {
3588 	struct vcpu_vmx *vmx = to_vmx(vcpu);
3589 	unsigned long exit_qual;
3590 	bool block_nested_events =
3591 	    vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
3592 	struct kvm_lapic *apic = vcpu->arch.apic;
3593 
3594 	if (lapic_in_kernel(vcpu) &&
3595 		test_bit(KVM_APIC_INIT, &apic->pending_events)) {
3596 		if (block_nested_events)
3597 			return -EBUSY;
3598 		clear_bit(KVM_APIC_INIT, &apic->pending_events);
3599 		nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
3600 		return 0;
3601 	}
3602 
3603 	if (vcpu->arch.exception.pending &&
3604 		nested_vmx_check_exception(vcpu, &exit_qual)) {
3605 		if (block_nested_events)
3606 			return -EBUSY;
3607 		nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
3608 		return 0;
3609 	}
3610 
3611 	if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
3612 	    vmx->nested.preemption_timer_expired) {
3613 		if (block_nested_events)
3614 			return -EBUSY;
3615 		nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
3616 		return 0;
3617 	}
3618 
3619 	if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
3620 		if (block_nested_events)
3621 			return -EBUSY;
3622 		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
3623 				  NMI_VECTOR | INTR_TYPE_NMI_INTR |
3624 				  INTR_INFO_VALID_MASK, 0);
3625 		/*
3626 		 * The NMI-triggered VM exit counts as injection:
3627 		 * clear this one and block further NMIs.
3628 		 */
3629 		vcpu->arch.nmi_pending = 0;
3630 		vmx_set_nmi_mask(vcpu, true);
3631 		return 0;
3632 	}
3633 
3634 	if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
3635 	    nested_exit_on_intr(vcpu)) {
3636 		if (block_nested_events)
3637 			return -EBUSY;
3638 		nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
3639 		return 0;
3640 	}
3641 
3642 	vmx_complete_nested_posted_interrupt(vcpu);
3643 	return 0;
3644 }
3645 
3646 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
3647 {
3648 	ktime_t remaining =
3649 		hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
3650 	u64 value;
3651 
3652 	if (ktime_to_ns(remaining) <= 0)
3653 		return 0;
3654 
3655 	value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
3656 	do_div(value, 1000000);
3657 	return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
3658 }
3659 
3660 static bool is_vmcs12_ext_field(unsigned long field)
3661 {
3662 	switch (field) {
3663 	case GUEST_ES_SELECTOR:
3664 	case GUEST_CS_SELECTOR:
3665 	case GUEST_SS_SELECTOR:
3666 	case GUEST_DS_SELECTOR:
3667 	case GUEST_FS_SELECTOR:
3668 	case GUEST_GS_SELECTOR:
3669 	case GUEST_LDTR_SELECTOR:
3670 	case GUEST_TR_SELECTOR:
3671 	case GUEST_ES_LIMIT:
3672 	case GUEST_CS_LIMIT:
3673 	case GUEST_SS_LIMIT:
3674 	case GUEST_DS_LIMIT:
3675 	case GUEST_FS_LIMIT:
3676 	case GUEST_GS_LIMIT:
3677 	case GUEST_LDTR_LIMIT:
3678 	case GUEST_TR_LIMIT:
3679 	case GUEST_GDTR_LIMIT:
3680 	case GUEST_IDTR_LIMIT:
3681 	case GUEST_ES_AR_BYTES:
3682 	case GUEST_DS_AR_BYTES:
3683 	case GUEST_FS_AR_BYTES:
3684 	case GUEST_GS_AR_BYTES:
3685 	case GUEST_LDTR_AR_BYTES:
3686 	case GUEST_TR_AR_BYTES:
3687 	case GUEST_ES_BASE:
3688 	case GUEST_CS_BASE:
3689 	case GUEST_SS_BASE:
3690 	case GUEST_DS_BASE:
3691 	case GUEST_FS_BASE:
3692 	case GUEST_GS_BASE:
3693 	case GUEST_LDTR_BASE:
3694 	case GUEST_TR_BASE:
3695 	case GUEST_GDTR_BASE:
3696 	case GUEST_IDTR_BASE:
3697 	case GUEST_PENDING_DBG_EXCEPTIONS:
3698 	case GUEST_BNDCFGS:
3699 		return true;
3700 	default:
3701 		break;
3702 	}
3703 
3704 	return false;
3705 }
3706 
3707 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
3708 				       struct vmcs12 *vmcs12)
3709 {
3710 	struct vcpu_vmx *vmx = to_vmx(vcpu);
3711 
3712 	vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
3713 	vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
3714 	vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
3715 	vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
3716 	vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
3717 	vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
3718 	vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
3719 	vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
3720 	vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
3721 	vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
3722 	vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
3723 	vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
3724 	vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
3725 	vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
3726 	vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
3727 	vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
3728 	vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
3729 	vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
3730 	vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
3731 	vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
3732 	vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
3733 	vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
3734 	vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
3735 	vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
3736 	vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
3737 	vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
3738 	vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
3739 	vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
3740 	vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
3741 	vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
3742 	vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
3743 	vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
3744 	vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
3745 	vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
3746 	vmcs12->guest_pending_dbg_exceptions =
3747 		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
3748 	if (kvm_mpx_supported())
3749 		vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3750 
3751 	vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
3752 }
3753 
3754 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu,
3755 				       struct vmcs12 *vmcs12)
3756 {
3757 	struct vcpu_vmx *vmx = to_vmx(vcpu);
3758 	int cpu;
3759 
3760 	if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare)
3761 		return;
3762 
3763 
3764 	WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01);
3765 
3766 	cpu = get_cpu();
3767 	vmx->loaded_vmcs = &vmx->nested.vmcs02;
3768 	vmx_vcpu_load(&vmx->vcpu, cpu);
3769 
3770 	sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
3771 
3772 	vmx->loaded_vmcs = &vmx->vmcs01;
3773 	vmx_vcpu_load(&vmx->vcpu, cpu);
3774 	put_cpu();
3775 }
3776 
3777 /*
3778  * Update the guest state fields of vmcs12 to reflect changes that
3779  * occurred while L2 was running. (The "IA-32e mode guest" bit of the
3780  * VM-entry controls is also updated, since this is really a guest
3781  * state bit.)
3782  */
3783 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3784 {
3785 	struct vcpu_vmx *vmx = to_vmx(vcpu);
3786 
3787 	if (vmx->nested.hv_evmcs)
3788 		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
3789 
3790 	vmx->nested.need_sync_vmcs02_to_vmcs12_rare = !vmx->nested.hv_evmcs;
3791 
3792 	vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
3793 	vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
3794 
3795 	vmcs12->guest_rsp = kvm_rsp_read(vcpu);
3796 	vmcs12->guest_rip = kvm_rip_read(vcpu);
3797 	vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
3798 
3799 	vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
3800 	vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
3801 
3802 	vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
3803 	vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
3804 	vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
3805 
3806 	vmcs12->guest_interruptibility_info =
3807 		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3808 
3809 	if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
3810 		vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
3811 	else
3812 		vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
3813 
3814 	if (nested_cpu_has_preemption_timer(vmcs12) &&
3815 	    vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
3816 			vmcs12->vmx_preemption_timer_value =
3817 				vmx_get_preemption_timer_value(vcpu);
3818 
3819 	/*
3820 	 * In some cases (usually, nested EPT), L2 is allowed to change its
3821 	 * own CR3 without exiting. If it has changed it, we must keep it.
3822 	 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
3823 	 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
3824 	 *
3825 	 * Additionally, restore L2's PDPTR to vmcs12.
3826 	 */
3827 	if (enable_ept) {
3828 		vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
3829 		if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) {
3830 			vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
3831 			vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
3832 			vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
3833 			vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
3834 		}
3835 	}
3836 
3837 	vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
3838 
3839 	if (nested_cpu_has_vid(vmcs12))
3840 		vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
3841 
3842 	vmcs12->vm_entry_controls =
3843 		(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
3844 		(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
3845 
3846 	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS)
3847 		kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
3848 
3849 	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
3850 		vmcs12->guest_ia32_efer = vcpu->arch.efer;
3851 }
3852 
3853 /*
3854  * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
3855  * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
3856  * and this function updates it to reflect the changes to the guest state while
3857  * L2 was running (and perhaps made some exits which were handled directly by L0
3858  * without going back to L1), and to reflect the exit reason.
3859  * Note that we do not have to copy here all VMCS fields, just those that
3860  * could have changed by the L2 guest or the exit - i.e., the guest-state and
3861  * exit-information fields only. Other fields are modified by L1 with VMWRITE,
3862  * which already writes to vmcs12 directly.
3863  */
3864 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
3865 			   u32 exit_reason, u32 exit_intr_info,
3866 			   unsigned long exit_qualification)
3867 {
3868 	/* update exit information fields: */
3869 	vmcs12->vm_exit_reason = exit_reason;
3870 	vmcs12->exit_qualification = exit_qualification;
3871 	vmcs12->vm_exit_intr_info = exit_intr_info;
3872 
3873 	vmcs12->idt_vectoring_info_field = 0;
3874 	vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3875 	vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
3876 
3877 	if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
3878 		vmcs12->launch_state = 1;
3879 
3880 		/* vm_entry_intr_info_field is cleared on exit. Emulate this
3881 		 * instead of reading the real value. */
3882 		vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
3883 
3884 		/*
3885 		 * Transfer the event that L0 or L1 may wanted to inject into
3886 		 * L2 to IDT_VECTORING_INFO_FIELD.
3887 		 */
3888 		vmcs12_save_pending_event(vcpu, vmcs12);
3889 
3890 		/*
3891 		 * According to spec, there's no need to store the guest's
3892 		 * MSRs if the exit is due to a VM-entry failure that occurs
3893 		 * during or after loading the guest state. Since this exit
3894 		 * does not fall in that category, we need to save the MSRs.
3895 		 */
3896 		if (nested_vmx_store_msr(vcpu,
3897 					 vmcs12->vm_exit_msr_store_addr,
3898 					 vmcs12->vm_exit_msr_store_count))
3899 			nested_vmx_abort(vcpu,
3900 					 VMX_ABORT_SAVE_GUEST_MSR_FAIL);
3901 	}
3902 
3903 	/*
3904 	 * Drop what we picked up for L2 via vmx_complete_interrupts. It is
3905 	 * preserved above and would only end up incorrectly in L1.
3906 	 */
3907 	vcpu->arch.nmi_injected = false;
3908 	kvm_clear_exception_queue(vcpu);
3909 	kvm_clear_interrupt_queue(vcpu);
3910 }
3911 
3912 /*
3913  * A part of what we need to when the nested L2 guest exits and we want to
3914  * run its L1 parent, is to reset L1's guest state to the host state specified
3915  * in vmcs12.
3916  * This function is to be called not only on normal nested exit, but also on
3917  * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
3918  * Failures During or After Loading Guest State").
3919  * This function should be called when the active VMCS is L1's (vmcs01).
3920  */
3921 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3922 				   struct vmcs12 *vmcs12)
3923 {
3924 	struct kvm_segment seg;
3925 	u32 entry_failure_code;
3926 
3927 	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
3928 		vcpu->arch.efer = vmcs12->host_ia32_efer;
3929 	else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
3930 		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
3931 	else
3932 		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
3933 	vmx_set_efer(vcpu, vcpu->arch.efer);
3934 
3935 	kvm_rsp_write(vcpu, vmcs12->host_rsp);
3936 	kvm_rip_write(vcpu, vmcs12->host_rip);
3937 	vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
3938 	vmx_set_interrupt_shadow(vcpu, 0);
3939 
3940 	/*
3941 	 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
3942 	 * actually changed, because vmx_set_cr0 refers to efer set above.
3943 	 *
3944 	 * CR0_GUEST_HOST_MASK is already set in the original vmcs01
3945 	 * (KVM doesn't change it);
3946 	 */
3947 	vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
3948 	vmx_set_cr0(vcpu, vmcs12->host_cr0);
3949 
3950 	/* Same as above - no reason to call set_cr4_guest_host_mask().  */
3951 	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
3952 	vmx_set_cr4(vcpu, vmcs12->host_cr4);
3953 
3954 	nested_ept_uninit_mmu_context(vcpu);
3955 
3956 	/*
3957 	 * Only PDPTE load can fail as the value of cr3 was checked on entry and
3958 	 * couldn't have changed.
3959 	 */
3960 	if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
3961 		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
3962 
3963 	if (!enable_ept)
3964 		vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
3965 
3966 	/*
3967 	 * If vmcs01 doesn't use VPID, CPU flushes TLB on every
3968 	 * VMEntry/VMExit. Thus, no need to flush TLB.
3969 	 *
3970 	 * If vmcs12 doesn't use VPID, L1 expects TLB to be
3971 	 * flushed on every VMEntry/VMExit.
3972 	 *
3973 	 * Otherwise, we can preserve TLB entries as long as we are
3974 	 * able to tag L1 TLB entries differently than L2 TLB entries.
3975 	 *
3976 	 * If vmcs12 uses EPT, we need to execute this flush on EPTP01
3977 	 * and therefore we request the TLB flush to happen only after VMCS EPTP
3978 	 * has been set by KVM_REQ_LOAD_CR3.
3979 	 */
3980 	if (enable_vpid &&
3981 	    (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
3982 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3983 	}
3984 
3985 	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
3986 	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
3987 	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
3988 	vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
3989 	vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
3990 	vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
3991 	vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
3992 
3993 	/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */
3994 	if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
3995 		vmcs_write64(GUEST_BNDCFGS, 0);
3996 
3997 	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
3998 		vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
3999 		vcpu->arch.pat = vmcs12->host_ia32_pat;
4000 	}
4001 	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
4002 		SET_MSR_OR_WARN(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
4003 				vmcs12->host_ia32_perf_global_ctrl);
4004 
4005 	/* Set L1 segment info according to Intel SDM
4006 	    27.5.2 Loading Host Segment and Descriptor-Table Registers */
4007 	seg = (struct kvm_segment) {
4008 		.base = 0,
4009 		.limit = 0xFFFFFFFF,
4010 		.selector = vmcs12->host_cs_selector,
4011 		.type = 11,
4012 		.present = 1,
4013 		.s = 1,
4014 		.g = 1
4015 	};
4016 	if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
4017 		seg.l = 1;
4018 	else
4019 		seg.db = 1;
4020 	vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
4021 	seg = (struct kvm_segment) {
4022 		.base = 0,
4023 		.limit = 0xFFFFFFFF,
4024 		.type = 3,
4025 		.present = 1,
4026 		.s = 1,
4027 		.db = 1,
4028 		.g = 1
4029 	};
4030 	seg.selector = vmcs12->host_ds_selector;
4031 	vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
4032 	seg.selector = vmcs12->host_es_selector;
4033 	vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
4034 	seg.selector = vmcs12->host_ss_selector;
4035 	vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
4036 	seg.selector = vmcs12->host_fs_selector;
4037 	seg.base = vmcs12->host_fs_base;
4038 	vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
4039 	seg.selector = vmcs12->host_gs_selector;
4040 	seg.base = vmcs12->host_gs_base;
4041 	vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
4042 	seg = (struct kvm_segment) {
4043 		.base = vmcs12->host_tr_base,
4044 		.limit = 0x67,
4045 		.selector = vmcs12->host_tr_selector,
4046 		.type = 11,
4047 		.present = 1
4048 	};
4049 	vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
4050 
4051 	kvm_set_dr(vcpu, 7, 0x400);
4052 	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
4053 
4054 	if (cpu_has_vmx_msr_bitmap())
4055 		vmx_update_msr_bitmap(vcpu);
4056 
4057 	if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
4058 				vmcs12->vm_exit_msr_load_count))
4059 		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4060 }
4061 
4062 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
4063 {
4064 	struct shared_msr_entry *efer_msr;
4065 	unsigned int i;
4066 
4067 	if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
4068 		return vmcs_read64(GUEST_IA32_EFER);
4069 
4070 	if (cpu_has_load_ia32_efer())
4071 		return host_efer;
4072 
4073 	for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
4074 		if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
4075 			return vmx->msr_autoload.guest.val[i].value;
4076 	}
4077 
4078 	efer_msr = find_msr_entry(vmx, MSR_EFER);
4079 	if (efer_msr)
4080 		return efer_msr->data;
4081 
4082 	return host_efer;
4083 }
4084 
4085 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
4086 {
4087 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4088 	struct vcpu_vmx *vmx = to_vmx(vcpu);
4089 	struct vmx_msr_entry g, h;
4090 	gpa_t gpa;
4091 	u32 i, j;
4092 
4093 	vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
4094 
4095 	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
4096 		/*
4097 		 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
4098 		 * as vmcs01.GUEST_DR7 contains a userspace defined value
4099 		 * and vcpu->arch.dr7 is not squirreled away before the
4100 		 * nested VMENTER (not worth adding a variable in nested_vmx).
4101 		 */
4102 		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
4103 			kvm_set_dr(vcpu, 7, DR7_FIXED_1);
4104 		else
4105 			WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
4106 	}
4107 
4108 	/*
4109 	 * Note that calling vmx_set_{efer,cr0,cr4} is important as they
4110 	 * handle a variety of side effects to KVM's software model.
4111 	 */
4112 	vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
4113 
4114 	vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
4115 	vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
4116 
4117 	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
4118 	vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
4119 
4120 	nested_ept_uninit_mmu_context(vcpu);
4121 	vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
4122 	kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
4123 
4124 	/*
4125 	 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
4126 	 * from vmcs01 (if necessary).  The PDPTRs are not loaded on
4127 	 * VMFail, like everything else we just need to ensure our
4128 	 * software model is up-to-date.
4129 	 */
4130 	if (enable_ept)
4131 		ept_save_pdptrs(vcpu);
4132 
4133 	kvm_mmu_reset_context(vcpu);
4134 
4135 	if (cpu_has_vmx_msr_bitmap())
4136 		vmx_update_msr_bitmap(vcpu);
4137 
4138 	/*
4139 	 * This nasty bit of open coding is a compromise between blindly
4140 	 * loading L1's MSRs using the exit load lists (incorrect emulation
4141 	 * of VMFail), leaving the nested VM's MSRs in the software model
4142 	 * (incorrect behavior) and snapshotting the modified MSRs (too
4143 	 * expensive since the lists are unbound by hardware).  For each
4144 	 * MSR that was (prematurely) loaded from the nested VMEntry load
4145 	 * list, reload it from the exit load list if it exists and differs
4146 	 * from the guest value.  The intent is to stuff host state as
4147 	 * silently as possible, not to fully process the exit load list.
4148 	 */
4149 	for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
4150 		gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
4151 		if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
4152 			pr_debug_ratelimited(
4153 				"%s read MSR index failed (%u, 0x%08llx)\n",
4154 				__func__, i, gpa);
4155 			goto vmabort;
4156 		}
4157 
4158 		for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
4159 			gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
4160 			if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
4161 				pr_debug_ratelimited(
4162 					"%s read MSR failed (%u, 0x%08llx)\n",
4163 					__func__, j, gpa);
4164 				goto vmabort;
4165 			}
4166 			if (h.index != g.index)
4167 				continue;
4168 			if (h.value == g.value)
4169 				break;
4170 
4171 			if (nested_vmx_load_msr_check(vcpu, &h)) {
4172 				pr_debug_ratelimited(
4173 					"%s check failed (%u, 0x%x, 0x%x)\n",
4174 					__func__, j, h.index, h.reserved);
4175 				goto vmabort;
4176 			}
4177 
4178 			if (kvm_set_msr(vcpu, h.index, h.value)) {
4179 				pr_debug_ratelimited(
4180 					"%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
4181 					__func__, j, h.index, h.value);
4182 				goto vmabort;
4183 			}
4184 		}
4185 	}
4186 
4187 	return;
4188 
4189 vmabort:
4190 	nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
4191 }
4192 
4193 /*
4194  * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
4195  * and modify vmcs12 to make it see what it would expect to see there if
4196  * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
4197  */
4198 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
4199 		       u32 exit_intr_info, unsigned long exit_qualification)
4200 {
4201 	struct vcpu_vmx *vmx = to_vmx(vcpu);
4202 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
4203 
4204 	/* trying to cancel vmlaunch/vmresume is a bug */
4205 	WARN_ON_ONCE(vmx->nested.nested_run_pending);
4206 
4207 	leave_guest_mode(vcpu);
4208 
4209 	if (nested_cpu_has_preemption_timer(vmcs12))
4210 		hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
4211 
4212 	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
4213 		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
4214 
4215 	if (likely(!vmx->fail)) {
4216 		sync_vmcs02_to_vmcs12(vcpu, vmcs12);
4217 
4218 		if (exit_reason != -1)
4219 			prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
4220 				       exit_qualification);
4221 
4222 		/*
4223 		 * Must happen outside of sync_vmcs02_to_vmcs12() as it will
4224 		 * also be used to capture vmcs12 cache as part of
4225 		 * capturing nVMX state for snapshot (migration).
4226 		 *
4227 		 * Otherwise, this flush will dirty guest memory at a
4228 		 * point it is already assumed by user-space to be
4229 		 * immutable.
4230 		 */
4231 		nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
4232 	} else {
4233 		/*
4234 		 * The only expected VM-instruction error is "VM entry with
4235 		 * invalid control field(s)." Anything else indicates a
4236 		 * problem with L0.  And we should never get here with a
4237 		 * VMFail of any type if early consistency checks are enabled.
4238 		 */
4239 		WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
4240 			     VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4241 		WARN_ON_ONCE(nested_early_check);
4242 	}
4243 
4244 	vmx_switch_vmcs(vcpu, &vmx->vmcs01);
4245 
4246 	/* Update any VMCS fields that might have changed while L2 ran */
4247 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
4248 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
4249 	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
4250 	if (vmx->nested.l1_tpr_threshold != -1)
4251 		vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
4252 
4253 	if (kvm_has_tsc_control)
4254 		decache_tsc_multiplier(vmx);
4255 
4256 	if (vmx->nested.change_vmcs01_virtual_apic_mode) {
4257 		vmx->nested.change_vmcs01_virtual_apic_mode = false;
4258 		vmx_set_virtual_apic_mode(vcpu);
4259 	}
4260 
4261 	/* Unpin physical memory we referred to in vmcs02 */
4262 	if (vmx->nested.apic_access_page) {
4263 		kvm_release_page_clean(vmx->nested.apic_access_page);
4264 		vmx->nested.apic_access_page = NULL;
4265 	}
4266 	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
4267 	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
4268 	vmx->nested.pi_desc = NULL;
4269 
4270 	/*
4271 	 * We are now running in L2, mmu_notifier will force to reload the
4272 	 * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1.
4273 	 */
4274 	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
4275 
4276 	if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
4277 		vmx->nested.need_vmcs12_to_shadow_sync = true;
4278 
4279 	/* in case we halted in L2 */
4280 	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4281 
4282 	if (likely(!vmx->fail)) {
4283 		/*
4284 		 * TODO: SDM says that with acknowledge interrupt on
4285 		 * exit, bit 31 of the VM-exit interrupt information
4286 		 * (valid interrupt) is always set to 1 on
4287 		 * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
4288 		 * need kvm_cpu_has_interrupt().  See the commit
4289 		 * message for details.
4290 		 */
4291 		if (nested_exit_intr_ack_set(vcpu) &&
4292 		    exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
4293 		    kvm_cpu_has_interrupt(vcpu)) {
4294 			int irq = kvm_cpu_get_interrupt(vcpu);
4295 			WARN_ON(irq < 0);
4296 			vmcs12->vm_exit_intr_info = irq |
4297 				INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
4298 		}
4299 
4300 		if (exit_reason != -1)
4301 			trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
4302 						       vmcs12->exit_qualification,
4303 						       vmcs12->idt_vectoring_info_field,
4304 						       vmcs12->vm_exit_intr_info,
4305 						       vmcs12->vm_exit_intr_error_code,
4306 						       KVM_ISA_VMX);
4307 
4308 		load_vmcs12_host_state(vcpu, vmcs12);
4309 
4310 		return;
4311 	}
4312 
4313 	/*
4314 	 * After an early L2 VM-entry failure, we're now back
4315 	 * in L1 which thinks it just finished a VMLAUNCH or
4316 	 * VMRESUME instruction, so we need to set the failure
4317 	 * flag and the VM-instruction error field of the VMCS
4318 	 * accordingly, and skip the emulated instruction.
4319 	 */
4320 	(void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
4321 
4322 	/*
4323 	 * Restore L1's host state to KVM's software model.  We're here
4324 	 * because a consistency check was caught by hardware, which
4325 	 * means some amount of guest state has been propagated to KVM's
4326 	 * model and needs to be unwound to the host's state.
4327 	 */
4328 	nested_vmx_restore_host_state(vcpu);
4329 
4330 	vmx->fail = 0;
4331 }
4332 
4333 /*
4334  * Decode the memory-address operand of a vmx instruction, as recorded on an
4335  * exit caused by such an instruction (run by a guest hypervisor).
4336  * On success, returns 0. When the operand is invalid, returns 1 and throws
4337  * #UD or #GP.
4338  */
4339 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
4340 			u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
4341 {
4342 	gva_t off;
4343 	bool exn;
4344 	struct kvm_segment s;
4345 
4346 	/*
4347 	 * According to Vol. 3B, "Information for VM Exits Due to Instruction
4348 	 * Execution", on an exit, vmx_instruction_info holds most of the
4349 	 * addressing components of the operand. Only the displacement part
4350 	 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
4351 	 * For how an actual address is calculated from all these components,
4352 	 * refer to Vol. 1, "Operand Addressing".
4353 	 */
4354 	int  scaling = vmx_instruction_info & 3;
4355 	int  addr_size = (vmx_instruction_info >> 7) & 7;
4356 	bool is_reg = vmx_instruction_info & (1u << 10);
4357 	int  seg_reg = (vmx_instruction_info >> 15) & 7;
4358 	int  index_reg = (vmx_instruction_info >> 18) & 0xf;
4359 	bool index_is_valid = !(vmx_instruction_info & (1u << 22));
4360 	int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
4361 	bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
4362 
4363 	if (is_reg) {
4364 		kvm_queue_exception(vcpu, UD_VECTOR);
4365 		return 1;
4366 	}
4367 
4368 	/* Addr = segment_base + offset */
4369 	/* offset = base + [index * scale] + displacement */
4370 	off = exit_qualification; /* holds the displacement */
4371 	if (addr_size == 1)
4372 		off = (gva_t)sign_extend64(off, 31);
4373 	else if (addr_size == 0)
4374 		off = (gva_t)sign_extend64(off, 15);
4375 	if (base_is_valid)
4376 		off += kvm_register_read(vcpu, base_reg);
4377 	if (index_is_valid)
4378 		off += kvm_register_read(vcpu, index_reg)<<scaling;
4379 	vmx_get_segment(vcpu, &s, seg_reg);
4380 
4381 	/*
4382 	 * The effective address, i.e. @off, of a memory operand is truncated
4383 	 * based on the address size of the instruction.  Note that this is
4384 	 * the *effective address*, i.e. the address prior to accounting for
4385 	 * the segment's base.
4386 	 */
4387 	if (addr_size == 1) /* 32 bit */
4388 		off &= 0xffffffff;
4389 	else if (addr_size == 0) /* 16 bit */
4390 		off &= 0xffff;
4391 
4392 	/* Checks for #GP/#SS exceptions. */
4393 	exn = false;
4394 	if (is_long_mode(vcpu)) {
4395 		/*
4396 		 * The virtual/linear address is never truncated in 64-bit
4397 		 * mode, e.g. a 32-bit address size can yield a 64-bit virtual
4398 		 * address when using FS/GS with a non-zero base.
4399 		 */
4400 		if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS)
4401 			*ret = s.base + off;
4402 		else
4403 			*ret = off;
4404 
4405 		/* Long mode: #GP(0)/#SS(0) if the memory address is in a
4406 		 * non-canonical form. This is the only check on the memory
4407 		 * destination for long mode!
4408 		 */
4409 		exn = is_noncanonical_address(*ret, vcpu);
4410 	} else {
4411 		/*
4412 		 * When not in long mode, the virtual/linear address is
4413 		 * unconditionally truncated to 32 bits regardless of the
4414 		 * address size.
4415 		 */
4416 		*ret = (s.base + off) & 0xffffffff;
4417 
4418 		/* Protected mode: apply checks for segment validity in the
4419 		 * following order:
4420 		 * - segment type check (#GP(0) may be thrown)
4421 		 * - usability check (#GP(0)/#SS(0))
4422 		 * - limit check (#GP(0)/#SS(0))
4423 		 */
4424 		if (wr)
4425 			/* #GP(0) if the destination operand is located in a
4426 			 * read-only data segment or any code segment.
4427 			 */
4428 			exn = ((s.type & 0xa) == 0 || (s.type & 8));
4429 		else
4430 			/* #GP(0) if the source operand is located in an
4431 			 * execute-only code segment
4432 			 */
4433 			exn = ((s.type & 0xa) == 8);
4434 		if (exn) {
4435 			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
4436 			return 1;
4437 		}
4438 		/* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
4439 		 */
4440 		exn = (s.unusable != 0);
4441 
4442 		/*
4443 		 * Protected mode: #GP(0)/#SS(0) if the memory operand is
4444 		 * outside the segment limit.  All CPUs that support VMX ignore
4445 		 * limit checks for flat segments, i.e. segments with base==0,
4446 		 * limit==0xffffffff and of type expand-up data or code.
4447 		 */
4448 		if (!(s.base == 0 && s.limit == 0xffffffff &&
4449 		     ((s.type & 8) || !(s.type & 4))))
4450 			exn = exn || ((u64)off + len - 1 > s.limit);
4451 	}
4452 	if (exn) {
4453 		kvm_queue_exception_e(vcpu,
4454 				      seg_reg == VCPU_SREG_SS ?
4455 						SS_VECTOR : GP_VECTOR,
4456 				      0);
4457 		return 1;
4458 	}
4459 
4460 	return 0;
4461 }
4462 
4463 void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
4464 {
4465 	struct vcpu_vmx *vmx;
4466 
4467 	if (!nested_vmx_allowed(vcpu))
4468 		return;
4469 
4470 	vmx = to_vmx(vcpu);
4471 	if (kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) {
4472 		vmx->nested.msrs.entry_ctls_high |=
4473 				VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4474 		vmx->nested.msrs.exit_ctls_high |=
4475 				VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4476 	} else {
4477 		vmx->nested.msrs.entry_ctls_high &=
4478 				~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4479 		vmx->nested.msrs.exit_ctls_high &=
4480 				~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4481 	}
4482 }
4483 
4484 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
4485 {
4486 	gva_t gva;
4487 	struct x86_exception e;
4488 
4489 	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
4490 				vmcs_read32(VMX_INSTRUCTION_INFO), false,
4491 				sizeof(*vmpointer), &gva))
4492 		return 1;
4493 
4494 	if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
4495 		kvm_inject_page_fault(vcpu, &e);
4496 		return 1;
4497 	}
4498 
4499 	return 0;
4500 }
4501 
4502 /*
4503  * Allocate a shadow VMCS and associate it with the currently loaded
4504  * VMCS, unless such a shadow VMCS already exists. The newly allocated
4505  * VMCS is also VMCLEARed, so that it is ready for use.
4506  */
4507 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
4508 {
4509 	struct vcpu_vmx *vmx = to_vmx(vcpu);
4510 	struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
4511 
4512 	/*
4513 	 * We should allocate a shadow vmcs for vmcs01 only when L1
4514 	 * executes VMXON and free it when L1 executes VMXOFF.
4515 	 * As it is invalid to execute VMXON twice, we shouldn't reach
4516 	 * here when vmcs01 already have an allocated shadow vmcs.
4517 	 */
4518 	WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
4519 
4520 	if (!loaded_vmcs->shadow_vmcs) {
4521 		loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
4522 		if (loaded_vmcs->shadow_vmcs)
4523 			vmcs_clear(loaded_vmcs->shadow_vmcs);
4524 	}
4525 	return loaded_vmcs->shadow_vmcs;
4526 }
4527 
4528 static int enter_vmx_operation(struct kvm_vcpu *vcpu)
4529 {
4530 	struct vcpu_vmx *vmx = to_vmx(vcpu);
4531 	int r;
4532 
4533 	r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
4534 	if (r < 0)
4535 		goto out_vmcs02;
4536 
4537 	vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4538 	if (!vmx->nested.cached_vmcs12)
4539 		goto out_cached_vmcs12;
4540 
4541 	vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4542 	if (!vmx->nested.cached_shadow_vmcs12)
4543 		goto out_cached_shadow_vmcs12;
4544 
4545 	if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
4546 		goto out_shadow_vmcs;
4547 
4548 	hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
4549 		     HRTIMER_MODE_REL_PINNED);
4550 	vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
4551 
4552 	vmx->nested.vpid02 = allocate_vpid();
4553 
4554 	vmx->nested.vmcs02_initialized = false;
4555 	vmx->nested.vmxon = true;
4556 
4557 	if (pt_mode == PT_MODE_HOST_GUEST) {
4558 		vmx->pt_desc.guest.ctl = 0;
4559 		pt_update_intercept_for_msr(vmx);
4560 	}
4561 
4562 	return 0;
4563 
4564 out_shadow_vmcs:
4565 	kfree(vmx->nested.cached_shadow_vmcs12);
4566 
4567 out_cached_shadow_vmcs12:
4568 	kfree(vmx->nested.cached_vmcs12);
4569 
4570 out_cached_vmcs12:
4571 	free_loaded_vmcs(&vmx->nested.vmcs02);
4572 
4573 out_vmcs02:
4574 	return -ENOMEM;
4575 }
4576 
4577 /*
4578  * Emulate the VMXON instruction.
4579  * Currently, we just remember that VMX is active, and do not save or even
4580  * inspect the argument to VMXON (the so-called "VMXON pointer") because we
4581  * do not currently need to store anything in that guest-allocated memory
4582  * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their
4583  * argument is different from the VMXON pointer (which the spec says they do).
4584  */
4585 static int handle_vmon(struct kvm_vcpu *vcpu)
4586 {
4587 	int ret;
4588 	gpa_t vmptr;
4589 	uint32_t revision;
4590 	struct vcpu_vmx *vmx = to_vmx(vcpu);
4591 	const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
4592 		| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
4593 
4594 	/*
4595 	 * The Intel VMX Instruction Reference lists a bunch of bits that are
4596 	 * prerequisite to running VMXON, most notably cr4.VMXE must be set to
4597 	 * 1 (see vmx_set_cr4() for when we allow the guest to set this).
4598 	 * Otherwise, we should fail with #UD.  But most faulting conditions
4599 	 * have already been checked by hardware, prior to the VM-exit for
4600 	 * VMXON.  We do test guest cr4.VMXE because processor CR4 always has
4601 	 * that bit set to 1 in non-root mode.
4602 	 */
4603 	if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
4604 		kvm_queue_exception(vcpu, UD_VECTOR);
4605 		return 1;
4606 	}
4607 
4608 	/* CPL=0 must be checked manually. */
4609 	if (vmx_get_cpl(vcpu)) {
4610 		kvm_inject_gp(vcpu, 0);
4611 		return 1;
4612 	}
4613 
4614 	if (vmx->nested.vmxon)
4615 		return nested_vmx_failValid(vcpu,
4616 			VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
4617 
4618 	if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
4619 			!= VMXON_NEEDED_FEATURES) {
4620 		kvm_inject_gp(vcpu, 0);
4621 		return 1;
4622 	}
4623 
4624 	if (nested_vmx_get_vmptr(vcpu, &vmptr))
4625 		return 1;
4626 
4627 	/*
4628 	 * SDM 3: 24.11.5
4629 	 * The first 4 bytes of VMXON region contain the supported
4630 	 * VMCS revision identifier
4631 	 *
4632 	 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
4633 	 * which replaces physical address width with 32
4634 	 */
4635 	if (!page_address_valid(vcpu, vmptr))
4636 		return nested_vmx_failInvalid(vcpu);
4637 
4638 	if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
4639 	    revision != VMCS12_REVISION)
4640 		return nested_vmx_failInvalid(vcpu);
4641 
4642 	vmx->nested.vmxon_ptr = vmptr;
4643 	ret = enter_vmx_operation(vcpu);
4644 	if (ret)
4645 		return ret;
4646 
4647 	return nested_vmx_succeed(vcpu);
4648 }
4649 
4650 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
4651 {
4652 	struct vcpu_vmx *vmx = to_vmx(vcpu);
4653 
4654 	if (vmx->nested.current_vmptr == -1ull)
4655 		return;
4656 
4657 	copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
4658 
4659 	if (enable_shadow_vmcs) {
4660 		/* copy to memory all shadowed fields in case
4661 		   they were modified */
4662 		copy_shadow_to_vmcs12(vmx);
4663 		vmx_disable_shadow_vmcs(vmx);
4664 	}
4665 	vmx->nested.posted_intr_nv = -1;
4666 
4667 	/* Flush VMCS12 to guest memory */
4668 	kvm_vcpu_write_guest_page(vcpu,
4669 				  vmx->nested.current_vmptr >> PAGE_SHIFT,
4670 				  vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
4671 
4672 	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
4673 
4674 	vmx->nested.current_vmptr = -1ull;
4675 }
4676 
4677 /* Emulate the VMXOFF instruction */
4678 static int handle_vmoff(struct kvm_vcpu *vcpu)
4679 {
4680 	if (!nested_vmx_check_permission(vcpu))
4681 		return 1;
4682 
4683 	free_nested(vcpu);
4684 
4685 	/* Process a latched INIT during time CPU was in VMX operation */
4686 	kvm_make_request(KVM_REQ_EVENT, vcpu);
4687 
4688 	return nested_vmx_succeed(vcpu);
4689 }
4690 
4691 /* Emulate the VMCLEAR instruction */
4692 static int handle_vmclear(struct kvm_vcpu *vcpu)
4693 {
4694 	struct vcpu_vmx *vmx = to_vmx(vcpu);
4695 	u32 zero = 0;
4696 	gpa_t vmptr;
4697 	u64 evmcs_gpa;
4698 
4699 	if (!nested_vmx_check_permission(vcpu))
4700 		return 1;
4701 
4702 	if (nested_vmx_get_vmptr(vcpu, &vmptr))
4703 		return 1;
4704 
4705 	if (!page_address_valid(vcpu, vmptr))
4706 		return nested_vmx_failValid(vcpu,
4707 			VMXERR_VMCLEAR_INVALID_ADDRESS);
4708 
4709 	if (vmptr == vmx->nested.vmxon_ptr)
4710 		return nested_vmx_failValid(vcpu,
4711 			VMXERR_VMCLEAR_VMXON_POINTER);
4712 
4713 	/*
4714 	 * When Enlightened VMEntry is enabled on the calling CPU we treat
4715 	 * memory area pointer by vmptr as Enlightened VMCS (as there's no good
4716 	 * way to distinguish it from VMCS12) and we must not corrupt it by
4717 	 * writing to the non-existent 'launch_state' field. The area doesn't
4718 	 * have to be the currently active EVMCS on the calling CPU and there's
4719 	 * nothing KVM has to do to transition it from 'active' to 'non-active'
4720 	 * state. It is possible that the area will stay mapped as
4721 	 * vmx->nested.hv_evmcs but this shouldn't be a problem.
4722 	 */
4723 	if (likely(!vmx->nested.enlightened_vmcs_enabled ||
4724 		   !nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
4725 		if (vmptr == vmx->nested.current_vmptr)
4726 			nested_release_vmcs12(vcpu);
4727 
4728 		kvm_vcpu_write_guest(vcpu,
4729 				     vmptr + offsetof(struct vmcs12,
4730 						      launch_state),
4731 				     &zero, sizeof(zero));
4732 	}
4733 
4734 	return nested_vmx_succeed(vcpu);
4735 }
4736 
4737 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
4738 
4739 /* Emulate the VMLAUNCH instruction */
4740 static int handle_vmlaunch(struct kvm_vcpu *vcpu)
4741 {
4742 	return nested_vmx_run(vcpu, true);
4743 }
4744 
4745 /* Emulate the VMRESUME instruction */
4746 static int handle_vmresume(struct kvm_vcpu *vcpu)
4747 {
4748 
4749 	return nested_vmx_run(vcpu, false);
4750 }
4751 
4752 static int handle_vmread(struct kvm_vcpu *vcpu)
4753 {
4754 	unsigned long field;
4755 	u64 field_value;
4756 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4757 	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4758 	int len;
4759 	gva_t gva = 0;
4760 	struct vmcs12 *vmcs12;
4761 	struct x86_exception e;
4762 	short offset;
4763 
4764 	if (!nested_vmx_check_permission(vcpu))
4765 		return 1;
4766 
4767 	if (to_vmx(vcpu)->nested.current_vmptr == -1ull)
4768 		return nested_vmx_failInvalid(vcpu);
4769 
4770 	if (!is_guest_mode(vcpu))
4771 		vmcs12 = get_vmcs12(vcpu);
4772 	else {
4773 		/*
4774 		 * When vmcs->vmcs_link_pointer is -1ull, any VMREAD
4775 		 * to shadowed-field sets the ALU flags for VMfailInvalid.
4776 		 */
4777 		if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
4778 			return nested_vmx_failInvalid(vcpu);
4779 		vmcs12 = get_shadow_vmcs12(vcpu);
4780 	}
4781 
4782 	/* Decode instruction info and find the field to read */
4783 	field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
4784 
4785 	offset = vmcs_field_to_offset(field);
4786 	if (offset < 0)
4787 		return nested_vmx_failValid(vcpu,
4788 			VMXERR_UNSUPPORTED_VMCS_COMPONENT);
4789 
4790 	if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
4791 		copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4792 
4793 	/* Read the field, zero-extended to a u64 field_value */
4794 	field_value = vmcs12_read_any(vmcs12, field, offset);
4795 
4796 	/*
4797 	 * Now copy part of this value to register or memory, as requested.
4798 	 * Note that the number of bits actually copied is 32 or 64 depending
4799 	 * on the guest's mode (32 or 64 bit), not on the given field's length.
4800 	 */
4801 	if (vmx_instruction_info & (1u << 10)) {
4802 		kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
4803 			field_value);
4804 	} else {
4805 		len = is_64_bit_mode(vcpu) ? 8 : 4;
4806 		if (get_vmx_mem_address(vcpu, exit_qualification,
4807 				vmx_instruction_info, true, len, &gva))
4808 			return 1;
4809 		/* _system ok, nested_vmx_check_permission has verified cpl=0 */
4810 		if (kvm_write_guest_virt_system(vcpu, gva, &field_value, len, &e))
4811 			kvm_inject_page_fault(vcpu, &e);
4812 	}
4813 
4814 	return nested_vmx_succeed(vcpu);
4815 }
4816 
4817 static bool is_shadow_field_rw(unsigned long field)
4818 {
4819 	switch (field) {
4820 #define SHADOW_FIELD_RW(x, y) case x:
4821 #include "vmcs_shadow_fields.h"
4822 		return true;
4823 	default:
4824 		break;
4825 	}
4826 	return false;
4827 }
4828 
4829 static bool is_shadow_field_ro(unsigned long field)
4830 {
4831 	switch (field) {
4832 #define SHADOW_FIELD_RO(x, y) case x:
4833 #include "vmcs_shadow_fields.h"
4834 		return true;
4835 	default:
4836 		break;
4837 	}
4838 	return false;
4839 }
4840 
4841 static int handle_vmwrite(struct kvm_vcpu *vcpu)
4842 {
4843 	unsigned long field;
4844 	int len;
4845 	gva_t gva;
4846 	struct vcpu_vmx *vmx = to_vmx(vcpu);
4847 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4848 	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4849 
4850 	/* The value to write might be 32 or 64 bits, depending on L1's long
4851 	 * mode, and eventually we need to write that into a field of several
4852 	 * possible lengths. The code below first zero-extends the value to 64
4853 	 * bit (field_value), and then copies only the appropriate number of
4854 	 * bits into the vmcs12 field.
4855 	 */
4856 	u64 field_value = 0;
4857 	struct x86_exception e;
4858 	struct vmcs12 *vmcs12;
4859 	short offset;
4860 
4861 	if (!nested_vmx_check_permission(vcpu))
4862 		return 1;
4863 
4864 	if (vmx->nested.current_vmptr == -1ull)
4865 		return nested_vmx_failInvalid(vcpu);
4866 
4867 	if (vmx_instruction_info & (1u << 10))
4868 		field_value = kvm_register_readl(vcpu,
4869 			(((vmx_instruction_info) >> 3) & 0xf));
4870 	else {
4871 		len = is_64_bit_mode(vcpu) ? 8 : 4;
4872 		if (get_vmx_mem_address(vcpu, exit_qualification,
4873 				vmx_instruction_info, false, len, &gva))
4874 			return 1;
4875 		if (kvm_read_guest_virt(vcpu, gva, &field_value, len, &e)) {
4876 			kvm_inject_page_fault(vcpu, &e);
4877 			return 1;
4878 		}
4879 	}
4880 
4881 
4882 	field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
4883 	/*
4884 	 * If the vCPU supports "VMWRITE to any supported field in the
4885 	 * VMCS," then the "read-only" fields are actually read/write.
4886 	 */
4887 	if (vmcs_field_readonly(field) &&
4888 	    !nested_cpu_has_vmwrite_any_field(vcpu))
4889 		return nested_vmx_failValid(vcpu,
4890 			VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
4891 
4892 	if (!is_guest_mode(vcpu)) {
4893 		vmcs12 = get_vmcs12(vcpu);
4894 
4895 		/*
4896 		 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
4897 		 * vmcs12, else we may crush a field or consume a stale value.
4898 		 */
4899 		if (!is_shadow_field_rw(field))
4900 			copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
4901 	} else {
4902 		/*
4903 		 * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
4904 		 * to shadowed-field sets the ALU flags for VMfailInvalid.
4905 		 */
4906 		if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
4907 			return nested_vmx_failInvalid(vcpu);
4908 		vmcs12 = get_shadow_vmcs12(vcpu);
4909 	}
4910 
4911 	offset = vmcs_field_to_offset(field);
4912 	if (offset < 0)
4913 		return nested_vmx_failValid(vcpu,
4914 			VMXERR_UNSUPPORTED_VMCS_COMPONENT);
4915 
4916 	/*
4917 	 * Some Intel CPUs intentionally drop the reserved bits of the AR byte
4918 	 * fields on VMWRITE.  Emulate this behavior to ensure consistent KVM
4919 	 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE
4920 	 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD
4921 	 * from L1 will return a different value than VMREAD from L2 (L1 sees
4922 	 * the stripped down value, L2 sees the full value as stored by KVM).
4923 	 */
4924 	if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
4925 		field_value &= 0x1f0ff;
4926 
4927 	vmcs12_write_any(vmcs12, field, offset, field_value);
4928 
4929 	/*
4930 	 * Do not track vmcs12 dirty-state if in guest-mode as we actually
4931 	 * dirty shadow vmcs12 instead of vmcs12.  Fields that can be updated
4932 	 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't
4933 	 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path.
4934 	 */
4935 	if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) {
4936 		/*
4937 		 * L1 can read these fields without exiting, ensure the
4938 		 * shadow VMCS is up-to-date.
4939 		 */
4940 		if (enable_shadow_vmcs && is_shadow_field_ro(field)) {
4941 			preempt_disable();
4942 			vmcs_load(vmx->vmcs01.shadow_vmcs);
4943 
4944 			__vmcs_writel(field, field_value);
4945 
4946 			vmcs_clear(vmx->vmcs01.shadow_vmcs);
4947 			vmcs_load(vmx->loaded_vmcs->vmcs);
4948 			preempt_enable();
4949 		}
4950 		vmx->nested.dirty_vmcs12 = true;
4951 	}
4952 
4953 	return nested_vmx_succeed(vcpu);
4954 }
4955 
4956 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
4957 {
4958 	vmx->nested.current_vmptr = vmptr;
4959 	if (enable_shadow_vmcs) {
4960 		secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
4961 		vmcs_write64(VMCS_LINK_POINTER,
4962 			     __pa(vmx->vmcs01.shadow_vmcs));
4963 		vmx->nested.need_vmcs12_to_shadow_sync = true;
4964 	}
4965 	vmx->nested.dirty_vmcs12 = true;
4966 }
4967 
4968 /* Emulate the VMPTRLD instruction */
4969 static int handle_vmptrld(struct kvm_vcpu *vcpu)
4970 {
4971 	struct vcpu_vmx *vmx = to_vmx(vcpu);
4972 	gpa_t vmptr;
4973 
4974 	if (!nested_vmx_check_permission(vcpu))
4975 		return 1;
4976 
4977 	if (nested_vmx_get_vmptr(vcpu, &vmptr))
4978 		return 1;
4979 
4980 	if (!page_address_valid(vcpu, vmptr))
4981 		return nested_vmx_failValid(vcpu,
4982 			VMXERR_VMPTRLD_INVALID_ADDRESS);
4983 
4984 	if (vmptr == vmx->nested.vmxon_ptr)
4985 		return nested_vmx_failValid(vcpu,
4986 			VMXERR_VMPTRLD_VMXON_POINTER);
4987 
4988 	/* Forbid normal VMPTRLD if Enlightened version was used */
4989 	if (vmx->nested.hv_evmcs)
4990 		return 1;
4991 
4992 	if (vmx->nested.current_vmptr != vmptr) {
4993 		struct kvm_host_map map;
4994 		struct vmcs12 *new_vmcs12;
4995 
4996 		if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) {
4997 			/*
4998 			 * Reads from an unbacked page return all 1s,
4999 			 * which means that the 32 bits located at the
5000 			 * given physical address won't match the required
5001 			 * VMCS12_REVISION identifier.
5002 			 */
5003 			return nested_vmx_failValid(vcpu,
5004 				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5005 		}
5006 
5007 		new_vmcs12 = map.hva;
5008 
5009 		if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
5010 		    (new_vmcs12->hdr.shadow_vmcs &&
5011 		     !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
5012 			kvm_vcpu_unmap(vcpu, &map, false);
5013 			return nested_vmx_failValid(vcpu,
5014 				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
5015 		}
5016 
5017 		nested_release_vmcs12(vcpu);
5018 
5019 		/*
5020 		 * Load VMCS12 from guest memory since it is not already
5021 		 * cached.
5022 		 */
5023 		memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
5024 		kvm_vcpu_unmap(vcpu, &map, false);
5025 
5026 		set_current_vmptr(vmx, vmptr);
5027 	}
5028 
5029 	return nested_vmx_succeed(vcpu);
5030 }
5031 
5032 /* Emulate the VMPTRST instruction */
5033 static int handle_vmptrst(struct kvm_vcpu *vcpu)
5034 {
5035 	unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION);
5036 	u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5037 	gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
5038 	struct x86_exception e;
5039 	gva_t gva;
5040 
5041 	if (!nested_vmx_check_permission(vcpu))
5042 		return 1;
5043 
5044 	if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
5045 		return 1;
5046 
5047 	if (get_vmx_mem_address(vcpu, exit_qual, instr_info,
5048 				true, sizeof(gpa_t), &gva))
5049 		return 1;
5050 	/* *_system ok, nested_vmx_check_permission has verified cpl=0 */
5051 	if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
5052 					sizeof(gpa_t), &e)) {
5053 		kvm_inject_page_fault(vcpu, &e);
5054 		return 1;
5055 	}
5056 	return nested_vmx_succeed(vcpu);
5057 }
5058 
5059 /* Emulate the INVEPT instruction */
5060 static int handle_invept(struct kvm_vcpu *vcpu)
5061 {
5062 	struct vcpu_vmx *vmx = to_vmx(vcpu);
5063 	u32 vmx_instruction_info, types;
5064 	unsigned long type;
5065 	gva_t gva;
5066 	struct x86_exception e;
5067 	struct {
5068 		u64 eptp, gpa;
5069 	} operand;
5070 
5071 	if (!(vmx->nested.msrs.secondary_ctls_high &
5072 	      SECONDARY_EXEC_ENABLE_EPT) ||
5073 	    !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
5074 		kvm_queue_exception(vcpu, UD_VECTOR);
5075 		return 1;
5076 	}
5077 
5078 	if (!nested_vmx_check_permission(vcpu))
5079 		return 1;
5080 
5081 	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5082 	type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
5083 
5084 	types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
5085 
5086 	if (type >= 32 || !(types & (1 << type)))
5087 		return nested_vmx_failValid(vcpu,
5088 				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5089 
5090 	/* According to the Intel VMX instruction reference, the memory
5091 	 * operand is read even if it isn't needed (e.g., for type==global)
5092 	 */
5093 	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
5094 			vmx_instruction_info, false, sizeof(operand), &gva))
5095 		return 1;
5096 	if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
5097 		kvm_inject_page_fault(vcpu, &e);
5098 		return 1;
5099 	}
5100 
5101 	switch (type) {
5102 	case VMX_EPT_EXTENT_GLOBAL:
5103 	case VMX_EPT_EXTENT_CONTEXT:
5104 	/*
5105 	 * TODO: Sync the necessary shadow EPT roots here, rather than
5106 	 * at the next emulated VM-entry.
5107 	 */
5108 		break;
5109 	default:
5110 		BUG_ON(1);
5111 		break;
5112 	}
5113 
5114 	return nested_vmx_succeed(vcpu);
5115 }
5116 
5117 static int handle_invvpid(struct kvm_vcpu *vcpu)
5118 {
5119 	struct vcpu_vmx *vmx = to_vmx(vcpu);
5120 	u32 vmx_instruction_info;
5121 	unsigned long type, types;
5122 	gva_t gva;
5123 	struct x86_exception e;
5124 	struct {
5125 		u64 vpid;
5126 		u64 gla;
5127 	} operand;
5128 	u16 vpid02;
5129 
5130 	if (!(vmx->nested.msrs.secondary_ctls_high &
5131 	      SECONDARY_EXEC_ENABLE_VPID) ||
5132 			!(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
5133 		kvm_queue_exception(vcpu, UD_VECTOR);
5134 		return 1;
5135 	}
5136 
5137 	if (!nested_vmx_check_permission(vcpu))
5138 		return 1;
5139 
5140 	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5141 	type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
5142 
5143 	types = (vmx->nested.msrs.vpid_caps &
5144 			VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
5145 
5146 	if (type >= 32 || !(types & (1 << type)))
5147 		return nested_vmx_failValid(vcpu,
5148 			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5149 
5150 	/* according to the intel vmx instruction reference, the memory
5151 	 * operand is read even if it isn't needed (e.g., for type==global)
5152 	 */
5153 	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
5154 			vmx_instruction_info, false, sizeof(operand), &gva))
5155 		return 1;
5156 	if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
5157 		kvm_inject_page_fault(vcpu, &e);
5158 		return 1;
5159 	}
5160 	if (operand.vpid >> 16)
5161 		return nested_vmx_failValid(vcpu,
5162 			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5163 
5164 	vpid02 = nested_get_vpid02(vcpu);
5165 	switch (type) {
5166 	case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
5167 		if (!operand.vpid ||
5168 		    is_noncanonical_address(operand.gla, vcpu))
5169 			return nested_vmx_failValid(vcpu,
5170 				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5171 		if (cpu_has_vmx_invvpid_individual_addr()) {
5172 			__invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
5173 				vpid02, operand.gla);
5174 		} else
5175 			__vmx_flush_tlb(vcpu, vpid02, false);
5176 		break;
5177 	case VMX_VPID_EXTENT_SINGLE_CONTEXT:
5178 	case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
5179 		if (!operand.vpid)
5180 			return nested_vmx_failValid(vcpu,
5181 				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
5182 		__vmx_flush_tlb(vcpu, vpid02, false);
5183 		break;
5184 	case VMX_VPID_EXTENT_ALL_CONTEXT:
5185 		__vmx_flush_tlb(vcpu, vpid02, false);
5186 		break;
5187 	default:
5188 		WARN_ON_ONCE(1);
5189 		return kvm_skip_emulated_instruction(vcpu);
5190 	}
5191 
5192 	return nested_vmx_succeed(vcpu);
5193 }
5194 
5195 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
5196 				     struct vmcs12 *vmcs12)
5197 {
5198 	u32 index = kvm_rcx_read(vcpu);
5199 	u64 address;
5200 	bool accessed_dirty;
5201 	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
5202 
5203 	if (!nested_cpu_has_eptp_switching(vmcs12) ||
5204 	    !nested_cpu_has_ept(vmcs12))
5205 		return 1;
5206 
5207 	if (index >= VMFUNC_EPTP_ENTRIES)
5208 		return 1;
5209 
5210 
5211 	if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
5212 				     &address, index * 8, 8))
5213 		return 1;
5214 
5215 	accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);
5216 
5217 	/*
5218 	 * If the (L2) guest does a vmfunc to the currently
5219 	 * active ept pointer, we don't have to do anything else
5220 	 */
5221 	if (vmcs12->ept_pointer != address) {
5222 		if (!valid_ept_address(vcpu, address))
5223 			return 1;
5224 
5225 		kvm_mmu_unload(vcpu);
5226 		mmu->ept_ad = accessed_dirty;
5227 		mmu->mmu_role.base.ad_disabled = !accessed_dirty;
5228 		vmcs12->ept_pointer = address;
5229 		/*
5230 		 * TODO: Check what's the correct approach in case
5231 		 * mmu reload fails. Currently, we just let the next
5232 		 * reload potentially fail
5233 		 */
5234 		kvm_mmu_reload(vcpu);
5235 	}
5236 
5237 	return 0;
5238 }
5239 
5240 static int handle_vmfunc(struct kvm_vcpu *vcpu)
5241 {
5242 	struct vcpu_vmx *vmx = to_vmx(vcpu);
5243 	struct vmcs12 *vmcs12;
5244 	u32 function = kvm_rax_read(vcpu);
5245 
5246 	/*
5247 	 * VMFUNC is only supported for nested guests, but we always enable the
5248 	 * secondary control for simplicity; for non-nested mode, fake that we
5249 	 * didn't by injecting #UD.
5250 	 */
5251 	if (!is_guest_mode(vcpu)) {
5252 		kvm_queue_exception(vcpu, UD_VECTOR);
5253 		return 1;
5254 	}
5255 
5256 	vmcs12 = get_vmcs12(vcpu);
5257 	if ((vmcs12->vm_function_control & (1 << function)) == 0)
5258 		goto fail;
5259 
5260 	switch (function) {
5261 	case 0:
5262 		if (nested_vmx_eptp_switching(vcpu, vmcs12))
5263 			goto fail;
5264 		break;
5265 	default:
5266 		goto fail;
5267 	}
5268 	return kvm_skip_emulated_instruction(vcpu);
5269 
5270 fail:
5271 	nested_vmx_vmexit(vcpu, vmx->exit_reason,
5272 			  vmcs_read32(VM_EXIT_INTR_INFO),
5273 			  vmcs_readl(EXIT_QUALIFICATION));
5274 	return 1;
5275 }
5276 
5277 
5278 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
5279 				       struct vmcs12 *vmcs12)
5280 {
5281 	unsigned long exit_qualification;
5282 	gpa_t bitmap, last_bitmap;
5283 	unsigned int port;
5284 	int size;
5285 	u8 b;
5286 
5287 	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
5288 		return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
5289 
5290 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5291 
5292 	port = exit_qualification >> 16;
5293 	size = (exit_qualification & 7) + 1;
5294 
5295 	last_bitmap = (gpa_t)-1;
5296 	b = -1;
5297 
5298 	while (size > 0) {
5299 		if (port < 0x8000)
5300 			bitmap = vmcs12->io_bitmap_a;
5301 		else if (port < 0x10000)
5302 			bitmap = vmcs12->io_bitmap_b;
5303 		else
5304 			return true;
5305 		bitmap += (port & 0x7fff) / 8;
5306 
5307 		if (last_bitmap != bitmap)
5308 			if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
5309 				return true;
5310 		if (b & (1 << (port & 7)))
5311 			return true;
5312 
5313 		port++;
5314 		size--;
5315 		last_bitmap = bitmap;
5316 	}
5317 
5318 	return false;
5319 }
5320 
5321 /*
5322  * Return 1 if we should exit from L2 to L1 to handle an MSR access access,
5323  * rather than handle it ourselves in L0. I.e., check whether L1 expressed
5324  * disinterest in the current event (read or write a specific MSR) by using an
5325  * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
5326  */
5327 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
5328 	struct vmcs12 *vmcs12, u32 exit_reason)
5329 {
5330 	u32 msr_index = kvm_rcx_read(vcpu);
5331 	gpa_t bitmap;
5332 
5333 	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
5334 		return true;
5335 
5336 	/*
5337 	 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
5338 	 * for the four combinations of read/write and low/high MSR numbers.
5339 	 * First we need to figure out which of the four to use:
5340 	 */
5341 	bitmap = vmcs12->msr_bitmap;
5342 	if (exit_reason == EXIT_REASON_MSR_WRITE)
5343 		bitmap += 2048;
5344 	if (msr_index >= 0xc0000000) {
5345 		msr_index -= 0xc0000000;
5346 		bitmap += 1024;
5347 	}
5348 
5349 	/* Then read the msr_index'th bit from this bitmap: */
5350 	if (msr_index < 1024*8) {
5351 		unsigned char b;
5352 		if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
5353 			return true;
5354 		return 1 & (b >> (msr_index & 7));
5355 	} else
5356 		return true; /* let L1 handle the wrong parameter */
5357 }
5358 
5359 /*
5360  * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
5361  * rather than handle it ourselves in L0. I.e., check if L1 wanted to
5362  * intercept (via guest_host_mask etc.) the current event.
5363  */
5364 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
5365 	struct vmcs12 *vmcs12)
5366 {
5367 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
5368 	int cr = exit_qualification & 15;
5369 	int reg;
5370 	unsigned long val;
5371 
5372 	switch ((exit_qualification >> 4) & 3) {
5373 	case 0: /* mov to cr */
5374 		reg = (exit_qualification >> 8) & 15;
5375 		val = kvm_register_readl(vcpu, reg);
5376 		switch (cr) {
5377 		case 0:
5378 			if (vmcs12->cr0_guest_host_mask &
5379 			    (val ^ vmcs12->cr0_read_shadow))
5380 				return true;
5381 			break;
5382 		case 3:
5383 			if ((vmcs12->cr3_target_count >= 1 &&
5384 					vmcs12->cr3_target_value0 == val) ||
5385 				(vmcs12->cr3_target_count >= 2 &&
5386 					vmcs12->cr3_target_value1 == val) ||
5387 				(vmcs12->cr3_target_count >= 3 &&
5388 					vmcs12->cr3_target_value2 == val) ||
5389 				(vmcs12->cr3_target_count >= 4 &&
5390 					vmcs12->cr3_target_value3 == val))
5391 				return false;
5392 			if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
5393 				return true;
5394 			break;
5395 		case 4:
5396 			if (vmcs12->cr4_guest_host_mask &
5397 			    (vmcs12->cr4_read_shadow ^ val))
5398 				return true;
5399 			break;
5400 		case 8:
5401 			if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
5402 				return true;
5403 			break;
5404 		}
5405 		break;
5406 	case 2: /* clts */
5407 		if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
5408 		    (vmcs12->cr0_read_shadow & X86_CR0_TS))
5409 			return true;
5410 		break;
5411 	case 1: /* mov from cr */
5412 		switch (cr) {
5413 		case 3:
5414 			if (vmcs12->cpu_based_vm_exec_control &
5415 			    CPU_BASED_CR3_STORE_EXITING)
5416 				return true;
5417 			break;
5418 		case 8:
5419 			if (vmcs12->cpu_based_vm_exec_control &
5420 			    CPU_BASED_CR8_STORE_EXITING)
5421 				return true;
5422 			break;
5423 		}
5424 		break;
5425 	case 3: /* lmsw */
5426 		/*
5427 		 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
5428 		 * cr0. Other attempted changes are ignored, with no exit.
5429 		 */
5430 		val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5431 		if (vmcs12->cr0_guest_host_mask & 0xe &
5432 		    (val ^ vmcs12->cr0_read_shadow))
5433 			return true;
5434 		if ((vmcs12->cr0_guest_host_mask & 0x1) &&
5435 		    !(vmcs12->cr0_read_shadow & 0x1) &&
5436 		    (val & 0x1))
5437 			return true;
5438 		break;
5439 	}
5440 	return false;
5441 }
5442 
5443 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
5444 	struct vmcs12 *vmcs12, gpa_t bitmap)
5445 {
5446 	u32 vmx_instruction_info;
5447 	unsigned long field;
5448 	u8 b;
5449 
5450 	if (!nested_cpu_has_shadow_vmcs(vmcs12))
5451 		return true;
5452 
5453 	/* Decode instruction info and find the field to access */
5454 	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5455 	field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5456 
5457 	/* Out-of-range fields always cause a VM exit from L2 to L1 */
5458 	if (field >> 15)
5459 		return true;
5460 
5461 	if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
5462 		return true;
5463 
5464 	return 1 & (b >> (field & 7));
5465 }
5466 
5467 /*
5468  * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
5469  * should handle it ourselves in L0 (and then continue L2). Only call this
5470  * when in is_guest_mode (L2).
5471  */
5472 bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
5473 {
5474 	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
5475 	struct vcpu_vmx *vmx = to_vmx(vcpu);
5476 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5477 
5478 	if (vmx->nested.nested_run_pending)
5479 		return false;
5480 
5481 	if (unlikely(vmx->fail)) {
5482 		trace_kvm_nested_vmenter_failed(
5483 			"hardware VM-instruction error: ",
5484 			vmcs_read32(VM_INSTRUCTION_ERROR));
5485 		return true;
5486 	}
5487 
5488 	/*
5489 	 * The host physical addresses of some pages of guest memory
5490 	 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
5491 	 * Page). The CPU may write to these pages via their host
5492 	 * physical address while L2 is running, bypassing any
5493 	 * address-translation-based dirty tracking (e.g. EPT write
5494 	 * protection).
5495 	 *
5496 	 * Mark them dirty on every exit from L2 to prevent them from
5497 	 * getting out of sync with dirty tracking.
5498 	 */
5499 	nested_mark_vmcs12_pages_dirty(vcpu);
5500 
5501 	trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
5502 				vmcs_readl(EXIT_QUALIFICATION),
5503 				vmx->idt_vectoring_info,
5504 				intr_info,
5505 				vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
5506 				KVM_ISA_VMX);
5507 
5508 	switch (exit_reason) {
5509 	case EXIT_REASON_EXCEPTION_NMI:
5510 		if (is_nmi(intr_info))
5511 			return false;
5512 		else if (is_page_fault(intr_info))
5513 			return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
5514 		else if (is_debug(intr_info) &&
5515 			 vcpu->guest_debug &
5516 			 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
5517 			return false;
5518 		else if (is_breakpoint(intr_info) &&
5519 			 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5520 			return false;
5521 		return vmcs12->exception_bitmap &
5522 				(1u << (intr_info & INTR_INFO_VECTOR_MASK));
5523 	case EXIT_REASON_EXTERNAL_INTERRUPT:
5524 		return false;
5525 	case EXIT_REASON_TRIPLE_FAULT:
5526 		return true;
5527 	case EXIT_REASON_PENDING_INTERRUPT:
5528 		return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
5529 	case EXIT_REASON_NMI_WINDOW:
5530 		return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
5531 	case EXIT_REASON_TASK_SWITCH:
5532 		return true;
5533 	case EXIT_REASON_CPUID:
5534 		return true;
5535 	case EXIT_REASON_HLT:
5536 		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
5537 	case EXIT_REASON_INVD:
5538 		return true;
5539 	case EXIT_REASON_INVLPG:
5540 		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5541 	case EXIT_REASON_RDPMC:
5542 		return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
5543 	case EXIT_REASON_RDRAND:
5544 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
5545 	case EXIT_REASON_RDSEED:
5546 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
5547 	case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
5548 		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
5549 	case EXIT_REASON_VMREAD:
5550 		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5551 			vmcs12->vmread_bitmap);
5552 	case EXIT_REASON_VMWRITE:
5553 		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5554 			vmcs12->vmwrite_bitmap);
5555 	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
5556 	case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
5557 	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
5558 	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
5559 	case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
5560 		/*
5561 		 * VMX instructions trap unconditionally. This allows L1 to
5562 		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
5563 		 */
5564 		return true;
5565 	case EXIT_REASON_CR_ACCESS:
5566 		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
5567 	case EXIT_REASON_DR_ACCESS:
5568 		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
5569 	case EXIT_REASON_IO_INSTRUCTION:
5570 		return nested_vmx_exit_handled_io(vcpu, vmcs12);
5571 	case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
5572 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
5573 	case EXIT_REASON_MSR_READ:
5574 	case EXIT_REASON_MSR_WRITE:
5575 		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
5576 	case EXIT_REASON_INVALID_STATE:
5577 		return true;
5578 	case EXIT_REASON_MWAIT_INSTRUCTION:
5579 		return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
5580 	case EXIT_REASON_MONITOR_TRAP_FLAG:
5581 		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
5582 	case EXIT_REASON_MONITOR_INSTRUCTION:
5583 		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
5584 	case EXIT_REASON_PAUSE_INSTRUCTION:
5585 		return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
5586 			nested_cpu_has2(vmcs12,
5587 				SECONDARY_EXEC_PAUSE_LOOP_EXITING);
5588 	case EXIT_REASON_MCE_DURING_VMENTRY:
5589 		return false;
5590 	case EXIT_REASON_TPR_BELOW_THRESHOLD:
5591 		return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
5592 	case EXIT_REASON_APIC_ACCESS:
5593 	case EXIT_REASON_APIC_WRITE:
5594 	case EXIT_REASON_EOI_INDUCED:
5595 		/*
5596 		 * The controls for "virtualize APIC accesses," "APIC-
5597 		 * register virtualization," and "virtual-interrupt
5598 		 * delivery" only come from vmcs12.
5599 		 */
5600 		return true;
5601 	case EXIT_REASON_EPT_VIOLATION:
5602 		/*
5603 		 * L0 always deals with the EPT violation. If nested EPT is
5604 		 * used, and the nested mmu code discovers that the address is
5605 		 * missing in the guest EPT table (EPT12), the EPT violation
5606 		 * will be injected with nested_ept_inject_page_fault()
5607 		 */
5608 		return false;
5609 	case EXIT_REASON_EPT_MISCONFIG:
5610 		/*
5611 		 * L2 never uses directly L1's EPT, but rather L0's own EPT
5612 		 * table (shadow on EPT) or a merged EPT table that L0 built
5613 		 * (EPT on EPT). So any problems with the structure of the
5614 		 * table is L0's fault.
5615 		 */
5616 		return false;
5617 	case EXIT_REASON_INVPCID:
5618 		return
5619 			nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
5620 			nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5621 	case EXIT_REASON_WBINVD:
5622 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
5623 	case EXIT_REASON_XSETBV:
5624 		return true;
5625 	case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
5626 		/*
5627 		 * This should never happen, since it is not possible to
5628 		 * set XSS to a non-zero value---neither in L1 nor in L2.
5629 		 * If if it were, XSS would have to be checked against
5630 		 * the XSS exit bitmap in vmcs12.
5631 		 */
5632 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
5633 	case EXIT_REASON_PREEMPTION_TIMER:
5634 		return false;
5635 	case EXIT_REASON_PML_FULL:
5636 		/* We emulate PML support to L1. */
5637 		return false;
5638 	case EXIT_REASON_VMFUNC:
5639 		/* VM functions are emulated through L2->L0 vmexits. */
5640 		return false;
5641 	case EXIT_REASON_ENCLS:
5642 		/* SGX is never exposed to L1 */
5643 		return false;
5644 	case EXIT_REASON_UMWAIT:
5645 	case EXIT_REASON_TPAUSE:
5646 		return nested_cpu_has2(vmcs12,
5647 			SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE);
5648 	default:
5649 		return true;
5650 	}
5651 }
5652 
5653 
5654 static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
5655 				struct kvm_nested_state __user *user_kvm_nested_state,
5656 				u32 user_data_size)
5657 {
5658 	struct vcpu_vmx *vmx;
5659 	struct vmcs12 *vmcs12;
5660 	struct kvm_nested_state kvm_state = {
5661 		.flags = 0,
5662 		.format = KVM_STATE_NESTED_FORMAT_VMX,
5663 		.size = sizeof(kvm_state),
5664 		.hdr.vmx.vmxon_pa = -1ull,
5665 		.hdr.vmx.vmcs12_pa = -1ull,
5666 	};
5667 	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
5668 		&user_kvm_nested_state->data.vmx[0];
5669 
5670 	if (!vcpu)
5671 		return kvm_state.size + sizeof(*user_vmx_nested_state);
5672 
5673 	vmx = to_vmx(vcpu);
5674 	vmcs12 = get_vmcs12(vcpu);
5675 
5676 	if (nested_vmx_allowed(vcpu) &&
5677 	    (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
5678 		kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
5679 		kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr;
5680 
5681 		if (vmx_has_valid_vmcs12(vcpu)) {
5682 			kvm_state.size += sizeof(user_vmx_nested_state->vmcs12);
5683 
5684 			if (vmx->nested.hv_evmcs)
5685 				kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
5686 
5687 			if (is_guest_mode(vcpu) &&
5688 			    nested_cpu_has_shadow_vmcs(vmcs12) &&
5689 			    vmcs12->vmcs_link_pointer != -1ull)
5690 				kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12);
5691 		}
5692 
5693 		if (vmx->nested.smm.vmxon)
5694 			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
5695 
5696 		if (vmx->nested.smm.guest_mode)
5697 			kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
5698 
5699 		if (is_guest_mode(vcpu)) {
5700 			kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
5701 
5702 			if (vmx->nested.nested_run_pending)
5703 				kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
5704 		}
5705 	}
5706 
5707 	if (user_data_size < kvm_state.size)
5708 		goto out;
5709 
5710 	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
5711 		return -EFAULT;
5712 
5713 	if (!vmx_has_valid_vmcs12(vcpu))
5714 		goto out;
5715 
5716 	/*
5717 	 * When running L2, the authoritative vmcs12 state is in the
5718 	 * vmcs02. When running L1, the authoritative vmcs12 state is
5719 	 * in the shadow or enlightened vmcs linked to vmcs01, unless
5720 	 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative
5721 	 * vmcs12 state is in the vmcs12 already.
5722 	 */
5723 	if (is_guest_mode(vcpu)) {
5724 		sync_vmcs02_to_vmcs12(vcpu, vmcs12);
5725 		sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
5726 	} else if (!vmx->nested.need_vmcs12_to_shadow_sync) {
5727 		if (vmx->nested.hv_evmcs)
5728 			copy_enlightened_to_vmcs12(vmx);
5729 		else if (enable_shadow_vmcs)
5730 			copy_shadow_to_vmcs12(vmx);
5731 	}
5732 
5733 	BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
5734 	BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE);
5735 
5736 	/*
5737 	 * Copy over the full allocated size of vmcs12 rather than just the size
5738 	 * of the struct.
5739 	 */
5740 	if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE))
5741 		return -EFAULT;
5742 
5743 	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
5744 	    vmcs12->vmcs_link_pointer != -1ull) {
5745 		if (copy_to_user(user_vmx_nested_state->shadow_vmcs12,
5746 				 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
5747 			return -EFAULT;
5748 	}
5749 
5750 out:
5751 	return kvm_state.size;
5752 }
5753 
5754 /*
5755  * Forcibly leave nested mode in order to be able to reset the VCPU later on.
5756  */
5757 void vmx_leave_nested(struct kvm_vcpu *vcpu)
5758 {
5759 	if (is_guest_mode(vcpu)) {
5760 		to_vmx(vcpu)->nested.nested_run_pending = 0;
5761 		nested_vmx_vmexit(vcpu, -1, 0, 0);
5762 	}
5763 	free_nested(vcpu);
5764 }
5765 
5766 static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
5767 				struct kvm_nested_state __user *user_kvm_nested_state,
5768 				struct kvm_nested_state *kvm_state)
5769 {
5770 	struct vcpu_vmx *vmx = to_vmx(vcpu);
5771 	struct vmcs12 *vmcs12;
5772 	u32 exit_qual;
5773 	struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
5774 		&user_kvm_nested_state->data.vmx[0];
5775 	int ret;
5776 
5777 	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX)
5778 		return -EINVAL;
5779 
5780 	if (kvm_state->hdr.vmx.vmxon_pa == -1ull) {
5781 		if (kvm_state->hdr.vmx.smm.flags)
5782 			return -EINVAL;
5783 
5784 		if (kvm_state->hdr.vmx.vmcs12_pa != -1ull)
5785 			return -EINVAL;
5786 
5787 		/*
5788 		 * KVM_STATE_NESTED_EVMCS used to signal that KVM should
5789 		 * enable eVMCS capability on vCPU. However, since then
5790 		 * code was changed such that flag signals vmcs12 should
5791 		 * be copied into eVMCS in guest memory.
5792 		 *
5793 		 * To preserve backwards compatability, allow user
5794 		 * to set this flag even when there is no VMXON region.
5795 		 */
5796 		if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS)
5797 			return -EINVAL;
5798 	} else {
5799 		if (!nested_vmx_allowed(vcpu))
5800 			return -EINVAL;
5801 
5802 		if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa))
5803 			return -EINVAL;
5804 	}
5805 
5806 	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
5807 	    (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
5808 		return -EINVAL;
5809 
5810 	if (kvm_state->hdr.vmx.smm.flags &
5811 	    ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
5812 		return -EINVAL;
5813 
5814 	/*
5815 	 * SMM temporarily disables VMX, so we cannot be in guest mode,
5816 	 * nor can VMLAUNCH/VMRESUME be pending.  Outside SMM, SMM flags
5817 	 * must be zero.
5818 	 */
5819 	if (is_smm(vcpu) ?
5820 		(kvm_state->flags &
5821 		 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING))
5822 		: kvm_state->hdr.vmx.smm.flags)
5823 		return -EINVAL;
5824 
5825 	if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
5826 	    !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
5827 		return -EINVAL;
5828 
5829 	if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) &&
5830 		(!nested_vmx_allowed(vcpu) || !vmx->nested.enlightened_vmcs_enabled))
5831 			return -EINVAL;
5832 
5833 	vmx_leave_nested(vcpu);
5834 
5835 	if (kvm_state->hdr.vmx.vmxon_pa == -1ull)
5836 		return 0;
5837 
5838 	vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa;
5839 	ret = enter_vmx_operation(vcpu);
5840 	if (ret)
5841 		return ret;
5842 
5843 	/* Empty 'VMXON' state is permitted */
5844 	if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12))
5845 		return 0;
5846 
5847 	if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) {
5848 		if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
5849 		    !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa))
5850 			return -EINVAL;
5851 
5852 		set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
5853 	} else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
5854 		/*
5855 		 * Sync eVMCS upon entry as we may not have
5856 		 * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
5857 		 */
5858 		vmx->nested.need_vmcs12_to_shadow_sync = true;
5859 	} else {
5860 		return -EINVAL;
5861 	}
5862 
5863 	if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
5864 		vmx->nested.smm.vmxon = true;
5865 		vmx->nested.vmxon = false;
5866 
5867 		if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
5868 			vmx->nested.smm.guest_mode = true;
5869 	}
5870 
5871 	vmcs12 = get_vmcs12(vcpu);
5872 	if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12)))
5873 		return -EFAULT;
5874 
5875 	if (vmcs12->hdr.revision_id != VMCS12_REVISION)
5876 		return -EINVAL;
5877 
5878 	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
5879 		return 0;
5880 
5881 	vmx->nested.nested_run_pending =
5882 		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
5883 
5884 	ret = -EINVAL;
5885 	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
5886 	    vmcs12->vmcs_link_pointer != -1ull) {
5887 		struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
5888 
5889 		if (kvm_state->size <
5890 		    sizeof(*kvm_state) +
5891 		    sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12))
5892 			goto error_guest_mode;
5893 
5894 		if (copy_from_user(shadow_vmcs12,
5895 				   user_vmx_nested_state->shadow_vmcs12,
5896 				   sizeof(*shadow_vmcs12))) {
5897 			ret = -EFAULT;
5898 			goto error_guest_mode;
5899 		}
5900 
5901 		if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
5902 		    !shadow_vmcs12->hdr.shadow_vmcs)
5903 			goto error_guest_mode;
5904 	}
5905 
5906 	if (nested_vmx_check_controls(vcpu, vmcs12) ||
5907 	    nested_vmx_check_host_state(vcpu, vmcs12) ||
5908 	    nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
5909 		goto error_guest_mode;
5910 
5911 	vmx->nested.dirty_vmcs12 = true;
5912 	ret = nested_vmx_enter_non_root_mode(vcpu, false);
5913 	if (ret)
5914 		goto error_guest_mode;
5915 
5916 	return 0;
5917 
5918 error_guest_mode:
5919 	vmx->nested.nested_run_pending = 0;
5920 	return ret;
5921 }
5922 
5923 void nested_vmx_set_vmcs_shadowing_bitmap(void)
5924 {
5925 	if (enable_shadow_vmcs) {
5926 		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
5927 		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
5928 	}
5929 }
5930 
/*
 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
 * returned for the various VMX controls MSRs when nested VMX is enabled.
 * The same values should also be used to verify that vmcs12 control fields are
 * valid during nested entry from L1 to L2.
 * Each of these control msrs has a low and high 32-bit half: A low bit is on
 * if the corresponding bit in the (32-bit) control field *must* be on, and a
 * bit in the high half is on if the corresponding bit in the control field
 * may be on. See also vmx_control_verify().
 *
 * @msrs:     structure that receives the computed values.
 * @ept_caps: mask applied to the base set of EPT capabilities (page-walk
 *            length 4, WB memory type, INVEPT, execute-only) before the
 *            unconditionally advertised ones are added.
 * @apicv:    if true, PIN_BASED_POSTED_INTR is kept in the allowed pin-based
 *            controls (provided the hardware MSR reports it).
 */
void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
				bool apicv)
{
	/*
	 * Note that as a general rule, the high half of the MSRs (bits in
	 * the control fields which may be 1) should be initialized by the
	 * intersection of the underlying hardware's MSR (i.e., features which
	 * can be supported) and the list of features we want to expose -
	 * because they are known to be properly supported in our code.
	 * Also, usually, the low half of the MSRs (bits which must be 1) can
	 * be set to 0, meaning that L1 may turn off any of these bits. The
	 * reason is that if one of these bits is necessary, it will appear
	 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
	 * fields of vmcs01 and vmcs02, will turn these bits off - and
	 * nested_vmx_exit_reflected() will not pass related exits to L1.
	 * These rules have exceptions below.
	 */

	/* pin-based controls */
	rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
		msrs->pinbased_ctls_low,
		msrs->pinbased_ctls_high);
	/* Unlike the groups below, the hardware's must-be-one bits are kept. */
	msrs->pinbased_ctls_low |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
	msrs->pinbased_ctls_high &=
		PIN_BASED_EXT_INTR_MASK |
		PIN_BASED_NMI_EXITING |
		PIN_BASED_VIRTUAL_NMIS |
		(apicv ? PIN_BASED_POSTED_INTR : 0);
	/* Advertised regardless of what the hardware MSR reported. */
	msrs->pinbased_ctls_high |=
		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		PIN_BASED_VMX_PREEMPTION_TIMER;

	/* exit controls */
	rdmsr(MSR_IA32_VMX_EXIT_CTLS,
		msrs->exit_ctls_low,
		msrs->exit_ctls_high);
	/* The low half read from hardware is discarded and replaced. */
	msrs->exit_ctls_low =
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;

	msrs->exit_ctls_high &=
#ifdef CONFIG_X86_64
		VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
	/* Advertised regardless of what the hardware MSR reported. */
	msrs->exit_ctls_high |=
		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;

	/* We support free control of debug control saving. */
	msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;

	/* entry controls */
	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
		msrs->entry_ctls_low,
		msrs->entry_ctls_high);
	/* The low half read from hardware is discarded and replaced. */
	msrs->entry_ctls_low =
		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
	msrs->entry_ctls_high &=
#ifdef CONFIG_X86_64
		VM_ENTRY_IA32E_MODE |
#endif
		VM_ENTRY_LOAD_IA32_PAT;
	/* Advertised regardless of what the hardware MSR reported. */
	msrs->entry_ctls_high |=
		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);

	/* We support free control of debug control loading. */
	msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;

	/* cpu-based controls */
	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
		msrs->procbased_ctls_low,
		msrs->procbased_ctls_high);
	/* The low half read from hardware is discarded and replaced. */
	msrs->procbased_ctls_low =
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
	msrs->procbased_ctls_high &=
		CPU_BASED_VIRTUAL_INTR_PENDING |
		CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
		CPU_BASED_CR3_STORE_EXITING |
#ifdef CONFIG_X86_64
		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
#endif
		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
		CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
		CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
		CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	/*
	 * We can allow some features even when not supported by the
	 * hardware. For example, L1 can specify an MSR bitmap - and we
	 * can use it to avoid exits to L1 - even when L0 runs L2
	 * without MSR bitmaps.
	 */
	msrs->procbased_ctls_high |=
		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
		CPU_BASED_USE_MSR_BITMAPS;

	/* We support free control of CR3 access interception. */
	msrs->procbased_ctls_low &=
		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);

	/*
	 * secondary cpu-based controls.  Do not include those that
	 * depend on CPUID bits, they are added later by vmx_cpuid_update.
	 *
	 * The hardware MSR is only read if secondary controls survived the
	 * filtering of the primary controls above; otherwise the masks below
	 * operate on whatever @msrs already contained.
	 */
	if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
		rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
		      msrs->secondary_ctls_low,
		      msrs->secondary_ctls_high);

	/* No secondary control is forced on. */
	msrs->secondary_ctls_low = 0;
	msrs->secondary_ctls_high &=
		SECONDARY_EXEC_DESC |
		SECONDARY_EXEC_RDTSCP |
		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
		SECONDARY_EXEC_WBINVD_EXITING |
		SECONDARY_EXEC_APIC_REGISTER_VIRT |
		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
		SECONDARY_EXEC_RDRAND_EXITING |
		SECONDARY_EXEC_ENABLE_INVPCID |
		SECONDARY_EXEC_RDSEED_EXITING |
		SECONDARY_EXEC_XSAVES;

	/*
	 * We can emulate "VMCS shadowing," even if the hardware
	 * doesn't support it.
	 */
	msrs->secondary_ctls_high |=
		SECONDARY_EXEC_SHADOW_VMCS;

	if (enable_ept) {
		/* nested EPT: emulate EPT also to L1 */
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_EPT;
		msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
			 VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
		if (cpu_has_vmx_ept_execute_only())
			msrs->ept_caps |=
				VMX_EPT_EXECUTE_ONLY_BIT;
		/* Only the capabilities set so far are limited by @ept_caps. */
		msrs->ept_caps &= ept_caps;
		msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
			VMX_EPT_1GB_PAGE_BIT;
		if (enable_ept_ad_bits) {
			msrs->secondary_ctls_high |=
				SECONDARY_EXEC_ENABLE_PML;
			msrs->ept_caps |= VMX_EPT_AD_BIT;
		}
	}

	if (cpu_has_vmx_vmfunc()) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VMFUNC;
		/*
		 * Advertise EPTP switching unconditionally
		 * since we emulate it
		 */
		if (enable_ept)
			msrs->vmfunc_controls =
				VMX_VMFUNC_EPTP_SWITCHING;
	}

	/*
	 * Old versions of KVM use the single-context version without
	 * checking for support, so declare that it is supported even
	 * though it is treated as global context.  The alternative is
	 * not failing the single-context invvpid, and it is worse.
	 */
	if (enable_vpid) {
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_ENABLE_VPID;
		msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
			VMX_VPID_EXTENT_SUPPORTED_MASK;
	}

	if (enable_unrestricted_guest)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_UNRESTRICTED_GUEST;

	if (flexpriority_enabled)
		msrs->secondary_ctls_high |=
			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;

	/*
	 * miscellaneous data: of the hardware value only
	 * VMX_MISC_SAVE_EFER_LMA is kept; the remaining bits are set here and
	 * the high half is cleared.
	 */
	rdmsr(MSR_IA32_VMX_MISC,
		msrs->misc_low,
		msrs->misc_high);
	msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
	msrs->misc_low |=
		MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
		VMX_MISC_ACTIVITY_HLT;
	msrs->misc_high = 0;

	/*
	 * This MSR reports some information about VMX support. We
	 * should return information about the VMX we emulate for the
	 * guest, and the VMCS structure we give it - not about the
	 * VMX support of the underlying hardware.
	 */
	msrs->basic =
		VMCS12_REVISION |
		VMX_BASIC_TRUE_CTLS |
		((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
		(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);

	if (cpu_has_vmx_basic_inout())
		msrs->basic |= VMX_BASIC_INOUT;

	/*
	 * These MSRs specify bits which the guest must keep fixed on
	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
	 * We picked the standard core2 setting.
	 */
#define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
#define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
	msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
	msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;

	/*
	 * These MSRs specify bits which the guest must keep fixed off.
	 * They are taken from the hardware unmodified.
	 */
	rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
	rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);

	/* highest index: VMX_PREEMPTION_TIMER_VALUE */
	msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
}
6170 
6171 void nested_vmx_hardware_unsetup(void)
6172 {
6173 	int i;
6174 
6175 	if (enable_shadow_vmcs) {
6176 		for (i = 0; i < VMX_BITMAP_NR; i++)
6177 			free_page((unsigned long)vmx_bitmap[i]);
6178 	}
6179 }
6180 
6181 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
6182 {
6183 	int i;
6184 
6185 	if (!cpu_has_vmx_shadow_vmcs())
6186 		enable_shadow_vmcs = 0;
6187 	if (enable_shadow_vmcs) {
6188 		for (i = 0; i < VMX_BITMAP_NR; i++) {
6189 			/*
6190 			 * The vmx_bitmap is not tied to a VM and so should
6191 			 * not be charged to a memcg.
6192 			 */
6193 			vmx_bitmap[i] = (unsigned long *)
6194 				__get_free_page(GFP_KERNEL);
6195 			if (!vmx_bitmap[i]) {
6196 				nested_vmx_hardware_unsetup();
6197 				return -ENOMEM;
6198 			}
6199 		}
6200 
6201 		init_vmcs_shadow_fields();
6202 	}
6203 
6204 	exit_handlers[EXIT_REASON_VMCLEAR]	= handle_vmclear;
6205 	exit_handlers[EXIT_REASON_VMLAUNCH]	= handle_vmlaunch;
6206 	exit_handlers[EXIT_REASON_VMPTRLD]	= handle_vmptrld;
6207 	exit_handlers[EXIT_REASON_VMPTRST]	= handle_vmptrst;
6208 	exit_handlers[EXIT_REASON_VMREAD]	= handle_vmread;
6209 	exit_handlers[EXIT_REASON_VMRESUME]	= handle_vmresume;
6210 	exit_handlers[EXIT_REASON_VMWRITE]	= handle_vmwrite;
6211 	exit_handlers[EXIT_REASON_VMOFF]	= handle_vmoff;
6212 	exit_handlers[EXIT_REASON_VMON]		= handle_vmon;
6213 	exit_handlers[EXIT_REASON_INVEPT]	= handle_invept;
6214 	exit_handlers[EXIT_REASON_INVVPID]	= handle_invvpid;
6215 	exit_handlers[EXIT_REASON_VMFUNC]	= handle_vmfunc;
6216 
6217 	kvm_x86_ops->check_nested_events = vmx_check_nested_events;
6218 	kvm_x86_ops->get_nested_state = vmx_get_nested_state;
6219 	kvm_x86_ops->set_nested_state = vmx_set_nested_state;
6220 	kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages;
6221 	kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs;
6222 	kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version;
6223 
6224 	return 0;
6225 }
6226