xref: /openbmc/linux/arch/x86/kvm/vmx/nested.c (revision 151f4e2b)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include <linux/frame.h>
4 #include <linux/percpu.h>
5 
6 #include <asm/debugreg.h>
7 #include <asm/mmu_context.h>
8 
9 #include "cpuid.h"
10 #include "hyperv.h"
11 #include "mmu.h"
12 #include "nested.h"
13 #include "trace.h"
14 #include "x86.h"
15 
16 static bool __read_mostly enable_shadow_vmcs = 1;
17 module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
18 
19 static bool __read_mostly nested_early_check = 0;
20 module_param(nested_early_check, bool, S_IRUGO);
21 
22 /*
23  * Hyper-V requires all of these, so mark them as supported even though
24  * they are just treated the same as all-context.
25  */
26 #define VMX_VPID_EXTENT_SUPPORTED_MASK		\
27 	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
28 	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
29 	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
30 	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)
31 
32 #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5
33 
34 enum {
35 	VMX_VMREAD_BITMAP,
36 	VMX_VMWRITE_BITMAP,
37 	VMX_BITMAP_NR
38 };
39 static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
40 
41 #define vmx_vmread_bitmap                    (vmx_bitmap[VMX_VMREAD_BITMAP])
42 #define vmx_vmwrite_bitmap                   (vmx_bitmap[VMX_VMWRITE_BITMAP])
43 
44 static u16 shadow_read_only_fields[] = {
45 #define SHADOW_FIELD_RO(x) x,
46 #include "vmcs_shadow_fields.h"
47 };
48 static int max_shadow_read_only_fields =
49 	ARRAY_SIZE(shadow_read_only_fields);
50 
51 static u16 shadow_read_write_fields[] = {
52 #define SHADOW_FIELD_RW(x) x,
53 #include "vmcs_shadow_fields.h"
54 };
55 static int max_shadow_read_write_fields =
56 	ARRAY_SIZE(shadow_read_write_fields);
57 
58 static void init_vmcs_shadow_fields(void)
59 {
60 	int i, j;
61 
62 	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
63 	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
64 
65 	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
66 		u16 field = shadow_read_only_fields[i];
67 
68 		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
69 		    (i + 1 == max_shadow_read_only_fields ||
70 		     shadow_read_only_fields[i + 1] != field + 1))
71 			pr_err("Missing field from shadow_read_only_field %x\n",
72 			       field + 1);
73 
74 		clear_bit(field, vmx_vmread_bitmap);
75 #ifdef CONFIG_X86_64
76 		if (field & 1)
77 			continue;
78 #endif
79 		if (j < i)
80 			shadow_read_only_fields[j] = field;
81 		j++;
82 	}
83 	max_shadow_read_only_fields = j;
84 
85 	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
86 		u16 field = shadow_read_write_fields[i];
87 
88 		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
89 		    (i + 1 == max_shadow_read_write_fields ||
90 		     shadow_read_write_fields[i + 1] != field + 1))
91 			pr_err("Missing field from shadow_read_write_field %x\n",
92 			       field + 1);
93 
94 		/*
95 		 * PML and the preemption timer can be emulated, but the
96 		 * processor cannot vmwrite to fields that don't exist
97 		 * on bare metal.
98 		 */
99 		switch (field) {
100 		case GUEST_PML_INDEX:
101 			if (!cpu_has_vmx_pml())
102 				continue;
103 			break;
104 		case VMX_PREEMPTION_TIMER_VALUE:
105 			if (!cpu_has_vmx_preemption_timer())
106 				continue;
107 			break;
108 		case GUEST_INTR_STATUS:
109 			if (!cpu_has_vmx_apicv())
110 				continue;
111 			break;
112 		default:
113 			break;
114 		}
115 
116 		clear_bit(field, vmx_vmwrite_bitmap);
117 		clear_bit(field, vmx_vmread_bitmap);
118 #ifdef CONFIG_X86_64
119 		if (field & 1)
120 			continue;
121 #endif
122 		if (j < i)
123 			shadow_read_write_fields[j] = field;
124 		j++;
125 	}
126 	max_shadow_read_write_fields = j;
127 }
128 
129 /*
130  * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
131  * set the success or error code of an emulated VMX instruction (as specified
132  * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
133  * instruction.
134  */
135 static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
136 {
137 	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
138 			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
139 			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
140 	return kvm_skip_emulated_instruction(vcpu);
141 }
142 
143 static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
144 {
145 	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
146 			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
147 			    X86_EFLAGS_SF | X86_EFLAGS_OF))
148 			| X86_EFLAGS_CF);
149 	return kvm_skip_emulated_instruction(vcpu);
150 }
151 
152 static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
153 				u32 vm_instruction_error)
154 {
155 	struct vcpu_vmx *vmx = to_vmx(vcpu);
156 
157 	/*
158 	 * failValid writes the error number to the current VMCS, which
159 	 * can't be done if there isn't a current VMCS.
160 	 */
161 	if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
162 		return nested_vmx_failInvalid(vcpu);
163 
164 	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
165 			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
166 			    X86_EFLAGS_SF | X86_EFLAGS_OF))
167 			| X86_EFLAGS_ZF);
168 	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
169 	/*
170 	 * We don't need to force a shadow sync because
171 	 * VM_INSTRUCTION_ERROR is not shadowed
172 	 */
173 	return kvm_skip_emulated_instruction(vcpu);
174 }
175 
176 static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
177 {
178 	/* TODO: not to reset guest simply here. */
179 	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
180 	pr_debug_ratelimited("kvm: nested vmx abort, indicator %d\n", indicator);
181 }
182 
183 static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
184 {
185 	vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
186 	vmcs_write64(VMCS_LINK_POINTER, -1ull);
187 }
188 
189 static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
190 {
191 	struct vcpu_vmx *vmx = to_vmx(vcpu);
192 
193 	if (!vmx->nested.hv_evmcs)
194 		return;
195 
196 	kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
197 	vmx->nested.hv_evmcs_vmptr = -1ull;
198 	vmx->nested.hv_evmcs = NULL;
199 }
200 
201 /*
202  * Free whatever needs to be freed from vmx->nested when L1 goes down, or
203  * just stops using VMX.
204  */
205 static void free_nested(struct kvm_vcpu *vcpu)
206 {
207 	struct vcpu_vmx *vmx = to_vmx(vcpu);
208 
209 	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
210 		return;
211 
212 	vmx->nested.vmxon = false;
213 	vmx->nested.smm.vmxon = false;
214 	free_vpid(vmx->nested.vpid02);
215 	vmx->nested.posted_intr_nv = -1;
216 	vmx->nested.current_vmptr = -1ull;
217 	if (enable_shadow_vmcs) {
218 		vmx_disable_shadow_vmcs(vmx);
219 		vmcs_clear(vmx->vmcs01.shadow_vmcs);
220 		free_vmcs(vmx->vmcs01.shadow_vmcs);
221 		vmx->vmcs01.shadow_vmcs = NULL;
222 	}
223 	kfree(vmx->nested.cached_vmcs12);
224 	kfree(vmx->nested.cached_shadow_vmcs12);
225 	/* Unpin physical memory we referred to in the vmcs02 */
226 	if (vmx->nested.apic_access_page) {
227 		kvm_release_page_dirty(vmx->nested.apic_access_page);
228 		vmx->nested.apic_access_page = NULL;
229 	}
230 	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
231 	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
232 	vmx->nested.pi_desc = NULL;
233 
234 	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
235 
236 	nested_release_evmcs(vcpu);
237 
238 	free_loaded_vmcs(&vmx->nested.vmcs02);
239 }
240 
241 static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
242 {
243 	struct vcpu_vmx *vmx = to_vmx(vcpu);
244 	int cpu;
245 
246 	if (vmx->loaded_vmcs == vmcs)
247 		return;
248 
249 	cpu = get_cpu();
250 	vmx_vcpu_put(vcpu);
251 	vmx->loaded_vmcs = vmcs;
252 	vmx_vcpu_load(vcpu, cpu);
253 	put_cpu();
254 
255 	vm_entry_controls_reset_shadow(vmx);
256 	vm_exit_controls_reset_shadow(vmx);
257 	vmx_segment_cache_clear(vmx);
258 }
259 
260 /*
261  * Ensure that the current vmcs of the logical processor is the
262  * vmcs01 of the vcpu before calling free_nested().
263  */
264 void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
265 {
266 	vcpu_load(vcpu);
267 	vmx_leave_nested(vcpu);
268 	vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
269 	free_nested(vcpu);
270 	vcpu_put(vcpu);
271 }
272 
273 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
274 		struct x86_exception *fault)
275 {
276 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
277 	struct vcpu_vmx *vmx = to_vmx(vcpu);
278 	u32 exit_reason;
279 	unsigned long exit_qualification = vcpu->arch.exit_qualification;
280 
281 	if (vmx->nested.pml_full) {
282 		exit_reason = EXIT_REASON_PML_FULL;
283 		vmx->nested.pml_full = false;
284 		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
285 	} else if (fault->error_code & PFERR_RSVD_MASK)
286 		exit_reason = EXIT_REASON_EPT_MISCONFIG;
287 	else
288 		exit_reason = EXIT_REASON_EPT_VIOLATION;
289 
290 	nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
291 	vmcs12->guest_physical_address = fault->address;
292 }
293 
294 static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
295 {
296 	WARN_ON(mmu_is_nested(vcpu));
297 
298 	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
299 	kvm_init_shadow_ept_mmu(vcpu,
300 			to_vmx(vcpu)->nested.msrs.ept_caps &
301 			VMX_EPT_EXECUTE_ONLY_BIT,
302 			nested_ept_ad_enabled(vcpu),
303 			nested_ept_get_cr3(vcpu));
304 	vcpu->arch.mmu->set_cr3           = vmx_set_cr3;
305 	vcpu->arch.mmu->get_cr3           = nested_ept_get_cr3;
306 	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
307 	vcpu->arch.mmu->get_pdptr         = kvm_pdptr_read;
308 
309 	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
310 }
311 
312 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
313 {
314 	vcpu->arch.mmu = &vcpu->arch.root_mmu;
315 	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
316 }
317 
318 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
319 					    u16 error_code)
320 {
321 	bool inequality, bit;
322 
323 	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
324 	inequality =
325 		(error_code & vmcs12->page_fault_error_code_mask) !=
326 		 vmcs12->page_fault_error_code_match;
327 	return inequality ^ bit;
328 }
329 
330 
331 /*
332  * KVM wants to inject page-faults which it got to the guest. This function
333  * checks whether in a nested guest, we need to inject them to L1 or L2.
334  */
335 static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
336 {
337 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
338 	unsigned int nr = vcpu->arch.exception.nr;
339 	bool has_payload = vcpu->arch.exception.has_payload;
340 	unsigned long payload = vcpu->arch.exception.payload;
341 
342 	if (nr == PF_VECTOR) {
343 		if (vcpu->arch.exception.nested_apf) {
344 			*exit_qual = vcpu->arch.apf.nested_apf_token;
345 			return 1;
346 		}
347 		if (nested_vmx_is_page_fault_vmexit(vmcs12,
348 						    vcpu->arch.exception.error_code)) {
349 			*exit_qual = has_payload ? payload : vcpu->arch.cr2;
350 			return 1;
351 		}
352 	} else if (vmcs12->exception_bitmap & (1u << nr)) {
353 		if (nr == DB_VECTOR) {
354 			if (!has_payload) {
355 				payload = vcpu->arch.dr6;
356 				payload &= ~(DR6_FIXED_1 | DR6_BT);
357 				payload ^= DR6_RTM;
358 			}
359 			*exit_qual = payload;
360 		} else
361 			*exit_qual = 0;
362 		return 1;
363 	}
364 
365 	return 0;
366 }
367 
368 
369 static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
370 		struct x86_exception *fault)
371 {
372 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
373 
374 	WARN_ON(!is_guest_mode(vcpu));
375 
376 	if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
377 		!to_vmx(vcpu)->nested.nested_run_pending) {
378 		vmcs12->vm_exit_intr_error_code = fault->error_code;
379 		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
380 				  PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
381 				  INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
382 				  fault->address);
383 	} else {
384 		kvm_inject_page_fault(vcpu, fault);
385 	}
386 }
387 
388 static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
389 {
390 	return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
391 }
392 
393 static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
394 					       struct vmcs12 *vmcs12)
395 {
396 	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
397 		return 0;
398 
399 	if (!page_address_valid(vcpu, vmcs12->io_bitmap_a) ||
400 	    !page_address_valid(vcpu, vmcs12->io_bitmap_b))
401 		return -EINVAL;
402 
403 	return 0;
404 }
405 
406 static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
407 						struct vmcs12 *vmcs12)
408 {
409 	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
410 		return 0;
411 
412 	if (!page_address_valid(vcpu, vmcs12->msr_bitmap))
413 		return -EINVAL;
414 
415 	return 0;
416 }
417 
418 static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
419 						struct vmcs12 *vmcs12)
420 {
421 	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
422 		return 0;
423 
424 	if (!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr))
425 		return -EINVAL;
426 
427 	return 0;
428 }
429 
430 /*
431  * Check if MSR is intercepted for L01 MSR bitmap.
432  */
433 static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
434 {
435 	unsigned long *msr_bitmap;
436 	int f = sizeof(unsigned long);
437 
438 	if (!cpu_has_vmx_msr_bitmap())
439 		return true;
440 
441 	msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
442 
443 	if (msr <= 0x1fff) {
444 		return !!test_bit(msr, msr_bitmap + 0x800 / f);
445 	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
446 		msr &= 0x1fff;
447 		return !!test_bit(msr, msr_bitmap + 0xc00 / f);
448 	}
449 
450 	return true;
451 }
452 
453 /*
454  * If a msr is allowed by L0, we should check whether it is allowed by L1.
455  * The corresponding bit will be cleared unless both of L0 and L1 allow it.
456  */
457 static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
458 					       unsigned long *msr_bitmap_nested,
459 					       u32 msr, int type)
460 {
461 	int f = sizeof(unsigned long);
462 
463 	/*
464 	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
465 	 * have the write-low and read-high bitmap offsets the wrong way round.
466 	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
467 	 */
468 	if (msr <= 0x1fff) {
469 		if (type & MSR_TYPE_R &&
470 		   !test_bit(msr, msr_bitmap_l1 + 0x000 / f))
471 			/* read-low */
472 			__clear_bit(msr, msr_bitmap_nested + 0x000 / f);
473 
474 		if (type & MSR_TYPE_W &&
475 		   !test_bit(msr, msr_bitmap_l1 + 0x800 / f))
476 			/* write-low */
477 			__clear_bit(msr, msr_bitmap_nested + 0x800 / f);
478 
479 	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
480 		msr &= 0x1fff;
481 		if (type & MSR_TYPE_R &&
482 		   !test_bit(msr, msr_bitmap_l1 + 0x400 / f))
483 			/* read-high */
484 			__clear_bit(msr, msr_bitmap_nested + 0x400 / f);
485 
486 		if (type & MSR_TYPE_W &&
487 		   !test_bit(msr, msr_bitmap_l1 + 0xc00 / f))
488 			/* write-high */
489 			__clear_bit(msr, msr_bitmap_nested + 0xc00 / f);
490 
491 	}
492 }
493 
494 static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) {
495 	int msr;
496 
497 	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
498 		unsigned word = msr / BITS_PER_LONG;
499 
500 		msr_bitmap[word] = ~0;
501 		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
502 	}
503 }
504 
505 /*
506  * Merge L0's and L1's MSR bitmap, return false to indicate that
507  * we do not use the hardware.
508  */
509 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
510 						 struct vmcs12 *vmcs12)
511 {
512 	int msr;
513 	unsigned long *msr_bitmap_l1;
514 	unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
515 	struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map;
516 
517 	/* Nothing to do if the MSR bitmap is not in use.  */
518 	if (!cpu_has_vmx_msr_bitmap() ||
519 	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
520 		return false;
521 
522 	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
523 		return false;
524 
525 	msr_bitmap_l1 = (unsigned long *)map->hva;
526 
527 	/*
528 	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
529 	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
530 	 * the x2APIC MSR range and selectively disable them below.
531 	 */
532 	enable_x2apic_msr_intercepts(msr_bitmap_l0);
533 
534 	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
535 		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
536 			/*
537 			 * L0 need not intercept reads for MSRs between 0x800
538 			 * and 0x8ff, it just lets the processor take the value
539 			 * from the virtual-APIC page; take those 256 bits
540 			 * directly from the L1 bitmap.
541 			 */
542 			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
543 				unsigned word = msr / BITS_PER_LONG;
544 
545 				msr_bitmap_l0[word] = msr_bitmap_l1[word];
546 			}
547 		}
548 
549 		nested_vmx_disable_intercept_for_msr(
550 			msr_bitmap_l1, msr_bitmap_l0,
551 			X2APIC_MSR(APIC_TASKPRI),
552 			MSR_TYPE_R | MSR_TYPE_W);
553 
554 		if (nested_cpu_has_vid(vmcs12)) {
555 			nested_vmx_disable_intercept_for_msr(
556 				msr_bitmap_l1, msr_bitmap_l0,
557 				X2APIC_MSR(APIC_EOI),
558 				MSR_TYPE_W);
559 			nested_vmx_disable_intercept_for_msr(
560 				msr_bitmap_l1, msr_bitmap_l0,
561 				X2APIC_MSR(APIC_SELF_IPI),
562 				MSR_TYPE_W);
563 		}
564 	}
565 
566 	/* KVM unconditionally exposes the FS/GS base MSRs to L1. */
567 	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
568 					     MSR_FS_BASE, MSR_TYPE_RW);
569 
570 	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
571 					     MSR_GS_BASE, MSR_TYPE_RW);
572 
573 	nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
574 					     MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
575 
576 	/*
577 	 * Checking the L0->L1 bitmap is trying to verify two things:
578 	 *
579 	 * 1. L0 gave a permission to L1 to actually passthrough the MSR. This
580 	 *    ensures that we do not accidentally generate an L02 MSR bitmap
581 	 *    from the L12 MSR bitmap that is too permissive.
582 	 * 2. That L1 or L2s have actually used the MSR. This avoids
583 	 *    unnecessarily merging of the bitmap if the MSR is unused. This
584 	 *    works properly because we only update the L01 MSR bitmap lazily.
585 	 *    So even if L0 should pass L1 these MSRs, the L01 bitmap is only
586 	 *    updated to reflect this when L1 (or its L2s) actually write to
587 	 *    the MSR.
588 	 */
589 	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL))
590 		nested_vmx_disable_intercept_for_msr(
591 					msr_bitmap_l1, msr_bitmap_l0,
592 					MSR_IA32_SPEC_CTRL,
593 					MSR_TYPE_R | MSR_TYPE_W);
594 
595 	if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD))
596 		nested_vmx_disable_intercept_for_msr(
597 					msr_bitmap_l1, msr_bitmap_l0,
598 					MSR_IA32_PRED_CMD,
599 					MSR_TYPE_W);
600 
601 	kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false);
602 
603 	return true;
604 }
605 
606 static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
607 				       struct vmcs12 *vmcs12)
608 {
609 	struct kvm_host_map map;
610 	struct vmcs12 *shadow;
611 
612 	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
613 	    vmcs12->vmcs_link_pointer == -1ull)
614 		return;
615 
616 	shadow = get_shadow_vmcs12(vcpu);
617 
618 	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
619 		return;
620 
621 	memcpy(shadow, map.hva, VMCS12_SIZE);
622 	kvm_vcpu_unmap(vcpu, &map, false);
623 }
624 
625 static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
626 					      struct vmcs12 *vmcs12)
627 {
628 	struct vcpu_vmx *vmx = to_vmx(vcpu);
629 
630 	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
631 	    vmcs12->vmcs_link_pointer == -1ull)
632 		return;
633 
634 	kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
635 			get_shadow_vmcs12(vcpu), VMCS12_SIZE);
636 }
637 
638 /*
639  * In nested virtualization, check if L1 has set
640  * VM_EXIT_ACK_INTR_ON_EXIT
641  */
642 static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
643 {
644 	return get_vmcs12(vcpu)->vm_exit_controls &
645 		VM_EXIT_ACK_INTR_ON_EXIT;
646 }
647 
648 static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
649 {
650 	return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
651 }
652 
653 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
654 					  struct vmcs12 *vmcs12)
655 {
656 	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
657 	    !page_address_valid(vcpu, vmcs12->apic_access_addr))
658 		return -EINVAL;
659 	else
660 		return 0;
661 }
662 
663 static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
664 					   struct vmcs12 *vmcs12)
665 {
666 	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
667 	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
668 	    !nested_cpu_has_vid(vmcs12) &&
669 	    !nested_cpu_has_posted_intr(vmcs12))
670 		return 0;
671 
672 	/*
673 	 * If virtualize x2apic mode is enabled,
674 	 * virtualize apic access must be disabled.
675 	 */
676 	if (nested_cpu_has_virt_x2apic_mode(vmcs12) &&
677 	    nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
678 		return -EINVAL;
679 
680 	/*
681 	 * If virtual interrupt delivery is enabled,
682 	 * we must exit on external interrupts.
683 	 */
684 	if (nested_cpu_has_vid(vmcs12) &&
685 	   !nested_exit_on_intr(vcpu))
686 		return -EINVAL;
687 
688 	/*
689 	 * bits 15:8 should be zero in posted_intr_nv,
690 	 * the descriptor address has been already checked
691 	 * in nested_get_vmcs12_pages.
692 	 *
693 	 * bits 5:0 of posted_intr_desc_addr should be zero.
694 	 */
695 	if (nested_cpu_has_posted_intr(vmcs12) &&
696 	   (!nested_cpu_has_vid(vmcs12) ||
697 	    !nested_exit_intr_ack_set(vcpu) ||
698 	    (vmcs12->posted_intr_nv & 0xff00) ||
699 	    (vmcs12->posted_intr_desc_addr & 0x3f) ||
700 	    (vmcs12->posted_intr_desc_addr >> cpuid_maxphyaddr(vcpu))))
701 		return -EINVAL;
702 
703 	/* tpr shadow is needed by all apicv features. */
704 	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
705 		return -EINVAL;
706 
707 	return 0;
708 }
709 
710 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
711 				       u32 count, u64 addr)
712 {
713 	int maxphyaddr;
714 
715 	if (count == 0)
716 		return 0;
717 	maxphyaddr = cpuid_maxphyaddr(vcpu);
718 	if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
719 	    (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr)
720 		return -EINVAL;
721 
722 	return 0;
723 }
724 
725 static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
726 						     struct vmcs12 *vmcs12)
727 {
728 	if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_load_count,
729 					vmcs12->vm_exit_msr_load_addr) ||
730 	    nested_vmx_check_msr_switch(vcpu, vmcs12->vm_exit_msr_store_count,
731 					vmcs12->vm_exit_msr_store_addr))
732 		return -EINVAL;
733 
734 	return 0;
735 }
736 
737 static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
738                                                       struct vmcs12 *vmcs12)
739 {
740 	if (nested_vmx_check_msr_switch(vcpu, vmcs12->vm_entry_msr_load_count,
741                                         vmcs12->vm_entry_msr_load_addr))
742                 return -EINVAL;
743 
744 	return 0;
745 }
746 
747 static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
748 					 struct vmcs12 *vmcs12)
749 {
750 	if (!nested_cpu_has_pml(vmcs12))
751 		return 0;
752 
753 	if (!nested_cpu_has_ept(vmcs12) ||
754 	    !page_address_valid(vcpu, vmcs12->pml_address))
755 		return -EINVAL;
756 
757 	return 0;
758 }
759 
760 static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
761 							struct vmcs12 *vmcs12)
762 {
763 	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
764 	    !nested_cpu_has_ept(vmcs12))
765 		return -EINVAL;
766 	return 0;
767 }
768 
769 static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
770 							 struct vmcs12 *vmcs12)
771 {
772 	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
773 	    !nested_cpu_has_ept(vmcs12))
774 		return -EINVAL;
775 	return 0;
776 }
777 
778 static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
779 						 struct vmcs12 *vmcs12)
780 {
781 	if (!nested_cpu_has_shadow_vmcs(vmcs12))
782 		return 0;
783 
784 	if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) ||
785 	    !page_address_valid(vcpu, vmcs12->vmwrite_bitmap))
786 		return -EINVAL;
787 
788 	return 0;
789 }
790 
791 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
792 				       struct vmx_msr_entry *e)
793 {
794 	/* x2APIC MSR accesses are not allowed */
795 	if (vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8)
796 		return -EINVAL;
797 	if (e->index == MSR_IA32_UCODE_WRITE || /* SDM Table 35-2 */
798 	    e->index == MSR_IA32_UCODE_REV)
799 		return -EINVAL;
800 	if (e->reserved != 0)
801 		return -EINVAL;
802 	return 0;
803 }
804 
805 static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
806 				     struct vmx_msr_entry *e)
807 {
808 	if (e->index == MSR_FS_BASE ||
809 	    e->index == MSR_GS_BASE ||
810 	    e->index == MSR_IA32_SMM_MONITOR_CTL || /* SMM is not supported */
811 	    nested_vmx_msr_check_common(vcpu, e))
812 		return -EINVAL;
813 	return 0;
814 }
815 
816 static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
817 				      struct vmx_msr_entry *e)
818 {
819 	if (e->index == MSR_IA32_SMBASE || /* SMM is not supported */
820 	    nested_vmx_msr_check_common(vcpu, e))
821 		return -EINVAL;
822 	return 0;
823 }
824 
825 /*
826  * Load guest's/host's msr at nested entry/exit.
827  * return 0 for success, entry index for failure.
828  */
829 static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
830 {
831 	u32 i;
832 	struct vmx_msr_entry e;
833 	struct msr_data msr;
834 
835 	msr.host_initiated = false;
836 	for (i = 0; i < count; i++) {
837 		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
838 					&e, sizeof(e))) {
839 			pr_debug_ratelimited(
840 				"%s cannot read MSR entry (%u, 0x%08llx)\n",
841 				__func__, i, gpa + i * sizeof(e));
842 			goto fail;
843 		}
844 		if (nested_vmx_load_msr_check(vcpu, &e)) {
845 			pr_debug_ratelimited(
846 				"%s check failed (%u, 0x%x, 0x%x)\n",
847 				__func__, i, e.index, e.reserved);
848 			goto fail;
849 		}
850 		msr.index = e.index;
851 		msr.data = e.value;
852 		if (kvm_set_msr(vcpu, &msr)) {
853 			pr_debug_ratelimited(
854 				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
855 				__func__, i, e.index, e.value);
856 			goto fail;
857 		}
858 	}
859 	return 0;
860 fail:
861 	return i + 1;
862 }
863 
864 static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
865 {
866 	u32 i;
867 	struct vmx_msr_entry e;
868 
869 	for (i = 0; i < count; i++) {
870 		struct msr_data msr_info;
871 		if (kvm_vcpu_read_guest(vcpu,
872 					gpa + i * sizeof(e),
873 					&e, 2 * sizeof(u32))) {
874 			pr_debug_ratelimited(
875 				"%s cannot read MSR entry (%u, 0x%08llx)\n",
876 				__func__, i, gpa + i * sizeof(e));
877 			return -EINVAL;
878 		}
879 		if (nested_vmx_store_msr_check(vcpu, &e)) {
880 			pr_debug_ratelimited(
881 				"%s check failed (%u, 0x%x, 0x%x)\n",
882 				__func__, i, e.index, e.reserved);
883 			return -EINVAL;
884 		}
885 		msr_info.host_initiated = false;
886 		msr_info.index = e.index;
887 		if (kvm_get_msr(vcpu, &msr_info)) {
888 			pr_debug_ratelimited(
889 				"%s cannot read MSR (%u, 0x%x)\n",
890 				__func__, i, e.index);
891 			return -EINVAL;
892 		}
893 		if (kvm_vcpu_write_guest(vcpu,
894 					 gpa + i * sizeof(e) +
895 					     offsetof(struct vmx_msr_entry, value),
896 					 &msr_info.data, sizeof(msr_info.data))) {
897 			pr_debug_ratelimited(
898 				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
899 				__func__, i, e.index, msr_info.data);
900 			return -EINVAL;
901 		}
902 	}
903 	return 0;
904 }
905 
906 static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
907 {
908 	unsigned long invalid_mask;
909 
910 	invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu);
911 	return (val & invalid_mask) == 0;
912 }
913 
914 /*
915  * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are
916  * emulating VM entry into a guest with EPT enabled.
917  * Returns 0 on success, 1 on failure. Invalid state exit qualification code
918  * is assigned to entry_failure_code on failure.
919  */
920 static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
921 			       u32 *entry_failure_code)
922 {
923 	if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
924 		if (!nested_cr3_valid(vcpu, cr3)) {
925 			*entry_failure_code = ENTRY_FAIL_DEFAULT;
926 			return -EINVAL;
927 		}
928 
929 		/*
930 		 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
931 		 * must not be dereferenced.
932 		 */
933 		if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) &&
934 		    !nested_ept) {
935 			if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) {
936 				*entry_failure_code = ENTRY_FAIL_PDPTE;
937 				return -EINVAL;
938 			}
939 		}
940 	}
941 
942 	if (!nested_ept)
943 		kvm_mmu_new_cr3(vcpu, cr3, false);
944 
945 	vcpu->arch.cr3 = cr3;
946 	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
947 
948 	kvm_init_mmu(vcpu, false);
949 
950 	return 0;
951 }
952 
953 /*
954  * Returns if KVM is able to config CPU to tag TLB entries
955  * populated by L2 differently than TLB entries populated
956  * by L1.
957  *
958  * If L1 uses EPT, then TLB entries are tagged with different EPTP.
959  *
960  * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
961  * with different VPID (L1 entries are tagged with vmx->vpid
962  * while L2 entries are tagged with vmx->nested.vpid02).
963  */
964 static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
965 {
966 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
967 
968 	return nested_cpu_has_ept(vmcs12) ||
969 	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
970 }
971 
972 static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
973 {
974 	struct vcpu_vmx *vmx = to_vmx(vcpu);
975 
976 	return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
977 }
978 
979 
980 static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
981 {
982 	return fixed_bits_valid(control, low, high);
983 }
984 
985 static inline u64 vmx_control_msr(u32 low, u32 high)
986 {
987 	return low | ((u64)high << 32);
988 }
989 
990 static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
991 {
992 	superset &= mask;
993 	subset &= mask;
994 
995 	return (superset | subset) == superset;
996 }
997 
998 static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
999 {
1000 	const u64 feature_and_reserved =
1001 		/* feature (except bit 48; see below) */
1002 		BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
1003 		/* reserved */
1004 		BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
1005 	u64 vmx_basic = vmx->nested.msrs.basic;
1006 
1007 	if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
1008 		return -EINVAL;
1009 
1010 	/*
1011 	 * KVM does not emulate a version of VMX that constrains physical
1012 	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
1013 	 */
1014 	if (data & BIT_ULL(48))
1015 		return -EINVAL;
1016 
1017 	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
1018 	    vmx_basic_vmcs_revision_id(data))
1019 		return -EINVAL;
1020 
1021 	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
1022 		return -EINVAL;
1023 
1024 	vmx->nested.msrs.basic = data;
1025 	return 0;
1026 }
1027 
1028 static int
1029 vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1030 {
1031 	u64 supported;
1032 	u32 *lowp, *highp;
1033 
1034 	switch (msr_index) {
1035 	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1036 		lowp = &vmx->nested.msrs.pinbased_ctls_low;
1037 		highp = &vmx->nested.msrs.pinbased_ctls_high;
1038 		break;
1039 	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1040 		lowp = &vmx->nested.msrs.procbased_ctls_low;
1041 		highp = &vmx->nested.msrs.procbased_ctls_high;
1042 		break;
1043 	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1044 		lowp = &vmx->nested.msrs.exit_ctls_low;
1045 		highp = &vmx->nested.msrs.exit_ctls_high;
1046 		break;
1047 	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1048 		lowp = &vmx->nested.msrs.entry_ctls_low;
1049 		highp = &vmx->nested.msrs.entry_ctls_high;
1050 		break;
1051 	case MSR_IA32_VMX_PROCBASED_CTLS2:
1052 		lowp = &vmx->nested.msrs.secondary_ctls_low;
1053 		highp = &vmx->nested.msrs.secondary_ctls_high;
1054 		break;
1055 	default:
1056 		BUG();
1057 	}
1058 
1059 	supported = vmx_control_msr(*lowp, *highp);
1060 
1061 	/* Check must-be-1 bits are still 1. */
1062 	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
1063 		return -EINVAL;
1064 
1065 	/* Check must-be-0 bits are still 0. */
1066 	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
1067 		return -EINVAL;
1068 
1069 	*lowp = data;
1070 	*highp = data >> 32;
1071 	return 0;
1072 }
1073 
1074 static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
1075 {
1076 	const u64 feature_and_reserved_bits =
1077 		/* feature */
1078 		BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
1079 		BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
1080 		/* reserved */
1081 		GENMASK_ULL(13, 9) | BIT_ULL(31);
1082 	u64 vmx_misc;
1083 
1084 	vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
1085 				   vmx->nested.msrs.misc_high);
1086 
1087 	if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
1088 		return -EINVAL;
1089 
1090 	if ((vmx->nested.msrs.pinbased_ctls_high &
1091 	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
1092 	    vmx_misc_preemption_timer_rate(data) !=
1093 	    vmx_misc_preemption_timer_rate(vmx_misc))
1094 		return -EINVAL;
1095 
1096 	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
1097 		return -EINVAL;
1098 
1099 	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
1100 		return -EINVAL;
1101 
1102 	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
1103 		return -EINVAL;
1104 
1105 	vmx->nested.msrs.misc_low = data;
1106 	vmx->nested.msrs.misc_high = data >> 32;
1107 
1108 	/*
1109 	 * If L1 has read-only VM-exit information fields, use the
1110 	 * less permissive vmx_vmwrite_bitmap to specify write
1111 	 * permissions for the shadow VMCS.
1112 	 */
1113 	if (enable_shadow_vmcs && !nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
1114 		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
1115 
1116 	return 0;
1117 }
1118 
1119 static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
1120 {
1121 	u64 vmx_ept_vpid_cap;
1122 
1123 	vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
1124 					   vmx->nested.msrs.vpid_caps);
1125 
1126 	/* Every bit is either reserved or a feature bit. */
1127 	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
1128 		return -EINVAL;
1129 
1130 	vmx->nested.msrs.ept_caps = data;
1131 	vmx->nested.msrs.vpid_caps = data >> 32;
1132 	return 0;
1133 }
1134 
1135 static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
1136 {
1137 	u64 *msr;
1138 
1139 	switch (msr_index) {
1140 	case MSR_IA32_VMX_CR0_FIXED0:
1141 		msr = &vmx->nested.msrs.cr0_fixed0;
1142 		break;
1143 	case MSR_IA32_VMX_CR4_FIXED0:
1144 		msr = &vmx->nested.msrs.cr4_fixed0;
1145 		break;
1146 	default:
1147 		BUG();
1148 	}
1149 
1150 	/*
1151 	 * 1 bits (which indicates bits which "must-be-1" during VMX operation)
1152 	 * must be 1 in the restored value.
1153 	 */
1154 	if (!is_bitwise_subset(data, *msr, -1ULL))
1155 		return -EINVAL;
1156 
1157 	*msr = data;
1158 	return 0;
1159 }
1160 
1161 /*
1162  * Called when userspace is restoring VMX MSRs.
1163  *
1164  * Returns 0 on success, non-0 otherwise.
1165  */
1166 int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1167 {
1168 	struct vcpu_vmx *vmx = to_vmx(vcpu);
1169 
1170 	/*
1171 	 * Don't allow changes to the VMX capability MSRs while the vCPU
1172 	 * is in VMX operation.
1173 	 */
1174 	if (vmx->nested.vmxon)
1175 		return -EBUSY;
1176 
1177 	switch (msr_index) {
1178 	case MSR_IA32_VMX_BASIC:
1179 		return vmx_restore_vmx_basic(vmx, data);
1180 	case MSR_IA32_VMX_PINBASED_CTLS:
1181 	case MSR_IA32_VMX_PROCBASED_CTLS:
1182 	case MSR_IA32_VMX_EXIT_CTLS:
1183 	case MSR_IA32_VMX_ENTRY_CTLS:
1184 		/*
1185 		 * The "non-true" VMX capability MSRs are generated from the
1186 		 * "true" MSRs, so we do not support restoring them directly.
1187 		 *
1188 		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
1189 		 * should restore the "true" MSRs with the must-be-1 bits
1190 		 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
1191 		 * DEFAULT SETTINGS".
1192 		 */
1193 		return -EINVAL;
1194 	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1195 	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1196 	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1197 	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1198 	case MSR_IA32_VMX_PROCBASED_CTLS2:
1199 		return vmx_restore_control_msr(vmx, msr_index, data);
1200 	case MSR_IA32_VMX_MISC:
1201 		return vmx_restore_vmx_misc(vmx, data);
1202 	case MSR_IA32_VMX_CR0_FIXED0:
1203 	case MSR_IA32_VMX_CR4_FIXED0:
1204 		return vmx_restore_fixed0_msr(vmx, msr_index, data);
1205 	case MSR_IA32_VMX_CR0_FIXED1:
1206 	case MSR_IA32_VMX_CR4_FIXED1:
1207 		/*
1208 		 * These MSRs are generated based on the vCPU's CPUID, so we
1209 		 * do not support restoring them directly.
1210 		 */
1211 		return -EINVAL;
1212 	case MSR_IA32_VMX_EPT_VPID_CAP:
1213 		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
1214 	case MSR_IA32_VMX_VMCS_ENUM:
1215 		vmx->nested.msrs.vmcs_enum = data;
1216 		return 0;
1217 	default:
1218 		/*
1219 		 * The rest of the VMX capability MSRs do not support restore.
1220 		 */
1221 		return -EINVAL;
1222 	}
1223 }
1224 
1225 /* Returns 0 on success, non-0 otherwise. */
1226 int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
1227 {
1228 	switch (msr_index) {
1229 	case MSR_IA32_VMX_BASIC:
1230 		*pdata = msrs->basic;
1231 		break;
1232 	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
1233 	case MSR_IA32_VMX_PINBASED_CTLS:
1234 		*pdata = vmx_control_msr(
1235 			msrs->pinbased_ctls_low,
1236 			msrs->pinbased_ctls_high);
1237 		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
1238 			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1239 		break;
1240 	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
1241 	case MSR_IA32_VMX_PROCBASED_CTLS:
1242 		*pdata = vmx_control_msr(
1243 			msrs->procbased_ctls_low,
1244 			msrs->procbased_ctls_high);
1245 		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
1246 			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
1247 		break;
1248 	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
1249 	case MSR_IA32_VMX_EXIT_CTLS:
1250 		*pdata = vmx_control_msr(
1251 			msrs->exit_ctls_low,
1252 			msrs->exit_ctls_high);
1253 		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
1254 			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
1255 		break;
1256 	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
1257 	case MSR_IA32_VMX_ENTRY_CTLS:
1258 		*pdata = vmx_control_msr(
1259 			msrs->entry_ctls_low,
1260 			msrs->entry_ctls_high);
1261 		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
1262 			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
1263 		break;
1264 	case MSR_IA32_VMX_MISC:
1265 		*pdata = vmx_control_msr(
1266 			msrs->misc_low,
1267 			msrs->misc_high);
1268 		break;
1269 	case MSR_IA32_VMX_CR0_FIXED0:
1270 		*pdata = msrs->cr0_fixed0;
1271 		break;
1272 	case MSR_IA32_VMX_CR0_FIXED1:
1273 		*pdata = msrs->cr0_fixed1;
1274 		break;
1275 	case MSR_IA32_VMX_CR4_FIXED0:
1276 		*pdata = msrs->cr4_fixed0;
1277 		break;
1278 	case MSR_IA32_VMX_CR4_FIXED1:
1279 		*pdata = msrs->cr4_fixed1;
1280 		break;
1281 	case MSR_IA32_VMX_VMCS_ENUM:
1282 		*pdata = msrs->vmcs_enum;
1283 		break;
1284 	case MSR_IA32_VMX_PROCBASED_CTLS2:
1285 		*pdata = vmx_control_msr(
1286 			msrs->secondary_ctls_low,
1287 			msrs->secondary_ctls_high);
1288 		break;
1289 	case MSR_IA32_VMX_EPT_VPID_CAP:
1290 		*pdata = msrs->ept_caps |
1291 			((u64)msrs->vpid_caps << 32);
1292 		break;
1293 	case MSR_IA32_VMX_VMFUNC:
1294 		*pdata = msrs->vmfunc_controls;
1295 		break;
1296 	default:
1297 		return 1;
1298 	}
1299 
1300 	return 0;
1301 }
1302 
1303 /*
1304  * Copy the writable VMCS shadow fields back to the VMCS12, in case
1305  * they have been modified by the L1 guest. Note that the "read-only"
1306  * VM-exit information fields are actually writable if the vCPU is
1307  * configured to support "VMWRITE to any supported field in the VMCS."
1308  */
1309 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
1310 {
1311 	const u16 *fields[] = {
1312 		shadow_read_write_fields,
1313 		shadow_read_only_fields
1314 	};
1315 	const int max_fields[] = {
1316 		max_shadow_read_write_fields,
1317 		max_shadow_read_only_fields
1318 	};
1319 	int i, q;
1320 	unsigned long field;
1321 	u64 field_value;
1322 	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1323 
1324 	preempt_disable();
1325 
1326 	vmcs_load(shadow_vmcs);
1327 
1328 	for (q = 0; q < ARRAY_SIZE(fields); q++) {
1329 		for (i = 0; i < max_fields[q]; i++) {
1330 			field = fields[q][i];
1331 			field_value = __vmcs_readl(field);
1332 			vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value);
1333 		}
1334 		/*
1335 		 * Skip the VM-exit information fields if they are read-only.
1336 		 */
1337 		if (!nested_cpu_has_vmwrite_any_field(&vmx->vcpu))
1338 			break;
1339 	}
1340 
1341 	vmcs_clear(shadow_vmcs);
1342 	vmcs_load(vmx->loaded_vmcs->vmcs);
1343 
1344 	preempt_enable();
1345 }
1346 
1347 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
1348 {
1349 	const u16 *fields[] = {
1350 		shadow_read_write_fields,
1351 		shadow_read_only_fields
1352 	};
1353 	const int max_fields[] = {
1354 		max_shadow_read_write_fields,
1355 		max_shadow_read_only_fields
1356 	};
1357 	int i, q;
1358 	unsigned long field;
1359 	u64 field_value = 0;
1360 	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
1361 
1362 	vmcs_load(shadow_vmcs);
1363 
1364 	for (q = 0; q < ARRAY_SIZE(fields); q++) {
1365 		for (i = 0; i < max_fields[q]; i++) {
1366 			field = fields[q][i];
1367 			vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value);
1368 			__vmcs_writel(field, field_value);
1369 		}
1370 	}
1371 
1372 	vmcs_clear(shadow_vmcs);
1373 	vmcs_load(vmx->loaded_vmcs->vmcs);
1374 }
1375 
1376 static int copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx)
1377 {
1378 	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1379 	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1380 
1381 	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
1382 	vmcs12->tpr_threshold = evmcs->tpr_threshold;
1383 	vmcs12->guest_rip = evmcs->guest_rip;
1384 
1385 	if (unlikely(!(evmcs->hv_clean_fields &
1386 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
1387 		vmcs12->guest_rsp = evmcs->guest_rsp;
1388 		vmcs12->guest_rflags = evmcs->guest_rflags;
1389 		vmcs12->guest_interruptibility_info =
1390 			evmcs->guest_interruptibility_info;
1391 	}
1392 
1393 	if (unlikely(!(evmcs->hv_clean_fields &
1394 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
1395 		vmcs12->cpu_based_vm_exec_control =
1396 			evmcs->cpu_based_vm_exec_control;
1397 	}
1398 
1399 	if (unlikely(!(evmcs->hv_clean_fields &
1400 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
1401 		vmcs12->exception_bitmap = evmcs->exception_bitmap;
1402 	}
1403 
1404 	if (unlikely(!(evmcs->hv_clean_fields &
1405 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
1406 		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
1407 	}
1408 
1409 	if (unlikely(!(evmcs->hv_clean_fields &
1410 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
1411 		vmcs12->vm_entry_intr_info_field =
1412 			evmcs->vm_entry_intr_info_field;
1413 		vmcs12->vm_entry_exception_error_code =
1414 			evmcs->vm_entry_exception_error_code;
1415 		vmcs12->vm_entry_instruction_len =
1416 			evmcs->vm_entry_instruction_len;
1417 	}
1418 
1419 	if (unlikely(!(evmcs->hv_clean_fields &
1420 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
1421 		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
1422 		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
1423 		vmcs12->host_cr0 = evmcs->host_cr0;
1424 		vmcs12->host_cr3 = evmcs->host_cr3;
1425 		vmcs12->host_cr4 = evmcs->host_cr4;
1426 		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
1427 		vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
1428 		vmcs12->host_rip = evmcs->host_rip;
1429 		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
1430 		vmcs12->host_es_selector = evmcs->host_es_selector;
1431 		vmcs12->host_cs_selector = evmcs->host_cs_selector;
1432 		vmcs12->host_ss_selector = evmcs->host_ss_selector;
1433 		vmcs12->host_ds_selector = evmcs->host_ds_selector;
1434 		vmcs12->host_fs_selector = evmcs->host_fs_selector;
1435 		vmcs12->host_gs_selector = evmcs->host_gs_selector;
1436 		vmcs12->host_tr_selector = evmcs->host_tr_selector;
1437 	}
1438 
1439 	if (unlikely(!(evmcs->hv_clean_fields &
1440 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
1441 		vmcs12->pin_based_vm_exec_control =
1442 			evmcs->pin_based_vm_exec_control;
1443 		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
1444 		vmcs12->secondary_vm_exec_control =
1445 			evmcs->secondary_vm_exec_control;
1446 	}
1447 
1448 	if (unlikely(!(evmcs->hv_clean_fields &
1449 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
1450 		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
1451 		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
1452 	}
1453 
1454 	if (unlikely(!(evmcs->hv_clean_fields &
1455 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
1456 		vmcs12->msr_bitmap = evmcs->msr_bitmap;
1457 	}
1458 
1459 	if (unlikely(!(evmcs->hv_clean_fields &
1460 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
1461 		vmcs12->guest_es_base = evmcs->guest_es_base;
1462 		vmcs12->guest_cs_base = evmcs->guest_cs_base;
1463 		vmcs12->guest_ss_base = evmcs->guest_ss_base;
1464 		vmcs12->guest_ds_base = evmcs->guest_ds_base;
1465 		vmcs12->guest_fs_base = evmcs->guest_fs_base;
1466 		vmcs12->guest_gs_base = evmcs->guest_gs_base;
1467 		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
1468 		vmcs12->guest_tr_base = evmcs->guest_tr_base;
1469 		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
1470 		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
1471 		vmcs12->guest_es_limit = evmcs->guest_es_limit;
1472 		vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
1473 		vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
1474 		vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
1475 		vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
1476 		vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
1477 		vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
1478 		vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
1479 		vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
1480 		vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
1481 		vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
1482 		vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
1483 		vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
1484 		vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
1485 		vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
1486 		vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
1487 		vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
1488 		vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
1489 		vmcs12->guest_es_selector = evmcs->guest_es_selector;
1490 		vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
1491 		vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
1492 		vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
1493 		vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
1494 		vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
1495 		vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
1496 		vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
1497 	}
1498 
1499 	if (unlikely(!(evmcs->hv_clean_fields &
1500 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
1501 		vmcs12->tsc_offset = evmcs->tsc_offset;
1502 		vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
1503 		vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
1504 	}
1505 
1506 	if (unlikely(!(evmcs->hv_clean_fields &
1507 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
1508 		vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
1509 		vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
1510 		vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
1511 		vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
1512 		vmcs12->guest_cr0 = evmcs->guest_cr0;
1513 		vmcs12->guest_cr3 = evmcs->guest_cr3;
1514 		vmcs12->guest_cr4 = evmcs->guest_cr4;
1515 		vmcs12->guest_dr7 = evmcs->guest_dr7;
1516 	}
1517 
1518 	if (unlikely(!(evmcs->hv_clean_fields &
1519 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
1520 		vmcs12->host_fs_base = evmcs->host_fs_base;
1521 		vmcs12->host_gs_base = evmcs->host_gs_base;
1522 		vmcs12->host_tr_base = evmcs->host_tr_base;
1523 		vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
1524 		vmcs12->host_idtr_base = evmcs->host_idtr_base;
1525 		vmcs12->host_rsp = evmcs->host_rsp;
1526 	}
1527 
1528 	if (unlikely(!(evmcs->hv_clean_fields &
1529 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
1530 		vmcs12->ept_pointer = evmcs->ept_pointer;
1531 		vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
1532 	}
1533 
1534 	if (unlikely(!(evmcs->hv_clean_fields &
1535 		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
1536 		vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
1537 		vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
1538 		vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
1539 		vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
1540 		vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
1541 		vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
1542 		vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
1543 		vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
1544 		vmcs12->guest_pending_dbg_exceptions =
1545 			evmcs->guest_pending_dbg_exceptions;
1546 		vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
1547 		vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
1548 		vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
1549 		vmcs12->guest_activity_state = evmcs->guest_activity_state;
1550 		vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
1551 	}
1552 
1553 	/*
1554 	 * Not used?
1555 	 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
1556 	 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
1557 	 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
1558 	 * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
1559 	 * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
1560 	 * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
1561 	 * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
1562 	 * vmcs12->page_fault_error_code_mask =
1563 	 *		evmcs->page_fault_error_code_mask;
1564 	 * vmcs12->page_fault_error_code_match =
1565 	 *		evmcs->page_fault_error_code_match;
1566 	 * vmcs12->cr3_target_count = evmcs->cr3_target_count;
1567 	 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count;
1568 	 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count;
1569 	 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count;
1570 	 */
1571 
1572 	/*
1573 	 * Read only fields:
1574 	 * vmcs12->guest_physical_address = evmcs->guest_physical_address;
1575 	 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error;
1576 	 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason;
1577 	 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info;
1578 	 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code;
1579 	 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field;
1580 	 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code;
1581 	 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len;
1582 	 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info;
1583 	 * vmcs12->exit_qualification = evmcs->exit_qualification;
1584 	 * vmcs12->guest_linear_address = evmcs->guest_linear_address;
1585 	 *
1586 	 * Not present in struct vmcs12:
1587 	 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx;
1588 	 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi;
1589 	 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi;
1590 	 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip;
1591 	 */
1592 
1593 	return 0;
1594 }
1595 
1596 static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
1597 {
1598 	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
1599 	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
1600 
1601 	/*
1602 	 * Should not be changed by KVM:
1603 	 *
1604 	 * evmcs->host_es_selector = vmcs12->host_es_selector;
1605 	 * evmcs->host_cs_selector = vmcs12->host_cs_selector;
1606 	 * evmcs->host_ss_selector = vmcs12->host_ss_selector;
1607 	 * evmcs->host_ds_selector = vmcs12->host_ds_selector;
1608 	 * evmcs->host_fs_selector = vmcs12->host_fs_selector;
1609 	 * evmcs->host_gs_selector = vmcs12->host_gs_selector;
1610 	 * evmcs->host_tr_selector = vmcs12->host_tr_selector;
1611 	 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat;
1612 	 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer;
1613 	 * evmcs->host_cr0 = vmcs12->host_cr0;
1614 	 * evmcs->host_cr3 = vmcs12->host_cr3;
1615 	 * evmcs->host_cr4 = vmcs12->host_cr4;
1616 	 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp;
1617 	 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip;
1618 	 * evmcs->host_rip = vmcs12->host_rip;
1619 	 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs;
1620 	 * evmcs->host_fs_base = vmcs12->host_fs_base;
1621 	 * evmcs->host_gs_base = vmcs12->host_gs_base;
1622 	 * evmcs->host_tr_base = vmcs12->host_tr_base;
1623 	 * evmcs->host_gdtr_base = vmcs12->host_gdtr_base;
1624 	 * evmcs->host_idtr_base = vmcs12->host_idtr_base;
1625 	 * evmcs->host_rsp = vmcs12->host_rsp;
1626 	 * sync_vmcs12() doesn't read these:
1627 	 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a;
1628 	 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b;
1629 	 * evmcs->msr_bitmap = vmcs12->msr_bitmap;
1630 	 * evmcs->ept_pointer = vmcs12->ept_pointer;
1631 	 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap;
1632 	 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
1633 	 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
1634 	 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
1635 	 * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
1636 	 * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
1637 	 * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
1638 	 * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
1639 	 * evmcs->tpr_threshold = vmcs12->tpr_threshold;
1640 	 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
1641 	 * evmcs->exception_bitmap = vmcs12->exception_bitmap;
1642 	 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer;
1643 	 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control;
1644 	 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls;
1645 	 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control;
1646 	 * evmcs->page_fault_error_code_mask =
1647 	 *		vmcs12->page_fault_error_code_mask;
1648 	 * evmcs->page_fault_error_code_match =
1649 	 *		vmcs12->page_fault_error_code_match;
1650 	 * evmcs->cr3_target_count = vmcs12->cr3_target_count;
1651 	 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr;
1652 	 * evmcs->tsc_offset = vmcs12->tsc_offset;
1653 	 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl;
1654 	 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask;
1655 	 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask;
1656 	 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow;
1657 	 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow;
1658 	 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
1659 	 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
1660 	 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
1661 	 *
1662 	 * Not present in struct vmcs12:
1663 	 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
1664 	 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
1665 	 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
1666 	 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
1667 	 */
1668 
1669 	evmcs->guest_es_selector = vmcs12->guest_es_selector;
1670 	evmcs->guest_cs_selector = vmcs12->guest_cs_selector;
1671 	evmcs->guest_ss_selector = vmcs12->guest_ss_selector;
1672 	evmcs->guest_ds_selector = vmcs12->guest_ds_selector;
1673 	evmcs->guest_fs_selector = vmcs12->guest_fs_selector;
1674 	evmcs->guest_gs_selector = vmcs12->guest_gs_selector;
1675 	evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector;
1676 	evmcs->guest_tr_selector = vmcs12->guest_tr_selector;
1677 
1678 	evmcs->guest_es_limit = vmcs12->guest_es_limit;
1679 	evmcs->guest_cs_limit = vmcs12->guest_cs_limit;
1680 	evmcs->guest_ss_limit = vmcs12->guest_ss_limit;
1681 	evmcs->guest_ds_limit = vmcs12->guest_ds_limit;
1682 	evmcs->guest_fs_limit = vmcs12->guest_fs_limit;
1683 	evmcs->guest_gs_limit = vmcs12->guest_gs_limit;
1684 	evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit;
1685 	evmcs->guest_tr_limit = vmcs12->guest_tr_limit;
1686 	evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit;
1687 	evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit;
1688 
1689 	evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes;
1690 	evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes;
1691 	evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes;
1692 	evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes;
1693 	evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes;
1694 	evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes;
1695 	evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes;
1696 	evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes;
1697 
1698 	evmcs->guest_es_base = vmcs12->guest_es_base;
1699 	evmcs->guest_cs_base = vmcs12->guest_cs_base;
1700 	evmcs->guest_ss_base = vmcs12->guest_ss_base;
1701 	evmcs->guest_ds_base = vmcs12->guest_ds_base;
1702 	evmcs->guest_fs_base = vmcs12->guest_fs_base;
1703 	evmcs->guest_gs_base = vmcs12->guest_gs_base;
1704 	evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base;
1705 	evmcs->guest_tr_base = vmcs12->guest_tr_base;
1706 	evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base;
1707 	evmcs->guest_idtr_base = vmcs12->guest_idtr_base;
1708 
1709 	evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat;
1710 	evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer;
1711 
1712 	evmcs->guest_pdptr0 = vmcs12->guest_pdptr0;
1713 	evmcs->guest_pdptr1 = vmcs12->guest_pdptr1;
1714 	evmcs->guest_pdptr2 = vmcs12->guest_pdptr2;
1715 	evmcs->guest_pdptr3 = vmcs12->guest_pdptr3;
1716 
1717 	evmcs->guest_pending_dbg_exceptions =
1718 		vmcs12->guest_pending_dbg_exceptions;
1719 	evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp;
1720 	evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip;
1721 
1722 	evmcs->guest_activity_state = vmcs12->guest_activity_state;
1723 	evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs;
1724 
1725 	evmcs->guest_cr0 = vmcs12->guest_cr0;
1726 	evmcs->guest_cr3 = vmcs12->guest_cr3;
1727 	evmcs->guest_cr4 = vmcs12->guest_cr4;
1728 	evmcs->guest_dr7 = vmcs12->guest_dr7;
1729 
1730 	evmcs->guest_physical_address = vmcs12->guest_physical_address;
1731 
1732 	evmcs->vm_instruction_error = vmcs12->vm_instruction_error;
1733 	evmcs->vm_exit_reason = vmcs12->vm_exit_reason;
1734 	evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info;
1735 	evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code;
1736 	evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field;
1737 	evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code;
1738 	evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len;
1739 	evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info;
1740 
1741 	evmcs->exit_qualification = vmcs12->exit_qualification;
1742 
1743 	evmcs->guest_linear_address = vmcs12->guest_linear_address;
1744 	evmcs->guest_rsp = vmcs12->guest_rsp;
1745 	evmcs->guest_rflags = vmcs12->guest_rflags;
1746 
1747 	evmcs->guest_interruptibility_info =
1748 		vmcs12->guest_interruptibility_info;
1749 	evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control;
1750 	evmcs->vm_entry_controls = vmcs12->vm_entry_controls;
1751 	evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field;
1752 	evmcs->vm_entry_exception_error_code =
1753 		vmcs12->vm_entry_exception_error_code;
1754 	evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;
1755 
1756 	evmcs->guest_rip = vmcs12->guest_rip;
1757 
1758 	evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;
1759 
1760 	return 0;
1761 }
1762 
1763 /*
1764  * This is an equivalent of the nested hypervisor executing the vmptrld
1765  * instruction.
1766  */
1767 static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
1768 						 bool from_launch)
1769 {
1770 	struct vcpu_vmx *vmx = to_vmx(vcpu);
1771 	struct hv_vp_assist_page assist_page;
1772 
1773 	if (likely(!vmx->nested.enlightened_vmcs_enabled))
1774 		return 1;
1775 
1776 	if (unlikely(!kvm_hv_get_assist_page(vcpu, &assist_page)))
1777 		return 1;
1778 
1779 	if (unlikely(!assist_page.enlighten_vmentry))
1780 		return 1;
1781 
1782 	if (unlikely(assist_page.current_nested_vmcs !=
1783 		     vmx->nested.hv_evmcs_vmptr)) {
1784 
1785 		if (!vmx->nested.hv_evmcs)
1786 			vmx->nested.current_vmptr = -1ull;
1787 
1788 		nested_release_evmcs(vcpu);
1789 
1790 		if (kvm_vcpu_map(vcpu, gpa_to_gfn(assist_page.current_nested_vmcs),
1791 				 &vmx->nested.hv_evmcs_map))
1792 			return 0;
1793 
1794 		vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
1795 
1796 		/*
1797 		 * Currently, KVM only supports eVMCS version 1
1798 		 * (== KVM_EVMCS_VERSION) and thus we expect guest to set this
1799 		 * value to first u32 field of eVMCS which should specify eVMCS
1800 		 * VersionNumber.
1801 		 *
1802 		 * Guest should be aware of supported eVMCS versions by host by
1803 		 * examining CPUID.0x4000000A.EAX[0:15]. Host userspace VMM is
1804 		 * expected to set this CPUID leaf according to the value
1805 		 * returned in vmcs_version from nested_enable_evmcs().
1806 		 *
1807 		 * However, it turns out that Microsoft Hyper-V fails to comply
1808 		 * to their own invented interface: When Hyper-V use eVMCS, it
1809 		 * just sets first u32 field of eVMCS to revision_id specified
1810 		 * in MSR_IA32_VMX_BASIC. Instead of used eVMCS version number
1811 		 * which is one of the supported versions specified in
1812 		 * CPUID.0x4000000A.EAX[0:15].
1813 		 *
1814 		 * To overcome Hyper-V bug, we accept here either a supported
1815 		 * eVMCS version or VMCS12 revision_id as valid values for first
1816 		 * u32 field of eVMCS.
1817 		 */
1818 		if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
1819 		    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
1820 			nested_release_evmcs(vcpu);
1821 			return 0;
1822 		}
1823 
1824 		vmx->nested.dirty_vmcs12 = true;
1825 		/*
1826 		 * As we keep L2 state for one guest only 'hv_clean_fields' mask
1827 		 * can't be used when we switch between them. Reset it here for
1828 		 * simplicity.
1829 		 */
1830 		vmx->nested.hv_evmcs->hv_clean_fields &=
1831 			~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
1832 		vmx->nested.hv_evmcs_vmptr = assist_page.current_nested_vmcs;
1833 
1834 		/*
1835 		 * Unlike normal vmcs12, enlightened vmcs12 is not fully
1836 		 * reloaded from guest's memory (read only fields, fields not
1837 		 * present in struct hv_enlightened_vmcs, ...). Make sure there
1838 		 * are no leftovers.
1839 		 */
1840 		if (from_launch) {
1841 			struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1842 			memset(vmcs12, 0, sizeof(*vmcs12));
1843 			vmcs12->hdr.revision_id = VMCS12_REVISION;
1844 		}
1845 
1846 	}
1847 	return 1;
1848 }
1849 
1850 void nested_sync_from_vmcs12(struct kvm_vcpu *vcpu)
1851 {
1852 	struct vcpu_vmx *vmx = to_vmx(vcpu);
1853 
1854 	/*
1855 	 * hv_evmcs may end up being not mapped after migration (when
1856 	 * L2 was running), map it here to make sure vmcs12 changes are
1857 	 * properly reflected.
1858 	 */
1859 	if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs)
1860 		nested_vmx_handle_enlightened_vmptrld(vcpu, false);
1861 
1862 	if (vmx->nested.hv_evmcs) {
1863 		copy_vmcs12_to_enlightened(vmx);
1864 		/* All fields are clean */
1865 		vmx->nested.hv_evmcs->hv_clean_fields |=
1866 			HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
1867 	} else {
1868 		copy_vmcs12_to_shadow(vmx);
1869 	}
1870 
1871 	vmx->nested.need_vmcs12_sync = false;
1872 }
1873 
1874 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer)
1875 {
1876 	struct vcpu_vmx *vmx =
1877 		container_of(timer, struct vcpu_vmx, nested.preemption_timer);
1878 
1879 	vmx->nested.preemption_timer_expired = true;
1880 	kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
1881 	kvm_vcpu_kick(&vmx->vcpu);
1882 
1883 	return HRTIMER_NORESTART;
1884 }
1885 
1886 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
1887 {
1888 	u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
1889 	struct vcpu_vmx *vmx = to_vmx(vcpu);
1890 
1891 	/*
1892 	 * A timer value of zero is architecturally guaranteed to cause
1893 	 * a VMExit prior to executing any instructions in the guest.
1894 	 */
1895 	if (preemption_timeout == 0) {
1896 		vmx_preemption_timer_fn(&vmx->nested.preemption_timer);
1897 		return;
1898 	}
1899 
1900 	if (vcpu->arch.virtual_tsc_khz == 0)
1901 		return;
1902 
1903 	preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
1904 	preemption_timeout *= 1000000;
1905 	do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
1906 	hrtimer_start(&vmx->nested.preemption_timer,
1907 		      ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
1908 }
1909 
1910 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
1911 {
1912 	if (vmx->nested.nested_run_pending &&
1913 	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
1914 		return vmcs12->guest_ia32_efer;
1915 	else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
1916 		return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME);
1917 	else
1918 		return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME);
1919 }
1920 
1921 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx)
1922 {
1923 	/*
1924 	 * If vmcs02 hasn't been initialized, set the constant vmcs02 state
1925 	 * according to L0's settings (vmcs12 is irrelevant here).  Host
1926 	 * fields that come from L0 and are not constant, e.g. HOST_CR3,
1927 	 * will be set as needed prior to VMLAUNCH/VMRESUME.
1928 	 */
1929 	if (vmx->nested.vmcs02_initialized)
1930 		return;
1931 	vmx->nested.vmcs02_initialized = true;
1932 
1933 	/*
1934 	 * We don't care what the EPTP value is we just need to guarantee
1935 	 * it's valid so we don't get a false positive when doing early
1936 	 * consistency checks.
1937 	 */
1938 	if (enable_ept && nested_early_check)
1939 		vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0));
1940 
1941 	/* All VMFUNCs are currently emulated through L0 vmexits.  */
1942 	if (cpu_has_vmx_vmfunc())
1943 		vmcs_write64(VM_FUNCTION_CONTROL, 0);
1944 
1945 	if (cpu_has_vmx_posted_intr())
1946 		vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR);
1947 
1948 	if (cpu_has_vmx_msr_bitmap())
1949 		vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
1950 
1951 	if (enable_pml)
1952 		vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
1953 
1954 	/*
1955 	 * Set the MSR load/store lists to match L0's settings.  Only the
1956 	 * addresses are constant (for vmcs02), the counts can change based
1957 	 * on L2's behavior, e.g. switching to/from long mode.
1958 	 */
1959 	vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
1960 	vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
1961 	vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
1962 
1963 	vmx_set_constant_host_state(vmx);
1964 }
1965 
1966 static void prepare_vmcs02_early_full(struct vcpu_vmx *vmx,
1967 				      struct vmcs12 *vmcs12)
1968 {
1969 	prepare_vmcs02_constant_state(vmx);
1970 
1971 	vmcs_write64(VMCS_LINK_POINTER, -1ull);
1972 
1973 	if (enable_vpid) {
1974 		if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02)
1975 			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
1976 		else
1977 			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
1978 	}
1979 }
1980 
1981 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
1982 {
1983 	u32 exec_control, vmcs12_exec_ctrl;
1984 	u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
1985 
1986 	if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs)
1987 		prepare_vmcs02_early_full(vmx, vmcs12);
1988 
1989 	/*
1990 	 * PIN CONTROLS
1991 	 */
1992 	exec_control = vmcs12->pin_based_vm_exec_control;
1993 
1994 	/* Preemption timer setting is computed directly in vmx_vcpu_run.  */
1995 	exec_control |= vmcs_config.pin_based_exec_ctrl;
1996 	exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
1997 	vmx->loaded_vmcs->hv_timer_armed = false;
1998 
1999 	/* Posted interrupts setting is only taken from vmcs12.  */
2000 	if (nested_cpu_has_posted_intr(vmcs12)) {
2001 		vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
2002 		vmx->nested.pi_pending = false;
2003 	} else {
2004 		exec_control &= ~PIN_BASED_POSTED_INTR;
2005 	}
2006 	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
2007 
2008 	/*
2009 	 * EXEC CONTROLS
2010 	 */
2011 	exec_control = vmx_exec_control(vmx); /* L0's desires */
2012 	exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
2013 	exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2014 	exec_control &= ~CPU_BASED_TPR_SHADOW;
2015 	exec_control |= vmcs12->cpu_based_vm_exec_control;
2016 
2017 	/*
2018 	 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
2019 	 * nested_get_vmcs12_pages can't fix it up, the illegal value
2020 	 * will result in a VM entry failure.
2021 	 */
2022 	if (exec_control & CPU_BASED_TPR_SHADOW) {
2023 		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
2024 		vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
2025 	} else {
2026 #ifdef CONFIG_X86_64
2027 		exec_control |= CPU_BASED_CR8_LOAD_EXITING |
2028 				CPU_BASED_CR8_STORE_EXITING;
2029 #endif
2030 	}
2031 
2032 	/*
2033 	 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed
2034 	 * for I/O port accesses.
2035 	 */
2036 	exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
2037 	exec_control |= CPU_BASED_UNCOND_IO_EXITING;
2038 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
2039 
2040 	/*
2041 	 * SECONDARY EXEC CONTROLS
2042 	 */
2043 	if (cpu_has_secondary_exec_ctrls()) {
2044 		exec_control = vmx->secondary_exec_control;
2045 
2046 		/* Take the following fields only from vmcs12 */
2047 		exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
2048 				  SECONDARY_EXEC_ENABLE_INVPCID |
2049 				  SECONDARY_EXEC_RDTSCP |
2050 				  SECONDARY_EXEC_XSAVES |
2051 				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
2052 				  SECONDARY_EXEC_APIC_REGISTER_VIRT |
2053 				  SECONDARY_EXEC_ENABLE_VMFUNC);
2054 		if (nested_cpu_has(vmcs12,
2055 				   CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
2056 			vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
2057 				~SECONDARY_EXEC_ENABLE_PML;
2058 			exec_control |= vmcs12_exec_ctrl;
2059 		}
2060 
2061 		/* VMCS shadowing for L2 is emulated for now */
2062 		exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
2063 
2064 		if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
2065 			vmcs_write16(GUEST_INTR_STATUS,
2066 				vmcs12->guest_intr_status);
2067 
2068 		/*
2069 		 * Write an illegal value to APIC_ACCESS_ADDR. Later,
2070 		 * nested_get_vmcs12_pages will either fix it up or
2071 		 * remove the VM execution control.
2072 		 */
2073 		if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
2074 			vmcs_write64(APIC_ACCESS_ADDR, -1ull);
2075 
2076 		if (exec_control & SECONDARY_EXEC_ENCLS_EXITING)
2077 			vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
2078 
2079 		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
2080 	}
2081 
2082 	/*
2083 	 * ENTRY CONTROLS
2084 	 *
2085 	 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE
2086 	 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
2087 	 * on the related bits (if supported by the CPU) in the hope that
2088 	 * we can avoid VMWrites during vmx_set_efer().
2089 	 */
2090 	exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) &
2091 			~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
2092 	if (cpu_has_load_ia32_efer()) {
2093 		if (guest_efer & EFER_LMA)
2094 			exec_control |= VM_ENTRY_IA32E_MODE;
2095 		if (guest_efer != host_efer)
2096 			exec_control |= VM_ENTRY_LOAD_IA32_EFER;
2097 	}
2098 	vm_entry_controls_init(vmx, exec_control);
2099 
2100 	/*
2101 	 * EXIT CONTROLS
2102 	 *
2103 	 * L2->L1 exit controls are emulated - the hardware exit is to L0 so
2104 	 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
2105 	 * bits may be modified by vmx_set_efer() in prepare_vmcs02().
2106 	 */
2107 	exec_control = vmx_vmexit_ctrl();
2108 	if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
2109 		exec_control |= VM_EXIT_LOAD_IA32_EFER;
2110 	vm_exit_controls_init(vmx, exec_control);
2111 
2112 	/*
2113 	 * Conceptually we want to copy the PML address and index from
2114 	 * vmcs01 here, and then back to vmcs01 on nested vmexit. But,
2115 	 * since we always flush the log on each vmexit and never change
2116 	 * the PML address (once set), this happens to be equivalent to
2117 	 * simply resetting the index in vmcs02.
2118 	 */
2119 	if (enable_pml)
2120 		vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
2121 
2122 	/*
2123 	 * Interrupt/Exception Fields
2124 	 */
2125 	if (vmx->nested.nested_run_pending) {
2126 		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2127 			     vmcs12->vm_entry_intr_info_field);
2128 		vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
2129 			     vmcs12->vm_entry_exception_error_code);
2130 		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
2131 			     vmcs12->vm_entry_instruction_len);
2132 		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
2133 			     vmcs12->guest_interruptibility_info);
2134 		vmx->loaded_vmcs->nmi_known_unmasked =
2135 			!(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI);
2136 	} else {
2137 		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
2138 	}
2139 }
2140 
2141 static void prepare_vmcs02_full(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
2142 {
2143 	struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2144 
2145 	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2146 			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
2147 		vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
2148 		vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
2149 		vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
2150 		vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
2151 		vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
2152 		vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
2153 		vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
2154 		vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
2155 		vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
2156 		vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
2157 		vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
2158 		vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
2159 		vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
2160 		vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
2161 		vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
2162 		vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
2163 		vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
2164 		vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
2165 		vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
2166 		vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
2167 		vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
2168 		vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
2169 		vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
2170 		vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
2171 		vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
2172 		vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
2173 		vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
2174 		vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
2175 		vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
2176 		vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
2177 		vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
2178 		vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
2179 		vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
2180 		vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
2181 	}
2182 
2183 	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2184 			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) {
2185 		vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
2186 		vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
2187 			    vmcs12->guest_pending_dbg_exceptions);
2188 		vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
2189 		vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
2190 
2191 		/*
2192 		 * L1 may access the L2's PDPTR, so save them to construct
2193 		 * vmcs12
2194 		 */
2195 		if (enable_ept) {
2196 			vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
2197 			vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
2198 			vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
2199 			vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
2200 		}
2201 	}
2202 
2203 	if (nested_cpu_has_xsaves(vmcs12))
2204 		vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
2205 
2206 	/*
2207 	 * Whether page-faults are trapped is determined by a combination of
2208 	 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
2209 	 * If enable_ept, L0 doesn't care about page faults and we should
2210 	 * set all of these to L1's desires. However, if !enable_ept, L0 does
2211 	 * care about (at least some) page faults, and because it is not easy
2212 	 * (if at all possible?) to merge L0 and L1's desires, we simply ask
2213 	 * to exit on each and every L2 page fault. This is done by setting
2214 	 * MASK=MATCH=0 and (see below) EB.PF=1.
2215 	 * Note that below we don't need special code to set EB.PF beyond the
2216 	 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
2217 	 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
2218 	 * !enable_ept, EB.PF is 1, so the "or" will always be 1.
2219 	 */
2220 	vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
2221 		enable_ept ? vmcs12->page_fault_error_code_mask : 0);
2222 	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
2223 		enable_ept ? vmcs12->page_fault_error_code_match : 0);
2224 
2225 	if (cpu_has_vmx_apicv()) {
2226 		vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
2227 		vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1);
2228 		vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2);
2229 		vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
2230 	}
2231 
2232 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2233 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2234 
2235 	set_cr4_guest_host_mask(vmx);
2236 
2237 	if (kvm_mpx_supported()) {
2238 		if (vmx->nested.nested_run_pending &&
2239 			(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
2240 			vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
2241 		else
2242 			vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
2243 	}
2244 }
2245 
2246 /*
2247  * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
2248  * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
2249  * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2
2250  * guest in a way that will both be appropriate to L1's requests, and our
2251  * needs. In addition to modifying the active vmcs (which is vmcs02), this
2252  * function also has additional necessary side-effects, like setting various
2253  * vcpu->arch fields.
2254  * Returns 0 on success, 1 on failure. Invalid state exit qualification code
2255  * is assigned to entry_failure_code on failure.
2256  */
2257 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
2258 			  u32 *entry_failure_code)
2259 {
2260 	struct vcpu_vmx *vmx = to_vmx(vcpu);
2261 	struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
2262 
2263 	if (vmx->nested.dirty_vmcs12 || vmx->nested.hv_evmcs) {
2264 		prepare_vmcs02_full(vmx, vmcs12);
2265 		vmx->nested.dirty_vmcs12 = false;
2266 	}
2267 
2268 	/*
2269 	 * First, the fields that are shadowed.  This must be kept in sync
2270 	 * with vmcs_shadow_fields.h.
2271 	 */
2272 	if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
2273 			   HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) {
2274 		vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
2275 		vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
2276 	}
2277 
2278 	if (vmx->nested.nested_run_pending &&
2279 	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
2280 		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
2281 		vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
2282 	} else {
2283 		kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
2284 		vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
2285 	}
2286 	vmx_set_rflags(vcpu, vmcs12->guest_rflags);
2287 
2288 	/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
2289 	 * bitwise-or of what L1 wants to trap for L2, and what we want to
2290 	 * trap. Note that CR0.TS also needs updating - we do this later.
2291 	 */
2292 	update_exception_bitmap(vcpu);
2293 	vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
2294 	vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
2295 
2296 	if (vmx->nested.nested_run_pending &&
2297 	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
2298 		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
2299 		vcpu->arch.pat = vmcs12->guest_ia32_pat;
2300 	} else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
2301 		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
2302 	}
2303 
2304 	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
2305 
2306 	if (kvm_has_tsc_control)
2307 		decache_tsc_multiplier(vmx);
2308 
2309 	if (enable_vpid) {
2310 		/*
2311 		 * There is no direct mapping between vpid02 and vpid12, the
2312 		 * vpid02 is per-vCPU for L0 and reused while the value of
2313 		 * vpid12 is changed w/ one invvpid during nested vmentry.
2314 		 * The vpid12 is allocated by L1 for L2, so it will not
2315 		 * influence global bitmap(for vpid01 and vpid02 allocation)
2316 		 * even if spawn a lot of nested vCPUs.
2317 		 */
2318 		if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
2319 			if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
2320 				vmx->nested.last_vpid = vmcs12->virtual_processor_id;
2321 				__vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
2322 			}
2323 		} else {
2324 			/*
2325 			 * If L1 use EPT, then L0 needs to execute INVEPT on
2326 			 * EPTP02 instead of EPTP01. Therefore, delay TLB
2327 			 * flush until vmcs02->eptp is fully updated by
2328 			 * KVM_REQ_LOAD_CR3. Note that this assumes
2329 			 * KVM_REQ_TLB_FLUSH is evaluated after
2330 			 * KVM_REQ_LOAD_CR3 in vcpu_enter_guest().
2331 			 */
2332 			kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
2333 		}
2334 	}
2335 
2336 	if (nested_cpu_has_ept(vmcs12))
2337 		nested_ept_init_mmu_context(vcpu);
2338 	else if (nested_cpu_has2(vmcs12,
2339 				 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2340 		vmx_flush_tlb(vcpu, true);
2341 
2342 	/*
2343 	 * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
2344 	 * bits which we consider mandatory enabled.
2345 	 * The CR0_READ_SHADOW is what L2 should have expected to read given
2346 	 * the specifications by L1; It's not enough to take
2347 	 * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we
2348 	 * have more bits than L1 expected.
2349 	 */
2350 	vmx_set_cr0(vcpu, vmcs12->guest_cr0);
2351 	vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
2352 
2353 	vmx_set_cr4(vcpu, vmcs12->guest_cr4);
2354 	vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
2355 
2356 	vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12);
2357 	/* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
2358 	vmx_set_efer(vcpu, vcpu->arch.efer);
2359 
2360 	/*
2361 	 * Guest state is invalid and unrestricted guest is disabled,
2362 	 * which means L1 attempted VMEntry to L2 with invalid state.
2363 	 * Fail the VMEntry.
2364 	 */
2365 	if (vmx->emulation_required) {
2366 		*entry_failure_code = ENTRY_FAIL_DEFAULT;
2367 		return -EINVAL;
2368 	}
2369 
2370 	/* Shadow page tables on either EPT or shadow page tables. */
2371 	if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
2372 				entry_failure_code))
2373 		return -EINVAL;
2374 
2375 	if (!enable_ept)
2376 		vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
2377 
2378 	kvm_rsp_write(vcpu, vmcs12->guest_rsp);
2379 	kvm_rip_write(vcpu, vmcs12->guest_rip);
2380 	return 0;
2381 }
2382 
2383 static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
2384 {
2385 	if (!nested_cpu_has_nmi_exiting(vmcs12) &&
2386 	    nested_cpu_has_virtual_nmis(vmcs12))
2387 		return -EINVAL;
2388 
2389 	if (!nested_cpu_has_virtual_nmis(vmcs12) &&
2390 	    nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING))
2391 		return -EINVAL;
2392 
2393 	return 0;
2394 }
2395 
2396 static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
2397 {
2398 	struct vcpu_vmx *vmx = to_vmx(vcpu);
2399 	int maxphyaddr = cpuid_maxphyaddr(vcpu);
2400 
2401 	/* Check for memory type validity */
2402 	switch (address & VMX_EPTP_MT_MASK) {
2403 	case VMX_EPTP_MT_UC:
2404 		if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))
2405 			return false;
2406 		break;
2407 	case VMX_EPTP_MT_WB:
2408 		if (!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT))
2409 			return false;
2410 		break;
2411 	default:
2412 		return false;
2413 	}
2414 
2415 	/* only 4 levels page-walk length are valid */
2416 	if ((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)
2417 		return false;
2418 
2419 	/* Reserved bits should not be set */
2420 	if (address >> maxphyaddr || ((address >> 7) & 0x1f))
2421 		return false;
2422 
2423 	/* AD, if set, should be supported */
2424 	if (address & VMX_EPTP_AD_ENABLE_BIT) {
2425 		if (!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))
2426 			return false;
2427 	}
2428 
2429 	return true;
2430 }
2431 
2432 /*
2433  * Checks related to VM-Execution Control Fields
2434  */
2435 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu,
2436                                               struct vmcs12 *vmcs12)
2437 {
2438 	struct vcpu_vmx *vmx = to_vmx(vcpu);
2439 
2440 	if (!vmx_control_verify(vmcs12->pin_based_vm_exec_control,
2441 				vmx->nested.msrs.pinbased_ctls_low,
2442 				vmx->nested.msrs.pinbased_ctls_high) ||
2443 	    !vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
2444 				vmx->nested.msrs.procbased_ctls_low,
2445 				vmx->nested.msrs.procbased_ctls_high))
2446 		return -EINVAL;
2447 
2448 	if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
2449 	    !vmx_control_verify(vmcs12->secondary_vm_exec_control,
2450 				 vmx->nested.msrs.secondary_ctls_low,
2451 				 vmx->nested.msrs.secondary_ctls_high))
2452 		return -EINVAL;
2453 
2454 	if (vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu) ||
2455 	    nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) ||
2456 	    nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) ||
2457 	    nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) ||
2458 	    nested_vmx_check_apic_access_controls(vcpu, vmcs12) ||
2459 	    nested_vmx_check_apicv_controls(vcpu, vmcs12) ||
2460 	    nested_vmx_check_nmi_controls(vmcs12) ||
2461 	    nested_vmx_check_pml_controls(vcpu, vmcs12) ||
2462 	    nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) ||
2463 	    nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) ||
2464 	    nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) ||
2465 	    (nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id))
2466 		return -EINVAL;
2467 
2468 	if (!nested_cpu_has_preemption_timer(vmcs12) &&
2469 	    nested_cpu_has_save_preemption_timer(vmcs12))
2470 		return -EINVAL;
2471 
2472 	if (nested_cpu_has_ept(vmcs12) &&
2473 	    !valid_ept_address(vcpu, vmcs12->ept_pointer))
2474 		return -EINVAL;
2475 
2476 	if (nested_cpu_has_vmfunc(vmcs12)) {
2477 		if (vmcs12->vm_function_control &
2478 		    ~vmx->nested.msrs.vmfunc_controls)
2479 			return -EINVAL;
2480 
2481 		if (nested_cpu_has_eptp_switching(vmcs12)) {
2482 			if (!nested_cpu_has_ept(vmcs12) ||
2483 			    !page_address_valid(vcpu, vmcs12->eptp_list_address))
2484 				return -EINVAL;
2485 		}
2486 	}
2487 
2488 	return 0;
2489 }
2490 
2491 /*
2492  * Checks related to VM-Exit Control Fields
2493  */
2494 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu,
2495                                          struct vmcs12 *vmcs12)
2496 {
2497 	struct vcpu_vmx *vmx = to_vmx(vcpu);
2498 
2499 	if (!vmx_control_verify(vmcs12->vm_exit_controls,
2500 				vmx->nested.msrs.exit_ctls_low,
2501 				vmx->nested.msrs.exit_ctls_high) ||
2502 	    nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))
2503 		return -EINVAL;
2504 
2505 	return 0;
2506 }
2507 
2508 /*
2509  * Checks related to VM-Entry Control Fields
2510  */
2511 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
2512 					  struct vmcs12 *vmcs12)
2513 {
2514 	struct vcpu_vmx *vmx = to_vmx(vcpu);
2515 
2516 	if (!vmx_control_verify(vmcs12->vm_entry_controls,
2517 				vmx->nested.msrs.entry_ctls_low,
2518 				vmx->nested.msrs.entry_ctls_high))
2519 		return -EINVAL;
2520 
2521 	/*
2522 	 * From the Intel SDM, volume 3:
2523 	 * Fields relevant to VM-entry event injection must be set properly.
2524 	 * These fields are the VM-entry interruption-information field, the
2525 	 * VM-entry exception error code, and the VM-entry instruction length.
2526 	 */
2527 	if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) {
2528 		u32 intr_info = vmcs12->vm_entry_intr_info_field;
2529 		u8 vector = intr_info & INTR_INFO_VECTOR_MASK;
2530 		u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK;
2531 		bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK;
2532 		bool should_have_error_code;
2533 		bool urg = nested_cpu_has2(vmcs12,
2534 					   SECONDARY_EXEC_UNRESTRICTED_GUEST);
2535 		bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE;
2536 
2537 		/* VM-entry interruption-info field: interruption type */
2538 		if (intr_type == INTR_TYPE_RESERVED ||
2539 		    (intr_type == INTR_TYPE_OTHER_EVENT &&
2540 		     !nested_cpu_supports_monitor_trap_flag(vcpu)))
2541 			return -EINVAL;
2542 
2543 		/* VM-entry interruption-info field: vector */
2544 		if ((intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) ||
2545 		    (intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) ||
2546 		    (intr_type == INTR_TYPE_OTHER_EVENT && vector != 0))
2547 			return -EINVAL;
2548 
2549 		/* VM-entry interruption-info field: deliver error code */
2550 		should_have_error_code =
2551 			intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode &&
2552 			x86_exception_has_error_code(vector);
2553 		if (has_error_code != should_have_error_code)
2554 			return -EINVAL;
2555 
2556 		/* VM-entry exception error code */
2557 		if (has_error_code &&
2558 		    vmcs12->vm_entry_exception_error_code & GENMASK(31, 15))
2559 			return -EINVAL;
2560 
2561 		/* VM-entry interruption-info field: reserved bits */
2562 		if (intr_info & INTR_INFO_RESVD_BITS_MASK)
2563 			return -EINVAL;
2564 
2565 		/* VM-entry instruction length */
2566 		switch (intr_type) {
2567 		case INTR_TYPE_SOFT_EXCEPTION:
2568 		case INTR_TYPE_SOFT_INTR:
2569 		case INTR_TYPE_PRIV_SW_EXCEPTION:
2570 			if ((vmcs12->vm_entry_instruction_len > 15) ||
2571 			    (vmcs12->vm_entry_instruction_len == 0 &&
2572 			     !nested_cpu_has_zero_length_injection(vcpu)))
2573 				return -EINVAL;
2574 		}
2575 	}
2576 
2577 	if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12))
2578 		return -EINVAL;
2579 
2580 	return 0;
2581 }
2582 
2583 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
2584 				     struct vmcs12 *vmcs12)
2585 {
2586 	if (nested_check_vm_execution_controls(vcpu, vmcs12) ||
2587 	    nested_check_vm_exit_controls(vcpu, vmcs12) ||
2588 	    nested_check_vm_entry_controls(vcpu, vmcs12))
2589 		return -EINVAL;
2590 
2591 	return 0;
2592 }
2593 
2594 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu,
2595 				       struct vmcs12 *vmcs12)
2596 {
2597 	bool ia32e;
2598 
2599 	if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
2600 	    !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
2601 	    !nested_cr3_valid(vcpu, vmcs12->host_cr3))
2602 		return -EINVAL;
2603 
2604 	if (is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu) ||
2605 	    is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))
2606 		return -EINVAL;
2607 
2608 	if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) &&
2609 	    !kvm_pat_valid(vmcs12->host_ia32_pat))
2610 		return -EINVAL;
2611 
2612 	/*
2613 	 * If the load IA32_EFER VM-exit control is 1, bits reserved in the
2614 	 * IA32_EFER MSR must be 0 in the field for that register. In addition,
2615 	 * the values of the LMA and LME bits in the field must each be that of
2616 	 * the host address-space size VM-exit control.
2617 	 */
2618 	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) {
2619 		ia32e = (vmcs12->vm_exit_controls &
2620 			 VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
2621 		if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
2622 		    ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
2623 		    ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
2624 			return -EINVAL;
2625 	}
2626 
2627 	return 0;
2628 }
2629 
2630 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
2631 					  struct vmcs12 *vmcs12)
2632 {
2633 	int r = 0;
2634 	struct vmcs12 *shadow;
2635 	struct kvm_host_map map;
2636 
2637 	if (vmcs12->vmcs_link_pointer == -1ull)
2638 		return 0;
2639 
2640 	if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))
2641 		return -EINVAL;
2642 
2643 	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->vmcs_link_pointer), &map))
2644 		return -EINVAL;
2645 
2646 	shadow = map.hva;
2647 
2648 	if (shadow->hdr.revision_id != VMCS12_REVISION ||
2649 	    shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
2650 		r = -EINVAL;
2651 
2652 	kvm_vcpu_unmap(vcpu, &map, false);
2653 	return r;
2654 }
2655 
2656 /*
2657  * Checks related to Guest Non-register State
2658  */
2659 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12)
2660 {
2661 	if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
2662 	    vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
2663 		return -EINVAL;
2664 
2665 	return 0;
2666 }
2667 
2668 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
2669 					struct vmcs12 *vmcs12,
2670 					u32 *exit_qual)
2671 {
2672 	bool ia32e;
2673 
2674 	*exit_qual = ENTRY_FAIL_DEFAULT;
2675 
2676 	if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
2677 	    !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
2678 		return -EINVAL;
2679 
2680 	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
2681 	    !kvm_pat_valid(vmcs12->guest_ia32_pat))
2682 		return -EINVAL;
2683 
2684 	if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
2685 		*exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
2686 		return -EINVAL;
2687 	}
2688 
2689 	/*
2690 	 * If the load IA32_EFER VM-entry control is 1, the following checks
2691 	 * are performed on the field for the IA32_EFER MSR:
2692 	 * - Bits reserved in the IA32_EFER MSR must be 0.
2693 	 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of
2694 	 *   the IA-32e mode guest VM-exit control. It must also be identical
2695 	 *   to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
2696 	 *   CR0.PG) is 1.
2697 	 */
2698 	if (to_vmx(vcpu)->nested.nested_run_pending &&
2699 	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
2700 		ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
2701 		if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
2702 		    ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
2703 		    ((vmcs12->guest_cr0 & X86_CR0_PG) &&
2704 		     ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
2705 			return -EINVAL;
2706 	}
2707 
2708 	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
2709 	    (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
2710 	     (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
2711 		return -EINVAL;
2712 
2713 	if (nested_check_guest_non_reg_state(vmcs12))
2714 		return -EINVAL;
2715 
2716 	return 0;
2717 }
2718 
2719 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
2720 {
2721 	struct vcpu_vmx *vmx = to_vmx(vcpu);
2722 	unsigned long cr3, cr4;
2723 	bool vm_fail;
2724 
2725 	if (!nested_early_check)
2726 		return 0;
2727 
2728 	if (vmx->msr_autoload.host.nr)
2729 		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
2730 	if (vmx->msr_autoload.guest.nr)
2731 		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
2732 
2733 	preempt_disable();
2734 
2735 	vmx_prepare_switch_to_guest(vcpu);
2736 
2737 	/*
2738 	 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
2739 	 * which is reserved to '1' by hardware.  GUEST_RFLAGS is guaranteed to
2740 	 * be written (by preparve_vmcs02()) before the "real" VMEnter, i.e.
2741 	 * there is no need to preserve other bits or save/restore the field.
2742 	 */
2743 	vmcs_writel(GUEST_RFLAGS, 0);
2744 
2745 	cr3 = __get_current_cr3_fast();
2746 	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
2747 		vmcs_writel(HOST_CR3, cr3);
2748 		vmx->loaded_vmcs->host_state.cr3 = cr3;
2749 	}
2750 
2751 	cr4 = cr4_read_shadow();
2752 	if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
2753 		vmcs_writel(HOST_CR4, cr4);
2754 		vmx->loaded_vmcs->host_state.cr4 = cr4;
2755 	}
2756 
2757 	asm(
2758 		"sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
2759 		"cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
2760 		"je 1f \n\t"
2761 		__ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
2762 		"mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
2763 		"1: \n\t"
2764 		"add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
2765 
2766 		/* Check if vmlaunch or vmresume is needed */
2767 		"cmpb $0, %c[launched](%[loaded_vmcs])\n\t"
2768 
2769 		/*
2770 		 * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
2771 		 * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
2772 		 * Valid.  vmx_vmenter() directly "returns" RFLAGS, and so the
2773 		 * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
2774 		 */
2775 		"call vmx_vmenter\n\t"
2776 
2777 		CC_SET(be)
2778 	      : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail)
2779 	      :	[HOST_RSP]"r"((unsigned long)HOST_RSP),
2780 		[loaded_vmcs]"r"(vmx->loaded_vmcs),
2781 		[launched]"i"(offsetof(struct loaded_vmcs, launched)),
2782 		[host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
2783 		[wordsize]"i"(sizeof(ulong))
2784 	      : "cc", "memory"
2785 	);
2786 
2787 	preempt_enable();
2788 
2789 	if (vmx->msr_autoload.host.nr)
2790 		vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
2791 	if (vmx->msr_autoload.guest.nr)
2792 		vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
2793 
2794 	if (vm_fail) {
2795 		WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
2796 			     VMXERR_ENTRY_INVALID_CONTROL_FIELD);
2797 		return 1;
2798 	}
2799 
2800 	/*
2801 	 * VMExit clears RFLAGS.IF and DR7, even on a consistency check.
2802 	 */
2803 	local_irq_enable();
2804 	if (hw_breakpoint_active())
2805 		set_debugreg(__this_cpu_read(cpu_dr7), 7);
2806 
2807 	/*
2808 	 * A non-failing VMEntry means we somehow entered guest mode with
2809 	 * an illegal RIP, and that's just the tip of the iceberg.  There
2810 	 * is no telling what memory has been modified or what state has
2811 	 * been exposed to unknown code.  Hitting this all but guarantees
2812 	 * a (very critical) hardware issue.
2813 	 */
2814 	WARN_ON(!(vmcs_read32(VM_EXIT_REASON) &
2815 		VMX_EXIT_REASONS_FAILED_VMENTRY));
2816 
2817 	return 0;
2818 }
2819 
2820 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
2821 						 struct vmcs12 *vmcs12);
2822 
2823 static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
2824 {
2825 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2826 	struct vcpu_vmx *vmx = to_vmx(vcpu);
2827 	struct kvm_host_map *map;
2828 	struct page *page;
2829 	u64 hpa;
2830 
2831 	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
2832 		/*
2833 		 * Translate L1 physical address to host physical
2834 		 * address for vmcs02. Keep the page pinned, so this
2835 		 * physical address remains valid. We keep a reference
2836 		 * to it so we can release it later.
2837 		 */
2838 		if (vmx->nested.apic_access_page) { /* shouldn't happen */
2839 			kvm_release_page_dirty(vmx->nested.apic_access_page);
2840 			vmx->nested.apic_access_page = NULL;
2841 		}
2842 		page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
2843 		/*
2844 		 * If translation failed, no matter: This feature asks
2845 		 * to exit when accessing the given address, and if it
2846 		 * can never be accessed, this feature won't do
2847 		 * anything anyway.
2848 		 */
2849 		if (!is_error_page(page)) {
2850 			vmx->nested.apic_access_page = page;
2851 			hpa = page_to_phys(vmx->nested.apic_access_page);
2852 			vmcs_write64(APIC_ACCESS_ADDR, hpa);
2853 		} else {
2854 			vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
2855 					SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
2856 		}
2857 	}
2858 
2859 	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
2860 		map = &vmx->nested.virtual_apic_map;
2861 
2862 		/*
2863 		 * If translation failed, VM entry will fail because
2864 		 * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
2865 		 */
2866 		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) {
2867 			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn));
2868 		} else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) &&
2869 		           nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) &&
2870 			   !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
2871 			/*
2872 			 * The processor will never use the TPR shadow, simply
2873 			 * clear the bit from the execution control.  Such a
2874 			 * configuration is useless, but it happens in tests.
2875 			 * For any other configuration, failing the vm entry is
2876 			 * _not_ what the processor does but it's basically the
2877 			 * only possibility we have.
2878 			 */
2879 			vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
2880 					CPU_BASED_TPR_SHADOW);
2881 		} else {
2882 			printk("bad virtual-APIC page address\n");
2883 			dump_vmcs();
2884 		}
2885 	}
2886 
2887 	if (nested_cpu_has_posted_intr(vmcs12)) {
2888 		map = &vmx->nested.pi_desc_map;
2889 
2890 		if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) {
2891 			vmx->nested.pi_desc =
2892 				(struct pi_desc *)(((void *)map->hva) +
2893 				offset_in_page(vmcs12->posted_intr_desc_addr));
2894 			vmcs_write64(POSTED_INTR_DESC_ADDR,
2895 				     pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr));
2896 		}
2897 	}
2898 	if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12))
2899 		vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
2900 			      CPU_BASED_USE_MSR_BITMAPS);
2901 	else
2902 		vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
2903 				CPU_BASED_USE_MSR_BITMAPS);
2904 }
2905 
2906 /*
2907  * Intel's VMX Instruction Reference specifies a common set of prerequisites
2908  * for running VMX instructions (except VMXON, whose prerequisites are
2909  * slightly different). It also specifies what exception to inject otherwise.
2910  * Note that many of these exceptions have priority over VM exits, so they
2911  * don't have to be checked again here.
2912  */
2913 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
2914 {
2915 	if (!to_vmx(vcpu)->nested.vmxon) {
2916 		kvm_queue_exception(vcpu, UD_VECTOR);
2917 		return 0;
2918 	}
2919 
2920 	if (vmx_get_cpl(vcpu)) {
2921 		kvm_inject_gp(vcpu, 0);
2922 		return 0;
2923 	}
2924 
2925 	return 1;
2926 }
2927 
2928 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu)
2929 {
2930 	u8 rvi = vmx_get_rvi();
2931 	u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI);
2932 
2933 	return ((rvi & 0xf0) > (vppr & 0xf0));
2934 }
2935 
2936 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
2937 				   struct vmcs12 *vmcs12);
2938 
2939 /*
2940  * If from_vmentry is false, this is being called from state restore (either RSM
2941  * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
2942 + *
2943 + * Returns:
2944 + *   0 - success, i.e. proceed with actual VMEnter
2945 + *   1 - consistency check VMExit
2946 + *  -1 - consistency check VMFail
2947  */
2948 int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
2949 {
2950 	struct vcpu_vmx *vmx = to_vmx(vcpu);
2951 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
2952 	bool evaluate_pending_interrupts;
2953 	u32 exit_reason = EXIT_REASON_INVALID_STATE;
2954 	u32 exit_qual;
2955 
2956 	evaluate_pending_interrupts = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
2957 		(CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
2958 	if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
2959 		evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
2960 
2961 	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
2962 		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
2963 	if (kvm_mpx_supported() &&
2964 		!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
2965 		vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
2966 
2967 	vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
2968 
2969 	prepare_vmcs02_early(vmx, vmcs12);
2970 
2971 	if (from_vmentry) {
2972 		nested_get_vmcs12_pages(vcpu);
2973 
2974 		if (nested_vmx_check_vmentry_hw(vcpu)) {
2975 			vmx_switch_vmcs(vcpu, &vmx->vmcs01);
2976 			return -1;
2977 		}
2978 
2979 		if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
2980 			goto vmentry_fail_vmexit;
2981 	}
2982 
2983 	enter_guest_mode(vcpu);
2984 	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
2985 		vcpu->arch.tsc_offset += vmcs12->tsc_offset;
2986 
2987 	if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
2988 		goto vmentry_fail_vmexit_guest_mode;
2989 
2990 	if (from_vmentry) {
2991 		exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
2992 		exit_qual = nested_vmx_load_msr(vcpu,
2993 						vmcs12->vm_entry_msr_load_addr,
2994 						vmcs12->vm_entry_msr_load_count);
2995 		if (exit_qual)
2996 			goto vmentry_fail_vmexit_guest_mode;
2997 	} else {
2998 		/*
2999 		 * The MMU is not initialized to point at the right entities yet and
3000 		 * "get pages" would need to read data from the guest (i.e. we will
3001 		 * need to perform gpa to hpa translation). Request a call
3002 		 * to nested_get_vmcs12_pages before the next VM-entry.  The MSRs
3003 		 * have already been set at vmentry time and should not be reset.
3004 		 */
3005 		kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
3006 	}
3007 
3008 	/*
3009 	 * If L1 had a pending IRQ/NMI until it executed
3010 	 * VMLAUNCH/VMRESUME which wasn't delivered because it was
3011 	 * disallowed (e.g. interrupts disabled), L0 needs to
3012 	 * evaluate if this pending event should cause an exit from L2
3013 	 * to L1 or delivered directly to L2 (e.g. In case L1 don't
3014 	 * intercept EXTERNAL_INTERRUPT).
3015 	 *
3016 	 * Usually this would be handled by the processor noticing an
3017 	 * IRQ/NMI window request, or checking RVI during evaluation of
3018 	 * pending virtual interrupts.  However, this setting was done
3019 	 * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
3020 	 * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
3021 	 */
3022 	if (unlikely(evaluate_pending_interrupts))
3023 		kvm_make_request(KVM_REQ_EVENT, vcpu);
3024 
3025 	/*
3026 	 * Do not start the preemption timer hrtimer until after we know
3027 	 * we are successful, so that only nested_vmx_vmexit needs to cancel
3028 	 * the timer.
3029 	 */
3030 	vmx->nested.preemption_timer_expired = false;
3031 	if (nested_cpu_has_preemption_timer(vmcs12))
3032 		vmx_start_preemption_timer(vcpu);
3033 
3034 	/*
3035 	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
3036 	 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
3037 	 * returned as far as L1 is concerned. It will only return (and set
3038 	 * the success flag) when L2 exits (see nested_vmx_vmexit()).
3039 	 */
3040 	return 0;
3041 
3042 	/*
3043 	 * A failed consistency check that leads to a VMExit during L1's
3044 	 * VMEnter to L2 is a variation of a normal VMexit, as explained in
3045 	 * 26.7 "VM-entry failures during or after loading guest state".
3046 	 */
3047 vmentry_fail_vmexit_guest_mode:
3048 	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
3049 		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
3050 	leave_guest_mode(vcpu);
3051 
3052 vmentry_fail_vmexit:
3053 	vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3054 
3055 	if (!from_vmentry)
3056 		return 1;
3057 
3058 	load_vmcs12_host_state(vcpu, vmcs12);
3059 	vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
3060 	vmcs12->exit_qualification = exit_qual;
3061 	if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
3062 		vmx->nested.need_vmcs12_sync = true;
3063 	return 1;
3064 }
3065 
3066 /*
3067  * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
3068  * for running an L2 nested guest.
3069  */
3070 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
3071 {
3072 	struct vmcs12 *vmcs12;
3073 	struct vcpu_vmx *vmx = to_vmx(vcpu);
3074 	u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
3075 	int ret;
3076 
3077 	if (!nested_vmx_check_permission(vcpu))
3078 		return 1;
3079 
3080 	if (!nested_vmx_handle_enlightened_vmptrld(vcpu, true))
3081 		return 1;
3082 
3083 	if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
3084 		return nested_vmx_failInvalid(vcpu);
3085 
3086 	vmcs12 = get_vmcs12(vcpu);
3087 
3088 	/*
3089 	 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
3090 	 * that there *is* a valid VMCS pointer, RFLAGS.CF is set
3091 	 * rather than RFLAGS.ZF, and no error number is stored to the
3092 	 * VM-instruction error field.
3093 	 */
3094 	if (vmcs12->hdr.shadow_vmcs)
3095 		return nested_vmx_failInvalid(vcpu);
3096 
3097 	if (vmx->nested.hv_evmcs) {
3098 		copy_enlightened_to_vmcs12(vmx);
3099 		/* Enlightened VMCS doesn't have launch state */
3100 		vmcs12->launch_state = !launch;
3101 	} else if (enable_shadow_vmcs) {
3102 		copy_shadow_to_vmcs12(vmx);
3103 	}
3104 
3105 	/*
3106 	 * The nested entry process starts with enforcing various prerequisites
3107 	 * on vmcs12 as required by the Intel SDM, and act appropriately when
3108 	 * they fail: As the SDM explains, some conditions should cause the
3109 	 * instruction to fail, while others will cause the instruction to seem
3110 	 * to succeed, but return an EXIT_REASON_INVALID_STATE.
3111 	 * To speed up the normal (success) code path, we should avoid checking
3112 	 * for misconfigurations which will anyway be caught by the processor
3113 	 * when using the merged vmcs02.
3114 	 */
3115 	if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
3116 		return nested_vmx_failValid(vcpu,
3117 			VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
3118 
3119 	if (vmcs12->launch_state == launch)
3120 		return nested_vmx_failValid(vcpu,
3121 			launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
3122 			       : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
3123 
3124 	if (nested_vmx_check_controls(vcpu, vmcs12))
3125 		return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3126 
3127 	if (nested_vmx_check_host_state(vcpu, vmcs12))
3128 		return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
3129 
3130 	/*
3131 	 * We're finally done with prerequisite checking, and can start with
3132 	 * the nested entry.
3133 	 */
3134 	vmx->nested.nested_run_pending = 1;
3135 	ret = nested_vmx_enter_non_root_mode(vcpu, true);
3136 	vmx->nested.nested_run_pending = !ret;
3137 	if (ret > 0)
3138 		return 1;
3139 	else if (ret)
3140 		return nested_vmx_failValid(vcpu,
3141 			VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3142 
3143 	/* Hide L1D cache contents from the nested guest.  */
3144 	vmx->vcpu.arch.l1tf_flush_l1d = true;
3145 
3146 	/*
3147 	 * Must happen outside of nested_vmx_enter_non_root_mode() as it will
3148 	 * also be used as part of restoring nVMX state for
3149 	 * snapshot restore (migration).
3150 	 *
3151 	 * In this flow, it is assumed that vmcs12 cache was
3152 	 * trasferred as part of captured nVMX state and should
3153 	 * therefore not be read from guest memory (which may not
3154 	 * exist on destination host yet).
3155 	 */
3156 	nested_cache_shadow_vmcs12(vcpu, vmcs12);
3157 
3158 	/*
3159 	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be
3160 	 * awakened by event injection or by an NMI-window VM-exit or
3161 	 * by an interrupt-window VM-exit, halt the vcpu.
3162 	 */
3163 	if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
3164 	    !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
3165 	    !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) &&
3166 	    !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) &&
3167 	      (vmcs12->guest_rflags & X86_EFLAGS_IF))) {
3168 		vmx->nested.nested_run_pending = 0;
3169 		return kvm_vcpu_halt(vcpu);
3170 	}
3171 	return 1;
3172 }
3173 
3174 /*
3175  * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
3176  * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK).
3177  * This function returns the new value we should put in vmcs12.guest_cr0.
3178  * It's not enough to just return the vmcs02 GUEST_CR0. Rather,
3179  *  1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
3180  *     available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0
3181  *     didn't trap the bit, because if L1 did, so would L0).
3182  *  2. Bits that L1 asked to trap (and therefore L0 also did) could not have
3183  *     been modified by L2, and L1 knows it. So just leave the old value of
3184  *     the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0
3185  *     isn't relevant, because if L0 traps this bit it can set it to anything.
3186  *  3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have
3187  *     changed these bits, and therefore they need to be updated, but L0
3188  *     didn't necessarily allow them to be changed in GUEST_CR0 - and rather
3189  *     put them in vmcs02 CR0_READ_SHADOW. So take these bits from there.
3190  */
3191 static inline unsigned long
3192 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3193 {
3194 	return
3195 	/*1*/	(vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) |
3196 	/*2*/	(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) |
3197 	/*3*/	(vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask |
3198 			vcpu->arch.cr0_guest_owned_bits));
3199 }
3200 
3201 static inline unsigned long
3202 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3203 {
3204 	return
3205 	/*1*/	(vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) |
3206 	/*2*/	(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) |
3207 	/*3*/	(vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask |
3208 			vcpu->arch.cr4_guest_owned_bits));
3209 }
3210 
3211 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
3212 				      struct vmcs12 *vmcs12)
3213 {
3214 	u32 idt_vectoring;
3215 	unsigned int nr;
3216 
3217 	if (vcpu->arch.exception.injected) {
3218 		nr = vcpu->arch.exception.nr;
3219 		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3220 
3221 		if (kvm_exception_is_soft(nr)) {
3222 			vmcs12->vm_exit_instruction_len =
3223 				vcpu->arch.event_exit_inst_len;
3224 			idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
3225 		} else
3226 			idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
3227 
3228 		if (vcpu->arch.exception.has_error_code) {
3229 			idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
3230 			vmcs12->idt_vectoring_error_code =
3231 				vcpu->arch.exception.error_code;
3232 		}
3233 
3234 		vmcs12->idt_vectoring_info_field = idt_vectoring;
3235 	} else if (vcpu->arch.nmi_injected) {
3236 		vmcs12->idt_vectoring_info_field =
3237 			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR;
3238 	} else if (vcpu->arch.interrupt.injected) {
3239 		nr = vcpu->arch.interrupt.nr;
3240 		idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
3241 
3242 		if (vcpu->arch.interrupt.soft) {
3243 			idt_vectoring |= INTR_TYPE_SOFT_INTR;
3244 			vmcs12->vm_entry_instruction_len =
3245 				vcpu->arch.event_exit_inst_len;
3246 		} else
3247 			idt_vectoring |= INTR_TYPE_EXT_INTR;
3248 
3249 		vmcs12->idt_vectoring_info_field = idt_vectoring;
3250 	}
3251 }
3252 
3253 
3254 static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
3255 {
3256 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3257 	gfn_t gfn;
3258 
3259 	/*
3260 	 * Don't need to mark the APIC access page dirty; it is never
3261 	 * written to by the CPU during APIC virtualization.
3262 	 */
3263 
3264 	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
3265 		gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
3266 		kvm_vcpu_mark_page_dirty(vcpu, gfn);
3267 	}
3268 
3269 	if (nested_cpu_has_posted_intr(vmcs12)) {
3270 		gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
3271 		kvm_vcpu_mark_page_dirty(vcpu, gfn);
3272 	}
3273 }
3274 
3275 static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
3276 {
3277 	struct vcpu_vmx *vmx = to_vmx(vcpu);
3278 	int max_irr;
3279 	void *vapic_page;
3280 	u16 status;
3281 
3282 	if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
3283 		return;
3284 
3285 	vmx->nested.pi_pending = false;
3286 	if (!pi_test_and_clear_on(vmx->nested.pi_desc))
3287 		return;
3288 
3289 	max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
3290 	if (max_irr != 256) {
3291 		vapic_page = vmx->nested.virtual_apic_map.hva;
3292 		if (!vapic_page)
3293 			return;
3294 
3295 		__kvm_apic_update_irr(vmx->nested.pi_desc->pir,
3296 			vapic_page, &max_irr);
3297 		status = vmcs_read16(GUEST_INTR_STATUS);
3298 		if ((u8)max_irr > ((u8)status & 0xff)) {
3299 			status &= ~0xff;
3300 			status |= (u8)max_irr;
3301 			vmcs_write16(GUEST_INTR_STATUS, status);
3302 		}
3303 	}
3304 
3305 	nested_mark_vmcs12_pages_dirty(vcpu);
3306 }
3307 
3308 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
3309 					       unsigned long exit_qual)
3310 {
3311 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3312 	unsigned int nr = vcpu->arch.exception.nr;
3313 	u32 intr_info = nr | INTR_INFO_VALID_MASK;
3314 
3315 	if (vcpu->arch.exception.has_error_code) {
3316 		vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
3317 		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
3318 	}
3319 
3320 	if (kvm_exception_is_soft(nr))
3321 		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
3322 	else
3323 		intr_info |= INTR_TYPE_HARD_EXCEPTION;
3324 
3325 	if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
3326 	    vmx_get_nmi_mask(vcpu))
3327 		intr_info |= INTR_INFO_UNBLOCK_NMI;
3328 
3329 	nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
3330 }
3331 
3332 static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
3333 {
3334 	struct vcpu_vmx *vmx = to_vmx(vcpu);
3335 	unsigned long exit_qual;
3336 	bool block_nested_events =
3337 	    vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
3338 
3339 	if (vcpu->arch.exception.pending &&
3340 		nested_vmx_check_exception(vcpu, &exit_qual)) {
3341 		if (block_nested_events)
3342 			return -EBUSY;
3343 		nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
3344 		return 0;
3345 	}
3346 
3347 	if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
3348 	    vmx->nested.preemption_timer_expired) {
3349 		if (block_nested_events)
3350 			return -EBUSY;
3351 		nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
3352 		return 0;
3353 	}
3354 
3355 	if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
3356 		if (block_nested_events)
3357 			return -EBUSY;
3358 		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
3359 				  NMI_VECTOR | INTR_TYPE_NMI_INTR |
3360 				  INTR_INFO_VALID_MASK, 0);
3361 		/*
3362 		 * The NMI-triggered VM exit counts as injection:
3363 		 * clear this one and block further NMIs.
3364 		 */
3365 		vcpu->arch.nmi_pending = 0;
3366 		vmx_set_nmi_mask(vcpu, true);
3367 		return 0;
3368 	}
3369 
3370 	if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
3371 	    nested_exit_on_intr(vcpu)) {
3372 		if (block_nested_events)
3373 			return -EBUSY;
3374 		nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
3375 		return 0;
3376 	}
3377 
3378 	vmx_complete_nested_posted_interrupt(vcpu);
3379 	return 0;
3380 }
3381 
3382 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
3383 {
3384 	ktime_t remaining =
3385 		hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer);
3386 	u64 value;
3387 
3388 	if (ktime_to_ns(remaining) <= 0)
3389 		return 0;
3390 
3391 	value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz;
3392 	do_div(value, 1000000);
3393 	return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
3394 }
3395 
3396 /*
3397  * Update the guest state fields of vmcs12 to reflect changes that
3398  * occurred while L2 was running. (The "IA-32e mode guest" bit of the
3399  * VM-entry controls is also updated, since this is really a guest
3400  * state bit.)
3401  */
3402 static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
3403 {
3404 	vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
3405 	vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
3406 
3407 	vmcs12->guest_rsp = kvm_rsp_read(vcpu);
3408 	vmcs12->guest_rip = kvm_rip_read(vcpu);
3409 	vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS);
3410 
3411 	vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR);
3412 	vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR);
3413 	vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR);
3414 	vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR);
3415 	vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR);
3416 	vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR);
3417 	vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR);
3418 	vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR);
3419 	vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT);
3420 	vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT);
3421 	vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT);
3422 	vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT);
3423 	vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT);
3424 	vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT);
3425 	vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT);
3426 	vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT);
3427 	vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT);
3428 	vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT);
3429 	vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES);
3430 	vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
3431 	vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
3432 	vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES);
3433 	vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES);
3434 	vmcs12->guest_gs_ar_bytes = vmcs_read32(GUEST_GS_AR_BYTES);
3435 	vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES);
3436 	vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES);
3437 	vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE);
3438 	vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE);
3439 	vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE);
3440 	vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE);
3441 	vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE);
3442 	vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE);
3443 	vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE);
3444 	vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE);
3445 	vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE);
3446 	vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE);
3447 
3448 	vmcs12->guest_interruptibility_info =
3449 		vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3450 	vmcs12->guest_pending_dbg_exceptions =
3451 		vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
3452 	if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
3453 		vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT;
3454 	else
3455 		vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
3456 
3457 	if (nested_cpu_has_preemption_timer(vmcs12) &&
3458 	    vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
3459 			vmcs12->vmx_preemption_timer_value =
3460 				vmx_get_preemption_timer_value(vcpu);
3461 
3462 	/*
3463 	 * In some cases (usually, nested EPT), L2 is allowed to change its
3464 	 * own CR3 without exiting. If it has changed it, we must keep it.
3465 	 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined
3466 	 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12.
3467 	 *
3468 	 * Additionally, restore L2's PDPTR to vmcs12.
3469 	 */
3470 	if (enable_ept) {
3471 		vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3);
3472 		vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0);
3473 		vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1);
3474 		vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2);
3475 		vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3);
3476 	}
3477 
3478 	vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS);
3479 
3480 	if (nested_cpu_has_vid(vmcs12))
3481 		vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS);
3482 
3483 	vmcs12->vm_entry_controls =
3484 		(vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) |
3485 		(vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE);
3486 
3487 	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) {
3488 		kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7);
3489 		vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
3490 	}
3491 
3492 	/* TODO: These cannot have changed unless we have MSR bitmaps and
3493 	 * the relevant bit asks not to trap the change */
3494 	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
3495 		vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT);
3496 	if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
3497 		vmcs12->guest_ia32_efer = vcpu->arch.efer;
3498 	vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
3499 	vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
3500 	vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
3501 	if (kvm_mpx_supported())
3502 		vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
3503 }
3504 
3505 /*
3506  * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
3507  * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
3508  * and this function updates it to reflect the changes to the guest state while
3509  * L2 was running (and perhaps made some exits which were handled directly by L0
3510  * without going back to L1), and to reflect the exit reason.
3511  * Note that we do not have to copy here all VMCS fields, just those that
3512  * could have changed by the L2 guest or the exit - i.e., the guest-state and
3513  * exit-information fields only. Other fields are modified by L1 with VMWRITE,
3514  * which already writes to vmcs12 directly.
3515  */
3516 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
3517 			   u32 exit_reason, u32 exit_intr_info,
3518 			   unsigned long exit_qualification)
3519 {
3520 	/* update guest state fields: */
3521 	sync_vmcs12(vcpu, vmcs12);
3522 
3523 	/* update exit information fields: */
3524 
3525 	vmcs12->vm_exit_reason = exit_reason;
3526 	vmcs12->exit_qualification = exit_qualification;
3527 	vmcs12->vm_exit_intr_info = exit_intr_info;
3528 
3529 	vmcs12->idt_vectoring_info_field = 0;
3530 	vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3531 	vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
3532 
3533 	if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
3534 		vmcs12->launch_state = 1;
3535 
3536 		/* vm_entry_intr_info_field is cleared on exit. Emulate this
3537 		 * instead of reading the real value. */
3538 		vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
3539 
3540 		/*
3541 		 * Transfer the event that L0 or L1 may wanted to inject into
3542 		 * L2 to IDT_VECTORING_INFO_FIELD.
3543 		 */
3544 		vmcs12_save_pending_event(vcpu, vmcs12);
3545 
3546 		/*
3547 		 * According to spec, there's no need to store the guest's
3548 		 * MSRs if the exit is due to a VM-entry failure that occurs
3549 		 * during or after loading the guest state. Since this exit
3550 		 * does not fall in that category, we need to save the MSRs.
3551 		 */
3552 		if (nested_vmx_store_msr(vcpu,
3553 					 vmcs12->vm_exit_msr_store_addr,
3554 					 vmcs12->vm_exit_msr_store_count))
3555 			nested_vmx_abort(vcpu,
3556 					 VMX_ABORT_SAVE_GUEST_MSR_FAIL);
3557 	}
3558 
3559 	/*
3560 	 * Drop what we picked up for L2 via vmx_complete_interrupts. It is
3561 	 * preserved above and would only end up incorrectly in L1.
3562 	 */
3563 	vcpu->arch.nmi_injected = false;
3564 	kvm_clear_exception_queue(vcpu);
3565 	kvm_clear_interrupt_queue(vcpu);
3566 }
3567 
3568 /*
3569  * A part of what we need to when the nested L2 guest exits and we want to
3570  * run its L1 parent, is to reset L1's guest state to the host state specified
3571  * in vmcs12.
3572  * This function is to be called not only on normal nested exit, but also on
3573  * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry
3574  * Failures During or After Loading Guest State").
3575  * This function should be called when the active VMCS is L1's (vmcs01).
3576  */
3577 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
3578 				   struct vmcs12 *vmcs12)
3579 {
3580 	struct kvm_segment seg;
3581 	u32 entry_failure_code;
3582 
3583 	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
3584 		vcpu->arch.efer = vmcs12->host_ia32_efer;
3585 	else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
3586 		vcpu->arch.efer |= (EFER_LMA | EFER_LME);
3587 	else
3588 		vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
3589 	vmx_set_efer(vcpu, vcpu->arch.efer);
3590 
3591 	kvm_rsp_write(vcpu, vmcs12->host_rsp);
3592 	kvm_rip_write(vcpu, vmcs12->host_rip);
3593 	vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
3594 	vmx_set_interrupt_shadow(vcpu, 0);
3595 
3596 	/*
3597 	 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't
3598 	 * actually changed, because vmx_set_cr0 refers to efer set above.
3599 	 *
3600 	 * CR0_GUEST_HOST_MASK is already set in the original vmcs01
3601 	 * (KVM doesn't change it);
3602 	 */
3603 	vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
3604 	vmx_set_cr0(vcpu, vmcs12->host_cr0);
3605 
3606 	/* Same as above - no reason to call set_cr4_guest_host_mask().  */
3607 	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
3608 	vmx_set_cr4(vcpu, vmcs12->host_cr4);
3609 
3610 	nested_ept_uninit_mmu_context(vcpu);
3611 
3612 	/*
3613 	 * Only PDPTE load can fail as the value of cr3 was checked on entry and
3614 	 * couldn't have changed.
3615 	 */
3616 	if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
3617 		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
3618 
3619 	if (!enable_ept)
3620 		vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
3621 
3622 	/*
3623 	 * If vmcs01 doesn't use VPID, CPU flushes TLB on every
3624 	 * VMEntry/VMExit. Thus, no need to flush TLB.
3625 	 *
3626 	 * If vmcs12 doesn't use VPID, L1 expects TLB to be
3627 	 * flushed on every VMEntry/VMExit.
3628 	 *
3629 	 * Otherwise, we can preserve TLB entries as long as we are
3630 	 * able to tag L1 TLB entries differently than L2 TLB entries.
3631 	 *
3632 	 * If vmcs12 uses EPT, we need to execute this flush on EPTP01
3633 	 * and therefore we request the TLB flush to happen only after VMCS EPTP
3634 	 * has been set by KVM_REQ_LOAD_CR3.
3635 	 */
3636 	if (enable_vpid &&
3637 	    (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
3638 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
3639 	}
3640 
3641 	vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
3642 	vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
3643 	vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
3644 	vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
3645 	vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
3646 	vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
3647 	vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
3648 
3649 	/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1.  */
3650 	if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
3651 		vmcs_write64(GUEST_BNDCFGS, 0);
3652 
3653 	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) {
3654 		vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat);
3655 		vcpu->arch.pat = vmcs12->host_ia32_pat;
3656 	}
3657 	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
3658 		vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
3659 			vmcs12->host_ia32_perf_global_ctrl);
3660 
3661 	/* Set L1 segment info according to Intel SDM
3662 	    27.5.2 Loading Host Segment and Descriptor-Table Registers */
3663 	seg = (struct kvm_segment) {
3664 		.base = 0,
3665 		.limit = 0xFFFFFFFF,
3666 		.selector = vmcs12->host_cs_selector,
3667 		.type = 11,
3668 		.present = 1,
3669 		.s = 1,
3670 		.g = 1
3671 	};
3672 	if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)
3673 		seg.l = 1;
3674 	else
3675 		seg.db = 1;
3676 	vmx_set_segment(vcpu, &seg, VCPU_SREG_CS);
3677 	seg = (struct kvm_segment) {
3678 		.base = 0,
3679 		.limit = 0xFFFFFFFF,
3680 		.type = 3,
3681 		.present = 1,
3682 		.s = 1,
3683 		.db = 1,
3684 		.g = 1
3685 	};
3686 	seg.selector = vmcs12->host_ds_selector;
3687 	vmx_set_segment(vcpu, &seg, VCPU_SREG_DS);
3688 	seg.selector = vmcs12->host_es_selector;
3689 	vmx_set_segment(vcpu, &seg, VCPU_SREG_ES);
3690 	seg.selector = vmcs12->host_ss_selector;
3691 	vmx_set_segment(vcpu, &seg, VCPU_SREG_SS);
3692 	seg.selector = vmcs12->host_fs_selector;
3693 	seg.base = vmcs12->host_fs_base;
3694 	vmx_set_segment(vcpu, &seg, VCPU_SREG_FS);
3695 	seg.selector = vmcs12->host_gs_selector;
3696 	seg.base = vmcs12->host_gs_base;
3697 	vmx_set_segment(vcpu, &seg, VCPU_SREG_GS);
3698 	seg = (struct kvm_segment) {
3699 		.base = vmcs12->host_tr_base,
3700 		.limit = 0x67,
3701 		.selector = vmcs12->host_tr_selector,
3702 		.type = 11,
3703 		.present = 1
3704 	};
3705 	vmx_set_segment(vcpu, &seg, VCPU_SREG_TR);
3706 
3707 	kvm_set_dr(vcpu, 7, 0x400);
3708 	vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
3709 
3710 	if (cpu_has_vmx_msr_bitmap())
3711 		vmx_update_msr_bitmap(vcpu);
3712 
3713 	if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
3714 				vmcs12->vm_exit_msr_load_count))
3715 		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
3716 }
3717 
3718 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
3719 {
3720 	struct shared_msr_entry *efer_msr;
3721 	unsigned int i;
3722 
3723 	if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
3724 		return vmcs_read64(GUEST_IA32_EFER);
3725 
3726 	if (cpu_has_load_ia32_efer())
3727 		return host_efer;
3728 
3729 	for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) {
3730 		if (vmx->msr_autoload.guest.val[i].index == MSR_EFER)
3731 			return vmx->msr_autoload.guest.val[i].value;
3732 	}
3733 
3734 	efer_msr = find_msr_entry(vmx, MSR_EFER);
3735 	if (efer_msr)
3736 		return efer_msr->data;
3737 
3738 	return host_efer;
3739 }
3740 
3741 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu)
3742 {
3743 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3744 	struct vcpu_vmx *vmx = to_vmx(vcpu);
3745 	struct vmx_msr_entry g, h;
3746 	struct msr_data msr;
3747 	gpa_t gpa;
3748 	u32 i, j;
3749 
3750 	vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT);
3751 
3752 	if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
3753 		/*
3754 		 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set
3755 		 * as vmcs01.GUEST_DR7 contains a userspace defined value
3756 		 * and vcpu->arch.dr7 is not squirreled away before the
3757 		 * nested VMENTER (not worth adding a variable in nested_vmx).
3758 		 */
3759 		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
3760 			kvm_set_dr(vcpu, 7, DR7_FIXED_1);
3761 		else
3762 			WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7)));
3763 	}
3764 
3765 	/*
3766 	 * Note that calling vmx_set_{efer,cr0,cr4} is important as they
3767 	 * handle a variety of side effects to KVM's software model.
3768 	 */
3769 	vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
3770 
3771 	vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
3772 	vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
3773 
3774 	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
3775 	vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW));
3776 
3777 	nested_ept_uninit_mmu_context(vcpu);
3778 
3779 	/*
3780 	 * This is only valid if EPT is in use, otherwise the vmcs01 GUEST_CR3
3781 	 * points to shadow pages!  Fortunately we only get here after a WARN_ON
3782 	 * if EPT is disabled, so a VMabort is perfectly fine.
3783 	 */
3784 	if (enable_ept) {
3785 		vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3786 		__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
3787 	} else {
3788 		nested_vmx_abort(vcpu, VMX_ABORT_VMCS_CORRUPTED);
3789 	}
3790 
3791 	/*
3792 	 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
3793 	 * from vmcs01 (if necessary).  The PDPTRs are not loaded on
3794 	 * VMFail, like everything else we just need to ensure our
3795 	 * software model is up-to-date.
3796 	 */
3797 	ept_save_pdptrs(vcpu);
3798 
3799 	kvm_mmu_reset_context(vcpu);
3800 
3801 	if (cpu_has_vmx_msr_bitmap())
3802 		vmx_update_msr_bitmap(vcpu);
3803 
3804 	/*
3805 	 * This nasty bit of open coding is a compromise between blindly
3806 	 * loading L1's MSRs using the exit load lists (incorrect emulation
3807 	 * of VMFail), leaving the nested VM's MSRs in the software model
3808 	 * (incorrect behavior) and snapshotting the modified MSRs (too
3809 	 * expensive since the lists are unbound by hardware).  For each
3810 	 * MSR that was (prematurely) loaded from the nested VMEntry load
3811 	 * list, reload it from the exit load list if it exists and differs
3812 	 * from the guest value.  The intent is to stuff host state as
3813 	 * silently as possible, not to fully process the exit load list.
3814 	 */
3815 	msr.host_initiated = false;
3816 	for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) {
3817 		gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g));
3818 		if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) {
3819 			pr_debug_ratelimited(
3820 				"%s read MSR index failed (%u, 0x%08llx)\n",
3821 				__func__, i, gpa);
3822 			goto vmabort;
3823 		}
3824 
3825 		for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) {
3826 			gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h));
3827 			if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) {
3828 				pr_debug_ratelimited(
3829 					"%s read MSR failed (%u, 0x%08llx)\n",
3830 					__func__, j, gpa);
3831 				goto vmabort;
3832 			}
3833 			if (h.index != g.index)
3834 				continue;
3835 			if (h.value == g.value)
3836 				break;
3837 
3838 			if (nested_vmx_load_msr_check(vcpu, &h)) {
3839 				pr_debug_ratelimited(
3840 					"%s check failed (%u, 0x%x, 0x%x)\n",
3841 					__func__, j, h.index, h.reserved);
3842 				goto vmabort;
3843 			}
3844 
3845 			msr.index = h.index;
3846 			msr.data = h.value;
3847 			if (kvm_set_msr(vcpu, &msr)) {
3848 				pr_debug_ratelimited(
3849 					"%s WRMSR failed (%u, 0x%x, 0x%llx)\n",
3850 					__func__, j, h.index, h.value);
3851 				goto vmabort;
3852 			}
3853 		}
3854 	}
3855 
3856 	return;
3857 
3858 vmabort:
3859 	nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL);
3860 }
3861 
3862 /*
3863  * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1
3864  * and modify vmcs12 to make it see what it would expect to see there if
3865  * L2 was its real guest. Must only be called when in L2 (is_guest_mode())
3866  */
3867 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
3868 		       u32 exit_intr_info, unsigned long exit_qualification)
3869 {
3870 	struct vcpu_vmx *vmx = to_vmx(vcpu);
3871 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
3872 
3873 	/* trying to cancel vmlaunch/vmresume is a bug */
3874 	WARN_ON_ONCE(vmx->nested.nested_run_pending);
3875 
3876 	leave_guest_mode(vcpu);
3877 
3878 	if (nested_cpu_has_preemption_timer(vmcs12))
3879 		hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
3880 
3881 	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
3882 		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
3883 
3884 	if (likely(!vmx->fail)) {
3885 		if (exit_reason == -1)
3886 			sync_vmcs12(vcpu, vmcs12);
3887 		else
3888 			prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
3889 				       exit_qualification);
3890 
3891 		/*
3892 		 * Must happen outside of sync_vmcs12() as it will
3893 		 * also be used to capture vmcs12 cache as part of
3894 		 * capturing nVMX state for snapshot (migration).
3895 		 *
3896 		 * Otherwise, this flush will dirty guest memory at a
3897 		 * point it is already assumed by user-space to be
3898 		 * immutable.
3899 		 */
3900 		nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
3901 	} else {
3902 		/*
3903 		 * The only expected VM-instruction error is "VM entry with
3904 		 * invalid control field(s)." Anything else indicates a
3905 		 * problem with L0.  And we should never get here with a
3906 		 * VMFail of any type if early consistency checks are enabled.
3907 		 */
3908 		WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) !=
3909 			     VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3910 		WARN_ON_ONCE(nested_early_check);
3911 	}
3912 
3913 	vmx_switch_vmcs(vcpu, &vmx->vmcs01);
3914 
3915 	/* Update any VMCS fields that might have changed while L2 ran */
3916 	vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
3917 	vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
3918 	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
3919 
3920 	if (kvm_has_tsc_control)
3921 		decache_tsc_multiplier(vmx);
3922 
3923 	if (vmx->nested.change_vmcs01_virtual_apic_mode) {
3924 		vmx->nested.change_vmcs01_virtual_apic_mode = false;
3925 		vmx_set_virtual_apic_mode(vcpu);
3926 	} else if (!nested_cpu_has_ept(vmcs12) &&
3927 		   nested_cpu_has2(vmcs12,
3928 				   SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
3929 		vmx_flush_tlb(vcpu, true);
3930 	}
3931 
3932 	/* Unpin physical memory we referred to in vmcs02 */
3933 	if (vmx->nested.apic_access_page) {
3934 		kvm_release_page_dirty(vmx->nested.apic_access_page);
3935 		vmx->nested.apic_access_page = NULL;
3936 	}
3937 	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
3938 	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
3939 	vmx->nested.pi_desc = NULL;
3940 
3941 	/*
3942 	 * We are now running in L2, mmu_notifier will force to reload the
3943 	 * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1.
3944 	 */
3945 	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
3946 
3947 	if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
3948 		vmx->nested.need_vmcs12_sync = true;
3949 
3950 	/* in case we halted in L2 */
3951 	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3952 
3953 	if (likely(!vmx->fail)) {
3954 		/*
3955 		 * TODO: SDM says that with acknowledge interrupt on
3956 		 * exit, bit 31 of the VM-exit interrupt information
3957 		 * (valid interrupt) is always set to 1 on
3958 		 * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
3959 		 * need kvm_cpu_has_interrupt().  See the commit
3960 		 * message for details.
3961 		 */
3962 		if (nested_exit_intr_ack_set(vcpu) &&
3963 		    exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
3964 		    kvm_cpu_has_interrupt(vcpu)) {
3965 			int irq = kvm_cpu_get_interrupt(vcpu);
3966 			WARN_ON(irq < 0);
3967 			vmcs12->vm_exit_intr_info = irq |
3968 				INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
3969 		}
3970 
3971 		if (exit_reason != -1)
3972 			trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
3973 						       vmcs12->exit_qualification,
3974 						       vmcs12->idt_vectoring_info_field,
3975 						       vmcs12->vm_exit_intr_info,
3976 						       vmcs12->vm_exit_intr_error_code,
3977 						       KVM_ISA_VMX);
3978 
3979 		load_vmcs12_host_state(vcpu, vmcs12);
3980 
3981 		return;
3982 	}
3983 
3984 	/*
3985 	 * After an early L2 VM-entry failure, we're now back
3986 	 * in L1 which thinks it just finished a VMLAUNCH or
3987 	 * VMRESUME instruction, so we need to set the failure
3988 	 * flag and the VM-instruction error field of the VMCS
3989 	 * accordingly, and skip the emulated instruction.
3990 	 */
3991 	(void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
3992 
3993 	/*
3994 	 * Restore L1's host state to KVM's software model.  We're here
3995 	 * because a consistency check was caught by hardware, which
3996 	 * means some amount of guest state has been propagated to KVM's
3997 	 * model and needs to be unwound to the host's state.
3998 	 */
3999 	nested_vmx_restore_host_state(vcpu);
4000 
4001 	vmx->fail = 0;
4002 }
4003 
4004 /*
4005  * Decode the memory-address operand of a vmx instruction, as recorded on an
4006  * exit caused by such an instruction (run by a guest hypervisor).
4007  * On success, returns 0. When the operand is invalid, returns 1 and throws
4008  * #UD or #GP.
4009  */
4010 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
4011 			u32 vmx_instruction_info, bool wr, gva_t *ret)
4012 {
4013 	gva_t off;
4014 	bool exn;
4015 	struct kvm_segment s;
4016 
4017 	/*
4018 	 * According to Vol. 3B, "Information for VM Exits Due to Instruction
4019 	 * Execution", on an exit, vmx_instruction_info holds most of the
4020 	 * addressing components of the operand. Only the displacement part
4021 	 * is put in exit_qualification (see 3B, "Basic VM-Exit Information").
4022 	 * For how an actual address is calculated from all these components,
4023 	 * refer to Vol. 1, "Operand Addressing".
4024 	 */
4025 	int  scaling = vmx_instruction_info & 3;
4026 	int  addr_size = (vmx_instruction_info >> 7) & 7;
4027 	bool is_reg = vmx_instruction_info & (1u << 10);
4028 	int  seg_reg = (vmx_instruction_info >> 15) & 7;
4029 	int  index_reg = (vmx_instruction_info >> 18) & 0xf;
4030 	bool index_is_valid = !(vmx_instruction_info & (1u << 22));
4031 	int  base_reg       = (vmx_instruction_info >> 23) & 0xf;
4032 	bool base_is_valid  = !(vmx_instruction_info & (1u << 27));
4033 
4034 	if (is_reg) {
4035 		kvm_queue_exception(vcpu, UD_VECTOR);
4036 		return 1;
4037 	}
4038 
4039 	/* Addr = segment_base + offset */
4040 	/* offset = base + [index * scale] + displacement */
4041 	off = exit_qualification; /* holds the displacement */
4042 	if (addr_size == 1)
4043 		off = (gva_t)sign_extend64(off, 31);
4044 	else if (addr_size == 0)
4045 		off = (gva_t)sign_extend64(off, 15);
4046 	if (base_is_valid)
4047 		off += kvm_register_read(vcpu, base_reg);
4048 	if (index_is_valid)
4049 		off += kvm_register_read(vcpu, index_reg)<<scaling;
4050 	vmx_get_segment(vcpu, &s, seg_reg);
4051 
4052 	/*
4053 	 * The effective address, i.e. @off, of a memory operand is truncated
4054 	 * based on the address size of the instruction.  Note that this is
4055 	 * the *effective address*, i.e. the address prior to accounting for
4056 	 * the segment's base.
4057 	 */
4058 	if (addr_size == 1) /* 32 bit */
4059 		off &= 0xffffffff;
4060 	else if (addr_size == 0) /* 16 bit */
4061 		off &= 0xffff;
4062 
4063 	/* Checks for #GP/#SS exceptions. */
4064 	exn = false;
4065 	if (is_long_mode(vcpu)) {
4066 		/*
4067 		 * The virtual/linear address is never truncated in 64-bit
4068 		 * mode, e.g. a 32-bit address size can yield a 64-bit virtual
4069 		 * address when using FS/GS with a non-zero base.
4070 		 */
4071 		*ret = s.base + off;
4072 
4073 		/* Long mode: #GP(0)/#SS(0) if the memory address is in a
4074 		 * non-canonical form. This is the only check on the memory
4075 		 * destination for long mode!
4076 		 */
4077 		exn = is_noncanonical_address(*ret, vcpu);
4078 	} else {
4079 		/*
4080 		 * When not in long mode, the virtual/linear address is
4081 		 * unconditionally truncated to 32 bits regardless of the
4082 		 * address size.
4083 		 */
4084 		*ret = (s.base + off) & 0xffffffff;
4085 
4086 		/* Protected mode: apply checks for segment validity in the
4087 		 * following order:
4088 		 * - segment type check (#GP(0) may be thrown)
4089 		 * - usability check (#GP(0)/#SS(0))
4090 		 * - limit check (#GP(0)/#SS(0))
4091 		 */
4092 		if (wr)
4093 			/* #GP(0) if the destination operand is located in a
4094 			 * read-only data segment or any code segment.
4095 			 */
4096 			exn = ((s.type & 0xa) == 0 || (s.type & 8));
4097 		else
4098 			/* #GP(0) if the source operand is located in an
4099 			 * execute-only code segment
4100 			 */
4101 			exn = ((s.type & 0xa) == 8);
4102 		if (exn) {
4103 			kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
4104 			return 1;
4105 		}
4106 		/* Protected mode: #GP(0)/#SS(0) if the segment is unusable.
4107 		 */
4108 		exn = (s.unusable != 0);
4109 
4110 		/*
4111 		 * Protected mode: #GP(0)/#SS(0) if the memory operand is
4112 		 * outside the segment limit.  All CPUs that support VMX ignore
4113 		 * limit checks for flat segments, i.e. segments with base==0,
4114 		 * limit==0xffffffff and of type expand-up data or code.
4115 		 */
4116 		if (!(s.base == 0 && s.limit == 0xffffffff &&
4117 		     ((s.type & 8) || !(s.type & 4))))
4118 			exn = exn || (off + sizeof(u64) > s.limit);
4119 	}
4120 	if (exn) {
4121 		kvm_queue_exception_e(vcpu,
4122 				      seg_reg == VCPU_SREG_SS ?
4123 						SS_VECTOR : GP_VECTOR,
4124 				      0);
4125 		return 1;
4126 	}
4127 
4128 	return 0;
4129 }
4130 
4131 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
4132 {
4133 	gva_t gva;
4134 	struct x86_exception e;
4135 
4136 	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
4137 			vmcs_read32(VMX_INSTRUCTION_INFO), false, &gva))
4138 		return 1;
4139 
4140 	if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
4141 		kvm_inject_page_fault(vcpu, &e);
4142 		return 1;
4143 	}
4144 
4145 	return 0;
4146 }
4147 
4148 /*
4149  * Allocate a shadow VMCS and associate it with the currently loaded
4150  * VMCS, unless such a shadow VMCS already exists. The newly allocated
4151  * VMCS is also VMCLEARed, so that it is ready for use.
4152  */
4153 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
4154 {
4155 	struct vcpu_vmx *vmx = to_vmx(vcpu);
4156 	struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
4157 
4158 	/*
4159 	 * We should allocate a shadow vmcs for vmcs01 only when L1
4160 	 * executes VMXON and free it when L1 executes VMXOFF.
4161 	 * As it is invalid to execute VMXON twice, we shouldn't reach
4162 	 * here when vmcs01 already have an allocated shadow vmcs.
4163 	 */
4164 	WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
4165 
4166 	if (!loaded_vmcs->shadow_vmcs) {
4167 		loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
4168 		if (loaded_vmcs->shadow_vmcs)
4169 			vmcs_clear(loaded_vmcs->shadow_vmcs);
4170 	}
4171 	return loaded_vmcs->shadow_vmcs;
4172 }
4173 
4174 static int enter_vmx_operation(struct kvm_vcpu *vcpu)
4175 {
4176 	struct vcpu_vmx *vmx = to_vmx(vcpu);
4177 	int r;
4178 
4179 	r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
4180 	if (r < 0)
4181 		goto out_vmcs02;
4182 
4183 	vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4184 	if (!vmx->nested.cached_vmcs12)
4185 		goto out_cached_vmcs12;
4186 
4187 	vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
4188 	if (!vmx->nested.cached_shadow_vmcs12)
4189 		goto out_cached_shadow_vmcs12;
4190 
4191 	if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
4192 		goto out_shadow_vmcs;
4193 
4194 	hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
4195 		     HRTIMER_MODE_REL_PINNED);
4196 	vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
4197 
4198 	vmx->nested.vpid02 = allocate_vpid();
4199 
4200 	vmx->nested.vmcs02_initialized = false;
4201 	vmx->nested.vmxon = true;
4202 
4203 	if (pt_mode == PT_MODE_HOST_GUEST) {
4204 		vmx->pt_desc.guest.ctl = 0;
4205 		pt_update_intercept_for_msr(vmx);
4206 	}
4207 
4208 	return 0;
4209 
4210 out_shadow_vmcs:
4211 	kfree(vmx->nested.cached_shadow_vmcs12);
4212 
4213 out_cached_shadow_vmcs12:
4214 	kfree(vmx->nested.cached_vmcs12);
4215 
4216 out_cached_vmcs12:
4217 	free_loaded_vmcs(&vmx->nested.vmcs02);
4218 
4219 out_vmcs02:
4220 	return -ENOMEM;
4221 }
4222 
4223 /*
4224  * Emulate the VMXON instruction.
4225  * Currently, we just remember that VMX is active, and do not save or even
4226  * inspect the argument to VMXON (the so-called "VMXON pointer") because we
4227  * do not currently need to store anything in that guest-allocated memory
4228  * region. Consequently, VMCLEAR and VMPTRLD also do not verify that the their
4229  * argument is different from the VMXON pointer (which the spec says they do).
4230  */
4231 static int handle_vmon(struct kvm_vcpu *vcpu)
4232 {
4233 	int ret;
4234 	gpa_t vmptr;
4235 	uint32_t revision;
4236 	struct vcpu_vmx *vmx = to_vmx(vcpu);
4237 	const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
4238 		| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
4239 
4240 	/*
4241 	 * The Intel VMX Instruction Reference lists a bunch of bits that are
4242 	 * prerequisite to running VMXON, most notably cr4.VMXE must be set to
4243 	 * 1 (see vmx_set_cr4() for when we allow the guest to set this).
4244 	 * Otherwise, we should fail with #UD.  But most faulting conditions
4245 	 * have already been checked by hardware, prior to the VM-exit for
4246 	 * VMXON.  We do test guest cr4.VMXE because processor CR4 always has
4247 	 * that bit set to 1 in non-root mode.
4248 	 */
4249 	if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
4250 		kvm_queue_exception(vcpu, UD_VECTOR);
4251 		return 1;
4252 	}
4253 
4254 	/* CPL=0 must be checked manually. */
4255 	if (vmx_get_cpl(vcpu)) {
4256 		kvm_inject_gp(vcpu, 0);
4257 		return 1;
4258 	}
4259 
4260 	if (vmx->nested.vmxon)
4261 		return nested_vmx_failValid(vcpu,
4262 			VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
4263 
4264 	if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
4265 			!= VMXON_NEEDED_FEATURES) {
4266 		kvm_inject_gp(vcpu, 0);
4267 		return 1;
4268 	}
4269 
4270 	if (nested_vmx_get_vmptr(vcpu, &vmptr))
4271 		return 1;
4272 
4273 	/*
4274 	 * SDM 3: 24.11.5
4275 	 * The first 4 bytes of VMXON region contain the supported
4276 	 * VMCS revision identifier
4277 	 *
4278 	 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case;
4279 	 * which replaces physical address width with 32
4280 	 */
4281 	if (!page_address_valid(vcpu, vmptr))
4282 		return nested_vmx_failInvalid(vcpu);
4283 
4284 	if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) ||
4285 	    revision != VMCS12_REVISION)
4286 		return nested_vmx_failInvalid(vcpu);
4287 
4288 	vmx->nested.vmxon_ptr = vmptr;
4289 	ret = enter_vmx_operation(vcpu);
4290 	if (ret)
4291 		return ret;
4292 
4293 	return nested_vmx_succeed(vcpu);
4294 }
4295 
4296 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
4297 {
4298 	struct vcpu_vmx *vmx = to_vmx(vcpu);
4299 
4300 	if (vmx->nested.current_vmptr == -1ull)
4301 		return;
4302 
4303 	if (enable_shadow_vmcs) {
4304 		/* copy to memory all shadowed fields in case
4305 		   they were modified */
4306 		copy_shadow_to_vmcs12(vmx);
4307 		vmx->nested.need_vmcs12_sync = false;
4308 		vmx_disable_shadow_vmcs(vmx);
4309 	}
4310 	vmx->nested.posted_intr_nv = -1;
4311 
4312 	/* Flush VMCS12 to guest memory */
4313 	kvm_vcpu_write_guest_page(vcpu,
4314 				  vmx->nested.current_vmptr >> PAGE_SHIFT,
4315 				  vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
4316 
4317 	kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
4318 
4319 	vmx->nested.current_vmptr = -1ull;
4320 }
4321 
4322 /* Emulate the VMXOFF instruction */
4323 static int handle_vmoff(struct kvm_vcpu *vcpu)
4324 {
4325 	if (!nested_vmx_check_permission(vcpu))
4326 		return 1;
4327 	free_nested(vcpu);
4328 	return nested_vmx_succeed(vcpu);
4329 }
4330 
4331 /* Emulate the VMCLEAR instruction */
4332 static int handle_vmclear(struct kvm_vcpu *vcpu)
4333 {
4334 	struct vcpu_vmx *vmx = to_vmx(vcpu);
4335 	u32 zero = 0;
4336 	gpa_t vmptr;
4337 
4338 	if (!nested_vmx_check_permission(vcpu))
4339 		return 1;
4340 
4341 	if (nested_vmx_get_vmptr(vcpu, &vmptr))
4342 		return 1;
4343 
4344 	if (!page_address_valid(vcpu, vmptr))
4345 		return nested_vmx_failValid(vcpu,
4346 			VMXERR_VMCLEAR_INVALID_ADDRESS);
4347 
4348 	if (vmptr == vmx->nested.vmxon_ptr)
4349 		return nested_vmx_failValid(vcpu,
4350 			VMXERR_VMCLEAR_VMXON_POINTER);
4351 
4352 	if (vmx->nested.hv_evmcs_map.hva) {
4353 		if (vmptr == vmx->nested.hv_evmcs_vmptr)
4354 			nested_release_evmcs(vcpu);
4355 	} else {
4356 		if (vmptr == vmx->nested.current_vmptr)
4357 			nested_release_vmcs12(vcpu);
4358 
4359 		kvm_vcpu_write_guest(vcpu,
4360 				     vmptr + offsetof(struct vmcs12,
4361 						      launch_state),
4362 				     &zero, sizeof(zero));
4363 	}
4364 
4365 	return nested_vmx_succeed(vcpu);
4366 }
4367 
4368 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
4369 
4370 /* Emulate the VMLAUNCH instruction */
4371 static int handle_vmlaunch(struct kvm_vcpu *vcpu)
4372 {
4373 	return nested_vmx_run(vcpu, true);
4374 }
4375 
4376 /* Emulate the VMRESUME instruction */
4377 static int handle_vmresume(struct kvm_vcpu *vcpu)
4378 {
4379 
4380 	return nested_vmx_run(vcpu, false);
4381 }
4382 
4383 static int handle_vmread(struct kvm_vcpu *vcpu)
4384 {
4385 	unsigned long field;
4386 	u64 field_value;
4387 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4388 	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4389 	gva_t gva = 0;
4390 	struct vmcs12 *vmcs12;
4391 
4392 	if (!nested_vmx_check_permission(vcpu))
4393 		return 1;
4394 
4395 	if (to_vmx(vcpu)->nested.current_vmptr == -1ull)
4396 		return nested_vmx_failInvalid(vcpu);
4397 
4398 	if (!is_guest_mode(vcpu))
4399 		vmcs12 = get_vmcs12(vcpu);
4400 	else {
4401 		/*
4402 		 * When vmcs->vmcs_link_pointer is -1ull, any VMREAD
4403 		 * to shadowed-field sets the ALU flags for VMfailInvalid.
4404 		 */
4405 		if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
4406 			return nested_vmx_failInvalid(vcpu);
4407 		vmcs12 = get_shadow_vmcs12(vcpu);
4408 	}
4409 
4410 	/* Decode instruction info and find the field to read */
4411 	field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
4412 	/* Read the field, zero-extended to a u64 field_value */
4413 	if (vmcs12_read_any(vmcs12, field, &field_value) < 0)
4414 		return nested_vmx_failValid(vcpu,
4415 			VMXERR_UNSUPPORTED_VMCS_COMPONENT);
4416 
4417 	/*
4418 	 * Now copy part of this value to register or memory, as requested.
4419 	 * Note that the number of bits actually copied is 32 or 64 depending
4420 	 * on the guest's mode (32 or 64 bit), not on the given field's length.
4421 	 */
4422 	if (vmx_instruction_info & (1u << 10)) {
4423 		kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
4424 			field_value);
4425 	} else {
4426 		if (get_vmx_mem_address(vcpu, exit_qualification,
4427 				vmx_instruction_info, true, &gva))
4428 			return 1;
4429 		/* _system ok, nested_vmx_check_permission has verified cpl=0 */
4430 		kvm_write_guest_virt_system(vcpu, gva, &field_value,
4431 					    (is_long_mode(vcpu) ? 8 : 4), NULL);
4432 	}
4433 
4434 	return nested_vmx_succeed(vcpu);
4435 }
4436 
4437 
4438 static int handle_vmwrite(struct kvm_vcpu *vcpu)
4439 {
4440 	unsigned long field;
4441 	gva_t gva;
4442 	struct vcpu_vmx *vmx = to_vmx(vcpu);
4443 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4444 	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4445 
4446 	/* The value to write might be 32 or 64 bits, depending on L1's long
4447 	 * mode, and eventually we need to write that into a field of several
4448 	 * possible lengths. The code below first zero-extends the value to 64
4449 	 * bit (field_value), and then copies only the appropriate number of
4450 	 * bits into the vmcs12 field.
4451 	 */
4452 	u64 field_value = 0;
4453 	struct x86_exception e;
4454 	struct vmcs12 *vmcs12;
4455 
4456 	if (!nested_vmx_check_permission(vcpu))
4457 		return 1;
4458 
4459 	if (vmx->nested.current_vmptr == -1ull)
4460 		return nested_vmx_failInvalid(vcpu);
4461 
4462 	if (vmx_instruction_info & (1u << 10))
4463 		field_value = kvm_register_readl(vcpu,
4464 			(((vmx_instruction_info) >> 3) & 0xf));
4465 	else {
4466 		if (get_vmx_mem_address(vcpu, exit_qualification,
4467 				vmx_instruction_info, false, &gva))
4468 			return 1;
4469 		if (kvm_read_guest_virt(vcpu, gva, &field_value,
4470 					(is_64_bit_mode(vcpu) ? 8 : 4), &e)) {
4471 			kvm_inject_page_fault(vcpu, &e);
4472 			return 1;
4473 		}
4474 	}
4475 
4476 
4477 	field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
4478 	/*
4479 	 * If the vCPU supports "VMWRITE to any supported field in the
4480 	 * VMCS," then the "read-only" fields are actually read/write.
4481 	 */
4482 	if (vmcs_field_readonly(field) &&
4483 	    !nested_cpu_has_vmwrite_any_field(vcpu))
4484 		return nested_vmx_failValid(vcpu,
4485 			VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
4486 
4487 	if (!is_guest_mode(vcpu))
4488 		vmcs12 = get_vmcs12(vcpu);
4489 	else {
4490 		/*
4491 		 * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
4492 		 * to shadowed-field sets the ALU flags for VMfailInvalid.
4493 		 */
4494 		if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
4495 			return nested_vmx_failInvalid(vcpu);
4496 		vmcs12 = get_shadow_vmcs12(vcpu);
4497 	}
4498 
4499 	if (vmcs12_write_any(vmcs12, field, field_value) < 0)
4500 		return nested_vmx_failValid(vcpu,
4501 			VMXERR_UNSUPPORTED_VMCS_COMPONENT);
4502 
4503 	/*
4504 	 * Do not track vmcs12 dirty-state if in guest-mode
4505 	 * as we actually dirty shadow vmcs12 instead of vmcs12.
4506 	 */
4507 	if (!is_guest_mode(vcpu)) {
4508 		switch (field) {
4509 #define SHADOW_FIELD_RW(x) case x:
4510 #include "vmcs_shadow_fields.h"
4511 			/*
4512 			 * The fields that can be updated by L1 without a vmexit are
4513 			 * always updated in the vmcs02, the others go down the slow
4514 			 * path of prepare_vmcs02.
4515 			 */
4516 			break;
4517 		default:
4518 			vmx->nested.dirty_vmcs12 = true;
4519 			break;
4520 		}
4521 	}
4522 
4523 	return nested_vmx_succeed(vcpu);
4524 }
4525 
4526 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
4527 {
4528 	vmx->nested.current_vmptr = vmptr;
4529 	if (enable_shadow_vmcs) {
4530 		vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
4531 			      SECONDARY_EXEC_SHADOW_VMCS);
4532 		vmcs_write64(VMCS_LINK_POINTER,
4533 			     __pa(vmx->vmcs01.shadow_vmcs));
4534 		vmx->nested.need_vmcs12_sync = true;
4535 	}
4536 	vmx->nested.dirty_vmcs12 = true;
4537 }
4538 
4539 /* Emulate the VMPTRLD instruction */
4540 static int handle_vmptrld(struct kvm_vcpu *vcpu)
4541 {
4542 	struct vcpu_vmx *vmx = to_vmx(vcpu);
4543 	gpa_t vmptr;
4544 
4545 	if (!nested_vmx_check_permission(vcpu))
4546 		return 1;
4547 
4548 	if (nested_vmx_get_vmptr(vcpu, &vmptr))
4549 		return 1;
4550 
4551 	if (!page_address_valid(vcpu, vmptr))
4552 		return nested_vmx_failValid(vcpu,
4553 			VMXERR_VMPTRLD_INVALID_ADDRESS);
4554 
4555 	if (vmptr == vmx->nested.vmxon_ptr)
4556 		return nested_vmx_failValid(vcpu,
4557 			VMXERR_VMPTRLD_VMXON_POINTER);
4558 
4559 	/* Forbid normal VMPTRLD if Enlightened version was used */
4560 	if (vmx->nested.hv_evmcs)
4561 		return 1;
4562 
4563 	if (vmx->nested.current_vmptr != vmptr) {
4564 		struct kvm_host_map map;
4565 		struct vmcs12 *new_vmcs12;
4566 
4567 		if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmptr), &map)) {
4568 			/*
4569 			 * Reads from an unbacked page return all 1s,
4570 			 * which means that the 32 bits located at the
4571 			 * given physical address won't match the required
4572 			 * VMCS12_REVISION identifier.
4573 			 */
4574 			return nested_vmx_failValid(vcpu,
4575 				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
4576 		}
4577 
4578 		new_vmcs12 = map.hva;
4579 
4580 		if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
4581 		    (new_vmcs12->hdr.shadow_vmcs &&
4582 		     !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
4583 			kvm_vcpu_unmap(vcpu, &map, false);
4584 			return nested_vmx_failValid(vcpu,
4585 				VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
4586 		}
4587 
4588 		nested_release_vmcs12(vcpu);
4589 
4590 		/*
4591 		 * Load VMCS12 from guest memory since it is not already
4592 		 * cached.
4593 		 */
4594 		memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
4595 		kvm_vcpu_unmap(vcpu, &map, false);
4596 
4597 		set_current_vmptr(vmx, vmptr);
4598 	}
4599 
4600 	return nested_vmx_succeed(vcpu);
4601 }
4602 
4603 /* Emulate the VMPTRST instruction */
4604 static int handle_vmptrst(struct kvm_vcpu *vcpu)
4605 {
4606 	unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION);
4607 	u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4608 	gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
4609 	struct x86_exception e;
4610 	gva_t gva;
4611 
4612 	if (!nested_vmx_check_permission(vcpu))
4613 		return 1;
4614 
4615 	if (unlikely(to_vmx(vcpu)->nested.hv_evmcs))
4616 		return 1;
4617 
4618 	if (get_vmx_mem_address(vcpu, exit_qual, instr_info, true, &gva))
4619 		return 1;
4620 	/* *_system ok, nested_vmx_check_permission has verified cpl=0 */
4621 	if (kvm_write_guest_virt_system(vcpu, gva, (void *)&current_vmptr,
4622 					sizeof(gpa_t), &e)) {
4623 		kvm_inject_page_fault(vcpu, &e);
4624 		return 1;
4625 	}
4626 	return nested_vmx_succeed(vcpu);
4627 }
4628 
4629 /* Emulate the INVEPT instruction */
4630 static int handle_invept(struct kvm_vcpu *vcpu)
4631 {
4632 	struct vcpu_vmx *vmx = to_vmx(vcpu);
4633 	u32 vmx_instruction_info, types;
4634 	unsigned long type;
4635 	gva_t gva;
4636 	struct x86_exception e;
4637 	struct {
4638 		u64 eptp, gpa;
4639 	} operand;
4640 
4641 	if (!(vmx->nested.msrs.secondary_ctls_high &
4642 	      SECONDARY_EXEC_ENABLE_EPT) ||
4643 	    !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) {
4644 		kvm_queue_exception(vcpu, UD_VECTOR);
4645 		return 1;
4646 	}
4647 
4648 	if (!nested_vmx_check_permission(vcpu))
4649 		return 1;
4650 
4651 	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4652 	type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
4653 
4654 	types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
4655 
4656 	if (type >= 32 || !(types & (1 << type)))
4657 		return nested_vmx_failValid(vcpu,
4658 				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
4659 
4660 	/* According to the Intel VMX instruction reference, the memory
4661 	 * operand is read even if it isn't needed (e.g., for type==global)
4662 	 */
4663 	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
4664 			vmx_instruction_info, false, &gva))
4665 		return 1;
4666 	if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
4667 		kvm_inject_page_fault(vcpu, &e);
4668 		return 1;
4669 	}
4670 
4671 	switch (type) {
4672 	case VMX_EPT_EXTENT_GLOBAL:
4673 	/*
4674 	 * TODO: track mappings and invalidate
4675 	 * single context requests appropriately
4676 	 */
4677 	case VMX_EPT_EXTENT_CONTEXT:
4678 		kvm_mmu_sync_roots(vcpu);
4679 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
4680 		break;
4681 	default:
4682 		BUG_ON(1);
4683 		break;
4684 	}
4685 
4686 	return nested_vmx_succeed(vcpu);
4687 }
4688 
4689 static int handle_invvpid(struct kvm_vcpu *vcpu)
4690 {
4691 	struct vcpu_vmx *vmx = to_vmx(vcpu);
4692 	u32 vmx_instruction_info;
4693 	unsigned long type, types;
4694 	gva_t gva;
4695 	struct x86_exception e;
4696 	struct {
4697 		u64 vpid;
4698 		u64 gla;
4699 	} operand;
4700 	u16 vpid02;
4701 
4702 	if (!(vmx->nested.msrs.secondary_ctls_high &
4703 	      SECONDARY_EXEC_ENABLE_VPID) ||
4704 			!(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) {
4705 		kvm_queue_exception(vcpu, UD_VECTOR);
4706 		return 1;
4707 	}
4708 
4709 	if (!nested_vmx_check_permission(vcpu))
4710 		return 1;
4711 
4712 	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
4713 	type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
4714 
4715 	types = (vmx->nested.msrs.vpid_caps &
4716 			VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
4717 
4718 	if (type >= 32 || !(types & (1 << type)))
4719 		return nested_vmx_failValid(vcpu,
4720 			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
4721 
4722 	/* according to the intel vmx instruction reference, the memory
4723 	 * operand is read even if it isn't needed (e.g., for type==global)
4724 	 */
4725 	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
4726 			vmx_instruction_info, false, &gva))
4727 		return 1;
4728 	if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
4729 		kvm_inject_page_fault(vcpu, &e);
4730 		return 1;
4731 	}
4732 	if (operand.vpid >> 16)
4733 		return nested_vmx_failValid(vcpu,
4734 			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
4735 
4736 	vpid02 = nested_get_vpid02(vcpu);
4737 	switch (type) {
4738 	case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
4739 		if (!operand.vpid ||
4740 		    is_noncanonical_address(operand.gla, vcpu))
4741 			return nested_vmx_failValid(vcpu,
4742 				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
4743 		if (cpu_has_vmx_invvpid_individual_addr()) {
4744 			__invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
4745 				vpid02, operand.gla);
4746 		} else
4747 			__vmx_flush_tlb(vcpu, vpid02, false);
4748 		break;
4749 	case VMX_VPID_EXTENT_SINGLE_CONTEXT:
4750 	case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
4751 		if (!operand.vpid)
4752 			return nested_vmx_failValid(vcpu,
4753 				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
4754 		__vmx_flush_tlb(vcpu, vpid02, false);
4755 		break;
4756 	case VMX_VPID_EXTENT_ALL_CONTEXT:
4757 		__vmx_flush_tlb(vcpu, vpid02, false);
4758 		break;
4759 	default:
4760 		WARN_ON_ONCE(1);
4761 		return kvm_skip_emulated_instruction(vcpu);
4762 	}
4763 
4764 	return nested_vmx_succeed(vcpu);
4765 }
4766 
4767 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu,
4768 				     struct vmcs12 *vmcs12)
4769 {
4770 	u32 index = kvm_rcx_read(vcpu);
4771 	u64 address;
4772 	bool accessed_dirty;
4773 	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
4774 
4775 	if (!nested_cpu_has_eptp_switching(vmcs12) ||
4776 	    !nested_cpu_has_ept(vmcs12))
4777 		return 1;
4778 
4779 	if (index >= VMFUNC_EPTP_ENTRIES)
4780 		return 1;
4781 
4782 
4783 	if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
4784 				     &address, index * 8, 8))
4785 		return 1;
4786 
4787 	accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);
4788 
4789 	/*
4790 	 * If the (L2) guest does a vmfunc to the currently
4791 	 * active ept pointer, we don't have to do anything else
4792 	 */
4793 	if (vmcs12->ept_pointer != address) {
4794 		if (!valid_ept_address(vcpu, address))
4795 			return 1;
4796 
4797 		kvm_mmu_unload(vcpu);
4798 		mmu->ept_ad = accessed_dirty;
4799 		mmu->mmu_role.base.ad_disabled = !accessed_dirty;
4800 		vmcs12->ept_pointer = address;
4801 		/*
4802 		 * TODO: Check what's the correct approach in case
4803 		 * mmu reload fails. Currently, we just let the next
4804 		 * reload potentially fail
4805 		 */
4806 		kvm_mmu_reload(vcpu);
4807 	}
4808 
4809 	return 0;
4810 }
4811 
4812 static int handle_vmfunc(struct kvm_vcpu *vcpu)
4813 {
4814 	struct vcpu_vmx *vmx = to_vmx(vcpu);
4815 	struct vmcs12 *vmcs12;
4816 	u32 function = kvm_rax_read(vcpu);
4817 
4818 	/*
4819 	 * VMFUNC is only supported for nested guests, but we always enable the
4820 	 * secondary control for simplicity; for non-nested mode, fake that we
4821 	 * didn't by injecting #UD.
4822 	 */
4823 	if (!is_guest_mode(vcpu)) {
4824 		kvm_queue_exception(vcpu, UD_VECTOR);
4825 		return 1;
4826 	}
4827 
4828 	vmcs12 = get_vmcs12(vcpu);
4829 	if ((vmcs12->vm_function_control & (1 << function)) == 0)
4830 		goto fail;
4831 
4832 	switch (function) {
4833 	case 0:
4834 		if (nested_vmx_eptp_switching(vcpu, vmcs12))
4835 			goto fail;
4836 		break;
4837 	default:
4838 		goto fail;
4839 	}
4840 	return kvm_skip_emulated_instruction(vcpu);
4841 
4842 fail:
4843 	nested_vmx_vmexit(vcpu, vmx->exit_reason,
4844 			  vmcs_read32(VM_EXIT_INTR_INFO),
4845 			  vmcs_readl(EXIT_QUALIFICATION));
4846 	return 1;
4847 }
4848 
4849 
4850 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
4851 				       struct vmcs12 *vmcs12)
4852 {
4853 	unsigned long exit_qualification;
4854 	gpa_t bitmap, last_bitmap;
4855 	unsigned int port;
4856 	int size;
4857 	u8 b;
4858 
4859 	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
4860 		return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
4861 
4862 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4863 
4864 	port = exit_qualification >> 16;
4865 	size = (exit_qualification & 7) + 1;
4866 
4867 	last_bitmap = (gpa_t)-1;
4868 	b = -1;
4869 
4870 	while (size > 0) {
4871 		if (port < 0x8000)
4872 			bitmap = vmcs12->io_bitmap_a;
4873 		else if (port < 0x10000)
4874 			bitmap = vmcs12->io_bitmap_b;
4875 		else
4876 			return true;
4877 		bitmap += (port & 0x7fff) / 8;
4878 
4879 		if (last_bitmap != bitmap)
4880 			if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1))
4881 				return true;
4882 		if (b & (1 << (port & 7)))
4883 			return true;
4884 
4885 		port++;
4886 		size--;
4887 		last_bitmap = bitmap;
4888 	}
4889 
4890 	return false;
4891 }
4892 
4893 /*
4894  * Return 1 if we should exit from L2 to L1 to handle an MSR access access,
4895  * rather than handle it ourselves in L0. I.e., check whether L1 expressed
4896  * disinterest in the current event (read or write a specific MSR) by using an
4897  * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
4898  */
4899 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
4900 	struct vmcs12 *vmcs12, u32 exit_reason)
4901 {
4902 	u32 msr_index = kvm_rcx_read(vcpu);
4903 	gpa_t bitmap;
4904 
4905 	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
4906 		return true;
4907 
4908 	/*
4909 	 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
4910 	 * for the four combinations of read/write and low/high MSR numbers.
4911 	 * First we need to figure out which of the four to use:
4912 	 */
4913 	bitmap = vmcs12->msr_bitmap;
4914 	if (exit_reason == EXIT_REASON_MSR_WRITE)
4915 		bitmap += 2048;
4916 	if (msr_index >= 0xc0000000) {
4917 		msr_index -= 0xc0000000;
4918 		bitmap += 1024;
4919 	}
4920 
4921 	/* Then read the msr_index'th bit from this bitmap: */
4922 	if (msr_index < 1024*8) {
4923 		unsigned char b;
4924 		if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1))
4925 			return true;
4926 		return 1 & (b >> (msr_index & 7));
4927 	} else
4928 		return true; /* let L1 handle the wrong parameter */
4929 }
4930 
4931 /*
4932  * Return 1 if we should exit from L2 to L1 to handle a CR access exit,
4933  * rather than handle it ourselves in L0. I.e., check if L1 wanted to
4934  * intercept (via guest_host_mask etc.) the current event.
4935  */
4936 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
4937 	struct vmcs12 *vmcs12)
4938 {
4939 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
4940 	int cr = exit_qualification & 15;
4941 	int reg;
4942 	unsigned long val;
4943 
4944 	switch ((exit_qualification >> 4) & 3) {
4945 	case 0: /* mov to cr */
4946 		reg = (exit_qualification >> 8) & 15;
4947 		val = kvm_register_readl(vcpu, reg);
4948 		switch (cr) {
4949 		case 0:
4950 			if (vmcs12->cr0_guest_host_mask &
4951 			    (val ^ vmcs12->cr0_read_shadow))
4952 				return true;
4953 			break;
4954 		case 3:
4955 			if ((vmcs12->cr3_target_count >= 1 &&
4956 					vmcs12->cr3_target_value0 == val) ||
4957 				(vmcs12->cr3_target_count >= 2 &&
4958 					vmcs12->cr3_target_value1 == val) ||
4959 				(vmcs12->cr3_target_count >= 3 &&
4960 					vmcs12->cr3_target_value2 == val) ||
4961 				(vmcs12->cr3_target_count >= 4 &&
4962 					vmcs12->cr3_target_value3 == val))
4963 				return false;
4964 			if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
4965 				return true;
4966 			break;
4967 		case 4:
4968 			if (vmcs12->cr4_guest_host_mask &
4969 			    (vmcs12->cr4_read_shadow ^ val))
4970 				return true;
4971 			break;
4972 		case 8:
4973 			if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
4974 				return true;
4975 			break;
4976 		}
4977 		break;
4978 	case 2: /* clts */
4979 		if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
4980 		    (vmcs12->cr0_read_shadow & X86_CR0_TS))
4981 			return true;
4982 		break;
4983 	case 1: /* mov from cr */
4984 		switch (cr) {
4985 		case 3:
4986 			if (vmcs12->cpu_based_vm_exec_control &
4987 			    CPU_BASED_CR3_STORE_EXITING)
4988 				return true;
4989 			break;
4990 		case 8:
4991 			if (vmcs12->cpu_based_vm_exec_control &
4992 			    CPU_BASED_CR8_STORE_EXITING)
4993 				return true;
4994 			break;
4995 		}
4996 		break;
4997 	case 3: /* lmsw */
4998 		/*
4999 		 * lmsw can change bits 1..3 of cr0, and only set bit 0 of
5000 		 * cr0. Other attempted changes are ignored, with no exit.
5001 		 */
5002 		val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
5003 		if (vmcs12->cr0_guest_host_mask & 0xe &
5004 		    (val ^ vmcs12->cr0_read_shadow))
5005 			return true;
5006 		if ((vmcs12->cr0_guest_host_mask & 0x1) &&
5007 		    !(vmcs12->cr0_read_shadow & 0x1) &&
5008 		    (val & 0x1))
5009 			return true;
5010 		break;
5011 	}
5012 	return false;
5013 }
5014 
5015 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
5016 	struct vmcs12 *vmcs12, gpa_t bitmap)
5017 {
5018 	u32 vmx_instruction_info;
5019 	unsigned long field;
5020 	u8 b;
5021 
5022 	if (!nested_cpu_has_shadow_vmcs(vmcs12))
5023 		return true;
5024 
5025 	/* Decode instruction info and find the field to access */
5026 	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
5027 	field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
5028 
5029 	/* Out-of-range fields always cause a VM exit from L2 to L1 */
5030 	if (field >> 15)
5031 		return true;
5032 
5033 	if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
5034 		return true;
5035 
5036 	return 1 & (b >> (field & 7));
5037 }
5038 
5039 /*
5040  * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
5041  * should handle it ourselves in L0 (and then continue L2). Only call this
5042  * when in is_guest_mode (L2).
5043  */
5044 bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
5045 {
5046 	u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
5047 	struct vcpu_vmx *vmx = to_vmx(vcpu);
5048 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5049 
5050 	if (vmx->nested.nested_run_pending)
5051 		return false;
5052 
5053 	if (unlikely(vmx->fail)) {
5054 		pr_info_ratelimited("%s failed vm entry %x\n", __func__,
5055 				    vmcs_read32(VM_INSTRUCTION_ERROR));
5056 		return true;
5057 	}
5058 
5059 	/*
5060 	 * The host physical addresses of some pages of guest memory
5061 	 * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
5062 	 * Page). The CPU may write to these pages via their host
5063 	 * physical address while L2 is running, bypassing any
5064 	 * address-translation-based dirty tracking (e.g. EPT write
5065 	 * protection).
5066 	 *
5067 	 * Mark them dirty on every exit from L2 to prevent them from
5068 	 * getting out of sync with dirty tracking.
5069 	 */
5070 	nested_mark_vmcs12_pages_dirty(vcpu);
5071 
5072 	trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
5073 				vmcs_readl(EXIT_QUALIFICATION),
5074 				vmx->idt_vectoring_info,
5075 				intr_info,
5076 				vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
5077 				KVM_ISA_VMX);
5078 
5079 	switch (exit_reason) {
5080 	case EXIT_REASON_EXCEPTION_NMI:
5081 		if (is_nmi(intr_info))
5082 			return false;
5083 		else if (is_page_fault(intr_info))
5084 			return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
5085 		else if (is_debug(intr_info) &&
5086 			 vcpu->guest_debug &
5087 			 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
5088 			return false;
5089 		else if (is_breakpoint(intr_info) &&
5090 			 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5091 			return false;
5092 		return vmcs12->exception_bitmap &
5093 				(1u << (intr_info & INTR_INFO_VECTOR_MASK));
5094 	case EXIT_REASON_EXTERNAL_INTERRUPT:
5095 		return false;
5096 	case EXIT_REASON_TRIPLE_FAULT:
5097 		return true;
5098 	case EXIT_REASON_PENDING_INTERRUPT:
5099 		return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
5100 	case EXIT_REASON_NMI_WINDOW:
5101 		return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
5102 	case EXIT_REASON_TASK_SWITCH:
5103 		return true;
5104 	case EXIT_REASON_CPUID:
5105 		return true;
5106 	case EXIT_REASON_HLT:
5107 		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
5108 	case EXIT_REASON_INVD:
5109 		return true;
5110 	case EXIT_REASON_INVLPG:
5111 		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5112 	case EXIT_REASON_RDPMC:
5113 		return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
5114 	case EXIT_REASON_RDRAND:
5115 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
5116 	case EXIT_REASON_RDSEED:
5117 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
5118 	case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
5119 		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
5120 	case EXIT_REASON_VMREAD:
5121 		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5122 			vmcs12->vmread_bitmap);
5123 	case EXIT_REASON_VMWRITE:
5124 		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
5125 			vmcs12->vmwrite_bitmap);
5126 	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
5127 	case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
5128 	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
5129 	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
5130 	case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
5131 		/*
5132 		 * VMX instructions trap unconditionally. This allows L1 to
5133 		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
5134 		 */
5135 		return true;
5136 	case EXIT_REASON_CR_ACCESS:
5137 		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
5138 	case EXIT_REASON_DR_ACCESS:
5139 		return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING);
5140 	case EXIT_REASON_IO_INSTRUCTION:
5141 		return nested_vmx_exit_handled_io(vcpu, vmcs12);
5142 	case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR:
5143 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC);
5144 	case EXIT_REASON_MSR_READ:
5145 	case EXIT_REASON_MSR_WRITE:
5146 		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
5147 	case EXIT_REASON_INVALID_STATE:
5148 		return true;
5149 	case EXIT_REASON_MWAIT_INSTRUCTION:
5150 		return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
5151 	case EXIT_REASON_MONITOR_TRAP_FLAG:
5152 		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
5153 	case EXIT_REASON_MONITOR_INSTRUCTION:
5154 		return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
5155 	case EXIT_REASON_PAUSE_INSTRUCTION:
5156 		return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) ||
5157 			nested_cpu_has2(vmcs12,
5158 				SECONDARY_EXEC_PAUSE_LOOP_EXITING);
5159 	case EXIT_REASON_MCE_DURING_VMENTRY:
5160 		return false;
5161 	case EXIT_REASON_TPR_BELOW_THRESHOLD:
5162 		return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
5163 	case EXIT_REASON_APIC_ACCESS:
5164 	case EXIT_REASON_APIC_WRITE:
5165 	case EXIT_REASON_EOI_INDUCED:
5166 		/*
5167 		 * The controls for "virtualize APIC accesses," "APIC-
5168 		 * register virtualization," and "virtual-interrupt
5169 		 * delivery" only come from vmcs12.
5170 		 */
5171 		return true;
5172 	case EXIT_REASON_EPT_VIOLATION:
5173 		/*
5174 		 * L0 always deals with the EPT violation. If nested EPT is
5175 		 * used, and the nested mmu code discovers that the address is
5176 		 * missing in the guest EPT table (EPT12), the EPT violation
5177 		 * will be injected with nested_ept_inject_page_fault()
5178 		 */
5179 		return false;
5180 	case EXIT_REASON_EPT_MISCONFIG:
5181 		/*
5182 		 * L2 never uses directly L1's EPT, but rather L0's own EPT
5183 		 * table (shadow on EPT) or a merged EPT table that L0 built
5184 		 * (EPT on EPT). So any problems with the structure of the
5185 		 * table is L0's fault.
5186 		 */
5187 		return false;
5188 	case EXIT_REASON_INVPCID:
5189 		return
5190 			nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
5191 			nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
5192 	case EXIT_REASON_WBINVD:
5193 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
5194 	case EXIT_REASON_XSETBV:
5195 		return true;
5196 	case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
5197 		/*
5198 		 * This should never happen, since it is not possible to
5199 		 * set XSS to a non-zero value---neither in L1 nor in L2.
5200 		 * If if it were, XSS would have to be checked against
5201 		 * the XSS exit bitmap in vmcs12.
5202 		 */
5203 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
5204 	case EXIT_REASON_PREEMPTION_TIMER:
5205 		return false;
5206 	case EXIT_REASON_PML_FULL:
5207 		/* We emulate PML support to L1. */
5208 		return false;
5209 	case EXIT_REASON_VMFUNC:
5210 		/* VM functions are emulated through L2->L0 vmexits. */
5211 		return false;
5212 	case EXIT_REASON_ENCLS:
5213 		/* SGX is never exposed to L1 */
5214 		return false;
5215 	default:
5216 		return true;
5217 	}
5218 }
5219 
5220 
5221 static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
5222 				struct kvm_nested_state __user *user_kvm_nested_state,
5223 				u32 user_data_size)
5224 {
5225 	struct vcpu_vmx *vmx;
5226 	struct vmcs12 *vmcs12;
5227 	struct kvm_nested_state kvm_state = {
5228 		.flags = 0,
5229 		.format = 0,
5230 		.size = sizeof(kvm_state),
5231 		.vmx.vmxon_pa = -1ull,
5232 		.vmx.vmcs_pa = -1ull,
5233 	};
5234 
5235 	if (!vcpu)
5236 		return kvm_state.size + 2 * VMCS12_SIZE;
5237 
5238 	vmx = to_vmx(vcpu);
5239 	vmcs12 = get_vmcs12(vcpu);
5240 
5241 	if (nested_vmx_allowed(vcpu) && vmx->nested.enlightened_vmcs_enabled)
5242 		kvm_state.flags |= KVM_STATE_NESTED_EVMCS;
5243 
5244 	if (nested_vmx_allowed(vcpu) &&
5245 	    (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
5246 		kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
5247 		kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr;
5248 
5249 		if (vmx_has_valid_vmcs12(vcpu)) {
5250 			kvm_state.size += VMCS12_SIZE;
5251 
5252 			if (is_guest_mode(vcpu) &&
5253 			    nested_cpu_has_shadow_vmcs(vmcs12) &&
5254 			    vmcs12->vmcs_link_pointer != -1ull)
5255 				kvm_state.size += VMCS12_SIZE;
5256 		}
5257 
5258 		if (vmx->nested.smm.vmxon)
5259 			kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
5260 
5261 		if (vmx->nested.smm.guest_mode)
5262 			kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
5263 
5264 		if (is_guest_mode(vcpu)) {
5265 			kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
5266 
5267 			if (vmx->nested.nested_run_pending)
5268 				kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
5269 		}
5270 	}
5271 
5272 	if (user_data_size < kvm_state.size)
5273 		goto out;
5274 
5275 	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
5276 		return -EFAULT;
5277 
5278 	if (!vmx_has_valid_vmcs12(vcpu))
5279 		goto out;
5280 
5281 	/*
5282 	 * When running L2, the authoritative vmcs12 state is in the
5283 	 * vmcs02. When running L1, the authoritative vmcs12 state is
5284 	 * in the shadow or enlightened vmcs linked to vmcs01, unless
5285 	 * need_vmcs12_sync is set, in which case, the authoritative
5286 	 * vmcs12 state is in the vmcs12 already.
5287 	 */
5288 	if (is_guest_mode(vcpu)) {
5289 		sync_vmcs12(vcpu, vmcs12);
5290 	} else if (!vmx->nested.need_vmcs12_sync) {
5291 		if (vmx->nested.hv_evmcs)
5292 			copy_enlightened_to_vmcs12(vmx);
5293 		else if (enable_shadow_vmcs)
5294 			copy_shadow_to_vmcs12(vmx);
5295 	}
5296 
5297 	/*
5298 	 * Copy over the full allocated size of vmcs12 rather than just the size
5299 	 * of the struct.
5300 	 */
5301 	if (copy_to_user(user_kvm_nested_state->data, vmcs12, VMCS12_SIZE))
5302 		return -EFAULT;
5303 
5304 	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
5305 	    vmcs12->vmcs_link_pointer != -1ull) {
5306 		if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE,
5307 				 get_shadow_vmcs12(vcpu), VMCS12_SIZE))
5308 			return -EFAULT;
5309 	}
5310 
5311 out:
5312 	return kvm_state.size;
5313 }
5314 
5315 /*
5316  * Forcibly leave nested mode in order to be able to reset the VCPU later on.
5317  */
5318 void vmx_leave_nested(struct kvm_vcpu *vcpu)
5319 {
5320 	if (is_guest_mode(vcpu)) {
5321 		to_vmx(vcpu)->nested.nested_run_pending = 0;
5322 		nested_vmx_vmexit(vcpu, -1, 0, 0);
5323 	}
5324 	free_nested(vcpu);
5325 }
5326 
5327 static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
5328 				struct kvm_nested_state __user *user_kvm_nested_state,
5329 				struct kvm_nested_state *kvm_state)
5330 {
5331 	struct vcpu_vmx *vmx = to_vmx(vcpu);
5332 	struct vmcs12 *vmcs12;
5333 	u32 exit_qual;
5334 	int ret;
5335 
5336 	if (kvm_state->format != 0)
5337 		return -EINVAL;
5338 
5339 	if (!nested_vmx_allowed(vcpu))
5340 		return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL;
5341 
5342 	if (kvm_state->vmx.vmxon_pa == -1ull) {
5343 		if (kvm_state->vmx.smm.flags)
5344 			return -EINVAL;
5345 
5346 		if (kvm_state->vmx.vmcs_pa != -1ull)
5347 			return -EINVAL;
5348 
5349 		vmx_leave_nested(vcpu);
5350 		return 0;
5351 	}
5352 
5353 	if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa))
5354 		return -EINVAL;
5355 
5356 	if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
5357 	    (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
5358 		return -EINVAL;
5359 
5360 	if (kvm_state->vmx.smm.flags &
5361 	    ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
5362 		return -EINVAL;
5363 
5364 	/*
5365 	 * SMM temporarily disables VMX, so we cannot be in guest mode,
5366 	 * nor can VMLAUNCH/VMRESUME be pending.  Outside SMM, SMM flags
5367 	 * must be zero.
5368 	 */
5369 	if (is_smm(vcpu) ? kvm_state->flags : kvm_state->vmx.smm.flags)
5370 		return -EINVAL;
5371 
5372 	if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
5373 	    !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
5374 		return -EINVAL;
5375 
5376 	vmx_leave_nested(vcpu);
5377 	if (kvm_state->vmx.vmxon_pa == -1ull)
5378 		return 0;
5379 
5380 	if (kvm_state->flags & KVM_STATE_NESTED_EVMCS)
5381 		nested_enable_evmcs(vcpu, NULL);
5382 
5383 	vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa;
5384 	ret = enter_vmx_operation(vcpu);
5385 	if (ret)
5386 		return ret;
5387 
5388 	/* Empty 'VMXON' state is permitted */
5389 	if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12))
5390 		return 0;
5391 
5392 	if (kvm_state->vmx.vmcs_pa != -1ull) {
5393 		if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
5394 		    !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
5395 			return -EINVAL;
5396 
5397 		set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
5398 	} else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
5399 		/*
5400 		 * Sync eVMCS upon entry as we may not have
5401 		 * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
5402 		 */
5403 		vmx->nested.need_vmcs12_sync = true;
5404 	} else {
5405 		return -EINVAL;
5406 	}
5407 
5408 	if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
5409 		vmx->nested.smm.vmxon = true;
5410 		vmx->nested.vmxon = false;
5411 
5412 		if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
5413 			vmx->nested.smm.guest_mode = true;
5414 	}
5415 
5416 	vmcs12 = get_vmcs12(vcpu);
5417 	if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12)))
5418 		return -EFAULT;
5419 
5420 	if (vmcs12->hdr.revision_id != VMCS12_REVISION)
5421 		return -EINVAL;
5422 
5423 	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
5424 		return 0;
5425 
5426 	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
5427 	    vmcs12->vmcs_link_pointer != -1ull) {
5428 		struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
5429 
5430 		if (kvm_state->size < sizeof(*kvm_state) + 2 * sizeof(*vmcs12))
5431 			return -EINVAL;
5432 
5433 		if (copy_from_user(shadow_vmcs12,
5434 				   user_kvm_nested_state->data + VMCS12_SIZE,
5435 				   sizeof(*vmcs12)))
5436 			return -EFAULT;
5437 
5438 		if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
5439 		    !shadow_vmcs12->hdr.shadow_vmcs)
5440 			return -EINVAL;
5441 	}
5442 
5443 	if (nested_vmx_check_controls(vcpu, vmcs12) ||
5444 	    nested_vmx_check_host_state(vcpu, vmcs12) ||
5445 	    nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
5446 		return -EINVAL;
5447 
5448 	vmx->nested.dirty_vmcs12 = true;
5449 	vmx->nested.nested_run_pending =
5450 		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
5451 
5452 	ret = nested_vmx_enter_non_root_mode(vcpu, false);
5453 	if (ret) {
5454 		vmx->nested.nested_run_pending = 0;
5455 		return -EINVAL;
5456 	}
5457 
5458 	return 0;
5459 }
5460 
5461 void nested_vmx_vcpu_setup(void)
5462 {
5463 	if (enable_shadow_vmcs) {
5464 		/*
5465 		 * At vCPU creation, "VMWRITE to any supported field
5466 		 * in the VMCS" is supported, so use the more
5467 		 * permissive vmx_vmread_bitmap to specify both read
5468 		 * and write permissions for the shadow VMCS.
5469 		 */
5470 		vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
5471 		vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmread_bitmap));
5472 	}
5473 }
5474 
5475 /*
5476  * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
5477  * returned for the various VMX controls MSRs when nested VMX is enabled.
5478  * The same values should also be used to verify that vmcs12 control fields are
5479  * valid during nested entry from L1 to L2.
5480  * Each of these control msrs has a low and high 32-bit half: A low bit is on
5481  * if the corresponding bit in the (32-bit) control field *must* be on, and a
5482  * bit in the high half is on if the corresponding bit in the control field
5483  * may be on. See also vmx_control_verify().
5484  */
5485 void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
5486 				bool apicv)
5487 {
5488 	/*
5489 	 * Note that as a general rule, the high half of the MSRs (bits in
5490 	 * the control fields which may be 1) should be initialized by the
5491 	 * intersection of the underlying hardware's MSR (i.e., features which
5492 	 * can be supported) and the list of features we want to expose -
5493 	 * because they are known to be properly supported in our code.
5494 	 * Also, usually, the low half of the MSRs (bits which must be 1) can
5495 	 * be set to 0, meaning that L1 may turn off any of these bits. The
5496 	 * reason is that if one of these bits is necessary, it will appear
5497 	 * in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
5498 	 * fields of vmcs01 and vmcs02, will turn these bits off - and
5499 	 * nested_vmx_exit_reflected() will not pass related exits to L1.
5500 	 * These rules have exceptions below.
5501 	 */
5502 
5503 	/* pin-based controls */
5504 	rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
5505 		msrs->pinbased_ctls_low,
5506 		msrs->pinbased_ctls_high);
5507 	msrs->pinbased_ctls_low |=
5508 		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
5509 	msrs->pinbased_ctls_high &=
5510 		PIN_BASED_EXT_INTR_MASK |
5511 		PIN_BASED_NMI_EXITING |
5512 		PIN_BASED_VIRTUAL_NMIS |
5513 		(apicv ? PIN_BASED_POSTED_INTR : 0);
5514 	msrs->pinbased_ctls_high |=
5515 		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
5516 		PIN_BASED_VMX_PREEMPTION_TIMER;
5517 
5518 	/* exit controls */
5519 	rdmsr(MSR_IA32_VMX_EXIT_CTLS,
5520 		msrs->exit_ctls_low,
5521 		msrs->exit_ctls_high);
5522 	msrs->exit_ctls_low =
5523 		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
5524 
5525 	msrs->exit_ctls_high &=
5526 #ifdef CONFIG_X86_64
5527 		VM_EXIT_HOST_ADDR_SPACE_SIZE |
5528 #endif
5529 		VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
5530 	msrs->exit_ctls_high |=
5531 		VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
5532 		VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
5533 		VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
5534 
5535 	/* We support free control of debug control saving. */
5536 	msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
5537 
5538 	/* entry controls */
5539 	rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
5540 		msrs->entry_ctls_low,
5541 		msrs->entry_ctls_high);
5542 	msrs->entry_ctls_low =
5543 		VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
5544 	msrs->entry_ctls_high &=
5545 #ifdef CONFIG_X86_64
5546 		VM_ENTRY_IA32E_MODE |
5547 #endif
5548 		VM_ENTRY_LOAD_IA32_PAT;
5549 	msrs->entry_ctls_high |=
5550 		(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
5551 
5552 	/* We support free control of debug control loading. */
5553 	msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
5554 
5555 	/* cpu-based controls */
5556 	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
5557 		msrs->procbased_ctls_low,
5558 		msrs->procbased_ctls_high);
5559 	msrs->procbased_ctls_low =
5560 		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
5561 	msrs->procbased_ctls_high &=
5562 		CPU_BASED_VIRTUAL_INTR_PENDING |
5563 		CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
5564 		CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
5565 		CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
5566 		CPU_BASED_CR3_STORE_EXITING |
5567 #ifdef CONFIG_X86_64
5568 		CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING |
5569 #endif
5570 		CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING |
5571 		CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG |
5572 		CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING |
5573 		CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING |
5574 		CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
5575 	/*
5576 	 * We can allow some features even when not supported by the
5577 	 * hardware. For example, L1 can specify an MSR bitmap - and we
5578 	 * can use it to avoid exits to L1 - even when L0 runs L2
5579 	 * without MSR bitmaps.
5580 	 */
5581 	msrs->procbased_ctls_high |=
5582 		CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
5583 		CPU_BASED_USE_MSR_BITMAPS;
5584 
5585 	/* We support free control of CR3 access interception. */
5586 	msrs->procbased_ctls_low &=
5587 		~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
5588 
5589 	/*
5590 	 * secondary cpu-based controls.  Do not include those that
5591 	 * depend on CPUID bits, they are added later by vmx_cpuid_update.
5592 	 */
5593 	if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
5594 		rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
5595 		      msrs->secondary_ctls_low,
5596 		      msrs->secondary_ctls_high);
5597 
5598 	msrs->secondary_ctls_low = 0;
5599 	msrs->secondary_ctls_high &=
5600 		SECONDARY_EXEC_DESC |
5601 		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
5602 		SECONDARY_EXEC_APIC_REGISTER_VIRT |
5603 		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
5604 		SECONDARY_EXEC_WBINVD_EXITING;
5605 
5606 	/*
5607 	 * We can emulate "VMCS shadowing," even if the hardware
5608 	 * doesn't support it.
5609 	 */
5610 	msrs->secondary_ctls_high |=
5611 		SECONDARY_EXEC_SHADOW_VMCS;
5612 
5613 	if (enable_ept) {
5614 		/* nested EPT: emulate EPT also to L1 */
5615 		msrs->secondary_ctls_high |=
5616 			SECONDARY_EXEC_ENABLE_EPT;
5617 		msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
5618 			 VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
5619 		if (cpu_has_vmx_ept_execute_only())
5620 			msrs->ept_caps |=
5621 				VMX_EPT_EXECUTE_ONLY_BIT;
5622 		msrs->ept_caps &= ept_caps;
5623 		msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
5624 			VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
5625 			VMX_EPT_1GB_PAGE_BIT;
5626 		if (enable_ept_ad_bits) {
5627 			msrs->secondary_ctls_high |=
5628 				SECONDARY_EXEC_ENABLE_PML;
5629 			msrs->ept_caps |= VMX_EPT_AD_BIT;
5630 		}
5631 	}
5632 
5633 	if (cpu_has_vmx_vmfunc()) {
5634 		msrs->secondary_ctls_high |=
5635 			SECONDARY_EXEC_ENABLE_VMFUNC;
5636 		/*
5637 		 * Advertise EPTP switching unconditionally
5638 		 * since we emulate it
5639 		 */
5640 		if (enable_ept)
5641 			msrs->vmfunc_controls =
5642 				VMX_VMFUNC_EPTP_SWITCHING;
5643 	}
5644 
5645 	/*
5646 	 * Old versions of KVM use the single-context version without
5647 	 * checking for support, so declare that it is supported even
5648 	 * though it is treated as global context.  The alternative is
5649 	 * not failing the single-context invvpid, and it is worse.
5650 	 */
5651 	if (enable_vpid) {
5652 		msrs->secondary_ctls_high |=
5653 			SECONDARY_EXEC_ENABLE_VPID;
5654 		msrs->vpid_caps = VMX_VPID_INVVPID_BIT |
5655 			VMX_VPID_EXTENT_SUPPORTED_MASK;
5656 	}
5657 
5658 	if (enable_unrestricted_guest)
5659 		msrs->secondary_ctls_high |=
5660 			SECONDARY_EXEC_UNRESTRICTED_GUEST;
5661 
5662 	if (flexpriority_enabled)
5663 		msrs->secondary_ctls_high |=
5664 			SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
5665 
5666 	/* miscellaneous data */
5667 	rdmsr(MSR_IA32_VMX_MISC,
5668 		msrs->misc_low,
5669 		msrs->misc_high);
5670 	msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
5671 	msrs->misc_low |=
5672 		MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
5673 		VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
5674 		VMX_MISC_ACTIVITY_HLT;
5675 	msrs->misc_high = 0;
5676 
5677 	/*
5678 	 * This MSR reports some information about VMX support. We
5679 	 * should return information about the VMX we emulate for the
5680 	 * guest, and the VMCS structure we give it - not about the
5681 	 * VMX support of the underlying hardware.
5682 	 */
5683 	msrs->basic =
5684 		VMCS12_REVISION |
5685 		VMX_BASIC_TRUE_CTLS |
5686 		((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
5687 		(VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
5688 
5689 	if (cpu_has_vmx_basic_inout())
5690 		msrs->basic |= VMX_BASIC_INOUT;
5691 
5692 	/*
5693 	 * These MSRs specify bits which the guest must keep fixed on
5694 	 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
5695 	 * We picked the standard core2 setting.
5696 	 */
5697 #define VMXON_CR0_ALWAYSON     (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
5698 #define VMXON_CR4_ALWAYSON     X86_CR4_VMXE
5699 	msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
5700 	msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
5701 
5702 	/* These MSRs specify bits which the guest must keep fixed off. */
5703 	rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
5704 	rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
5705 
5706 	/* highest index: VMX_PREEMPTION_TIMER_VALUE */
5707 	msrs->vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1;
5708 }
5709 
5710 void nested_vmx_hardware_unsetup(void)
5711 {
5712 	int i;
5713 
5714 	if (enable_shadow_vmcs) {
5715 		for (i = 0; i < VMX_BITMAP_NR; i++)
5716 			free_page((unsigned long)vmx_bitmap[i]);
5717 	}
5718 }
5719 
5720 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
5721 {
5722 	int i;
5723 
5724 	/*
5725 	 * Without EPT it is not possible to restore L1's CR3 and PDPTR on
5726 	 * VMfail, because they are not available in vmcs01.  Just always
5727 	 * use hardware checks.
5728 	 */
5729 	if (!enable_ept)
5730 		nested_early_check = 1;
5731 
5732 	if (!cpu_has_vmx_shadow_vmcs())
5733 		enable_shadow_vmcs = 0;
5734 	if (enable_shadow_vmcs) {
5735 		for (i = 0; i < VMX_BITMAP_NR; i++) {
5736 			/*
5737 			 * The vmx_bitmap is not tied to a VM and so should
5738 			 * not be charged to a memcg.
5739 			 */
5740 			vmx_bitmap[i] = (unsigned long *)
5741 				__get_free_page(GFP_KERNEL);
5742 			if (!vmx_bitmap[i]) {
5743 				nested_vmx_hardware_unsetup();
5744 				return -ENOMEM;
5745 			}
5746 		}
5747 
5748 		init_vmcs_shadow_fields();
5749 	}
5750 
5751 	exit_handlers[EXIT_REASON_VMCLEAR]	= handle_vmclear,
5752 	exit_handlers[EXIT_REASON_VMLAUNCH]	= handle_vmlaunch,
5753 	exit_handlers[EXIT_REASON_VMPTRLD]	= handle_vmptrld,
5754 	exit_handlers[EXIT_REASON_VMPTRST]	= handle_vmptrst,
5755 	exit_handlers[EXIT_REASON_VMREAD]	= handle_vmread,
5756 	exit_handlers[EXIT_REASON_VMRESUME]	= handle_vmresume,
5757 	exit_handlers[EXIT_REASON_VMWRITE]	= handle_vmwrite,
5758 	exit_handlers[EXIT_REASON_VMOFF]	= handle_vmoff,
5759 	exit_handlers[EXIT_REASON_VMON]		= handle_vmon,
5760 	exit_handlers[EXIT_REASON_INVEPT]	= handle_invept,
5761 	exit_handlers[EXIT_REASON_INVVPID]	= handle_invvpid,
5762 	exit_handlers[EXIT_REASON_VMFUNC]	= handle_vmfunc,
5763 
5764 	kvm_x86_ops->check_nested_events = vmx_check_nested_events;
5765 	kvm_x86_ops->get_nested_state = vmx_get_nested_state;
5766 	kvm_x86_ops->set_nested_state = vmx_set_nested_state;
5767 	kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages,
5768 	kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs;
5769 	kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version;
5770 
5771 	return 0;
5772 }
5773