xref: /openbmc/linux/arch/x86/kvm/svm/nested.c (revision 7cc39531)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Kernel-based Virtual Machine driver for Linux
4  *
5  * AMD SVM support
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9  *
10  * Authors:
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *   Avi Kivity   <avi@qumranet.com>
13  */
14 
15 #define pr_fmt(fmt) "SVM: " fmt
16 
17 #include <linux/kvm_types.h>
18 #include <linux/kvm_host.h>
19 #include <linux/kernel.h>
20 
21 #include <asm/msr-index.h>
22 #include <asm/debugreg.h>
23 
24 #include "kvm_emulate.h"
25 #include "trace.h"
26 #include "mmu.h"
27 #include "x86.h"
28 #include "cpuid.h"
29 #include "lapic.h"
30 #include "svm.h"
31 #include "hyperv.h"
32 
33 #define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
34 
35 static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
36 				       struct x86_exception *fault)
37 {
38 	struct vcpu_svm *svm = to_svm(vcpu);
39 	struct vmcb *vmcb = svm->vmcb;
40 
41 	if (vmcb->control.exit_code != SVM_EXIT_NPF) {
42 		/*
43 		 * TODO: track the cause of the nested page fault, and
44 		 * correctly fill in the high bits of exit_info_1.
45 		 */
46 		vmcb->control.exit_code = SVM_EXIT_NPF;
47 		vmcb->control.exit_code_hi = 0;
48 		vmcb->control.exit_info_1 = (1ULL << 32);
49 		vmcb->control.exit_info_2 = fault->address;
50 	}
51 
52 	vmcb->control.exit_info_1 &= ~0xffffffffULL;
53 	vmcb->control.exit_info_1 |= fault->error_code;
54 
55 	nested_svm_vmexit(svm);
56 }
57 
58 static bool nested_svm_handle_page_fault_workaround(struct kvm_vcpu *vcpu,
59 						    struct x86_exception *fault)
60 {
61 	struct vcpu_svm *svm = to_svm(vcpu);
62 	struct vmcb *vmcb = svm->vmcb;
63 
64  	WARN_ON(!is_guest_mode(vcpu));
65 
66 	if (vmcb12_is_intercept(&svm->nested.ctl,
67 				INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) &&
68 	    !WARN_ON_ONCE(svm->nested.nested_run_pending)) {
69 	     	vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR;
70 		vmcb->control.exit_code_hi = 0;
71 		vmcb->control.exit_info_1 = fault->error_code;
72 		vmcb->control.exit_info_2 = fault->address;
73 		nested_svm_vmexit(svm);
74 		return true;
75 	}
76 
77 	return false;
78 }
79 
80 static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
81 {
82 	struct vcpu_svm *svm = to_svm(vcpu);
83 	u64 cr3 = svm->nested.ctl.nested_cr3;
84 	u64 pdpte;
85 	int ret;
86 
87 	ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
88 				       offset_in_page(cr3) + index * 8, 8);
89 	if (ret)
90 		return 0;
91 	return pdpte;
92 }
93 
94 static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
95 {
96 	struct vcpu_svm *svm = to_svm(vcpu);
97 
98 	return svm->nested.ctl.nested_cr3;
99 }
100 
101 static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
102 {
103 	struct vcpu_svm *svm = to_svm(vcpu);
104 
105 	WARN_ON(mmu_is_nested(vcpu));
106 
107 	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
108 
109 	/*
110 	 * The NPT format depends on L1's CR4 and EFER, which is in vmcb01.  Note,
111 	 * when called via KVM_SET_NESTED_STATE, that state may _not_ match current
112 	 * vCPU state.  CR0.WP is explicitly ignored, while CR0.PG is required.
113 	 */
114 	kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, svm->vmcb01.ptr->save.cr4,
115 				svm->vmcb01.ptr->save.efer,
116 				svm->nested.ctl.nested_cr3);
117 	vcpu->arch.mmu->get_guest_pgd     = nested_svm_get_tdp_cr3;
118 	vcpu->arch.mmu->get_pdptr         = nested_svm_get_tdp_pdptr;
119 	vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
120 	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
121 }
122 
123 static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
124 {
125 	vcpu->arch.mmu = &vcpu->arch.root_mmu;
126 	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
127 }
128 
129 static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
130 {
131 	if (!svm->v_vmload_vmsave_enabled)
132 		return true;
133 
134 	if (!nested_npt_enabled(svm))
135 		return true;
136 
137 	if (!(svm->nested.ctl.virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK))
138 		return true;
139 
140 	return false;
141 }
142 
143 void recalc_intercepts(struct vcpu_svm *svm)
144 {
145 	struct vmcb_control_area *c, *h;
146 	struct vmcb_ctrl_area_cached *g;
147 	unsigned int i;
148 
149 	vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
150 
151 	if (!is_guest_mode(&svm->vcpu))
152 		return;
153 
154 	c = &svm->vmcb->control;
155 	h = &svm->vmcb01.ptr->control;
156 	g = &svm->nested.ctl;
157 
158 	for (i = 0; i < MAX_INTERCEPT; i++)
159 		c->intercepts[i] = h->intercepts[i];
160 
161 	if (g->int_ctl & V_INTR_MASKING_MASK) {
162 		/* We only want the cr8 intercept bits of L1 */
163 		vmcb_clr_intercept(c, INTERCEPT_CR8_READ);
164 		vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE);
165 
166 		/*
167 		 * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not
168 		 * affect any interrupt we may want to inject; therefore,
169 		 * interrupt window vmexits are irrelevant to L0.
170 		 */
171 		vmcb_clr_intercept(c, INTERCEPT_VINTR);
172 	}
173 
174 	/* We don't want to see VMMCALLs from a nested guest */
175 	vmcb_clr_intercept(c, INTERCEPT_VMMCALL);
176 
177 	for (i = 0; i < MAX_INTERCEPT; i++)
178 		c->intercepts[i] |= g->intercepts[i];
179 
180 	/* If SMI is not intercepted, ignore guest SMI intercept as well  */
181 	if (!intercept_smi)
182 		vmcb_clr_intercept(c, INTERCEPT_SMI);
183 
184 	if (nested_vmcb_needs_vls_intercept(svm)) {
185 		/*
186 		 * If the virtual VMLOAD/VMSAVE is not enabled for the L2,
187 		 * we must intercept these instructions to correctly
188 		 * emulate them in case L1 doesn't intercept them.
189 		 */
190 		vmcb_set_intercept(c, INTERCEPT_VMLOAD);
191 		vmcb_set_intercept(c, INTERCEPT_VMSAVE);
192 	} else {
193 		WARN_ON(!(c->virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK));
194 	}
195 }
196 
197 /*
198  * Merge L0's (KVM) and L1's (Nested VMCB) MSR permission bitmaps. The function
199  * is optimized in that it only merges the parts where KVM MSR permission bitmap
200  * may contain zero bits.
201  */
202 static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
203 {
204 	struct hv_enlightenments *hve =
205 		(struct hv_enlightenments *)svm->nested.ctl.reserved_sw;
206 	int i;
207 
208 	/*
209 	 * MSR bitmap update can be skipped when:
210 	 * - MSR bitmap for L1 hasn't changed.
211 	 * - Nested hypervisor (L1) is attempting to launch the same L2 as
212 	 *   before.
213 	 * - Nested hypervisor (L1) is using Hyper-V emulation interface and
214 	 * tells KVM (L0) there were no changes in MSR bitmap for L2.
215 	 */
216 	if (!svm->nested.force_msr_bitmap_recalc &&
217 	    kvm_hv_hypercall_enabled(&svm->vcpu) &&
218 	    hve->hv_enlightenments_control.msr_bitmap &&
219 	    (svm->nested.ctl.clean & BIT(VMCB_HV_NESTED_ENLIGHTENMENTS)))
220 		goto set_msrpm_base_pa;
221 
222 	if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
223 		return true;
224 
225 	for (i = 0; i < MSRPM_OFFSETS; i++) {
226 		u32 value, p;
227 		u64 offset;
228 
229 		if (msrpm_offsets[i] == 0xffffffff)
230 			break;
231 
232 		p      = msrpm_offsets[i];
233 
234 		/* x2apic msrs are intercepted always for the nested guest */
235 		if (is_x2apic_msrpm_offset(p))
236 			continue;
237 
238 		offset = svm->nested.ctl.msrpm_base_pa + (p * 4);
239 
240 		if (kvm_vcpu_read_guest(&svm->vcpu, offset, &value, 4))
241 			return false;
242 
243 		svm->nested.msrpm[p] = svm->msrpm[p] | value;
244 	}
245 
246 	svm->nested.force_msr_bitmap_recalc = false;
247 
248 set_msrpm_base_pa:
249 	svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
250 
251 	return true;
252 }
253 
254 /*
255  * Bits 11:0 of bitmap address are ignored by hardware
256  */
257 static bool nested_svm_check_bitmap_pa(struct kvm_vcpu *vcpu, u64 pa, u32 size)
258 {
259 	u64 addr = PAGE_ALIGN(pa);
260 
261 	return kvm_vcpu_is_legal_gpa(vcpu, addr) &&
262 	    kvm_vcpu_is_legal_gpa(vcpu, addr + size - 1);
263 }
264 
265 static bool nested_svm_check_tlb_ctl(struct kvm_vcpu *vcpu, u8 tlb_ctl)
266 {
267 	/* Nested FLUSHBYASID is not supported yet.  */
268 	switch(tlb_ctl) {
269 		case TLB_CONTROL_DO_NOTHING:
270 		case TLB_CONTROL_FLUSH_ALL_ASID:
271 			return true;
272 		default:
273 			return false;
274 	}
275 }
276 
277 static bool __nested_vmcb_check_controls(struct kvm_vcpu *vcpu,
278 					 struct vmcb_ctrl_area_cached *control)
279 {
280 	if (CC(!vmcb12_is_intercept(control, INTERCEPT_VMRUN)))
281 		return false;
282 
283 	if (CC(control->asid == 0))
284 		return false;
285 
286 	if (CC((control->nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && !npt_enabled))
287 		return false;
288 
289 	if (CC(!nested_svm_check_bitmap_pa(vcpu, control->msrpm_base_pa,
290 					   MSRPM_SIZE)))
291 		return false;
292 	if (CC(!nested_svm_check_bitmap_pa(vcpu, control->iopm_base_pa,
293 					   IOPM_SIZE)))
294 		return false;
295 
296 	if (CC(!nested_svm_check_tlb_ctl(vcpu, control->tlb_ctl)))
297 		return false;
298 
299 	return true;
300 }
301 
302 /* Common checks that apply to both L1 and L2 state.  */
303 static bool __nested_vmcb_check_save(struct kvm_vcpu *vcpu,
304 				     struct vmcb_save_area_cached *save)
305 {
306 	if (CC(!(save->efer & EFER_SVME)))
307 		return false;
308 
309 	if (CC((save->cr0 & X86_CR0_CD) == 0 && (save->cr0 & X86_CR0_NW)) ||
310 	    CC(save->cr0 & ~0xffffffffULL))
311 		return false;
312 
313 	if (CC(!kvm_dr6_valid(save->dr6)) || CC(!kvm_dr7_valid(save->dr7)))
314 		return false;
315 
316 	/*
317 	 * These checks are also performed by KVM_SET_SREGS,
318 	 * except that EFER.LMA is not checked by SVM against
319 	 * CR0.PG && EFER.LME.
320 	 */
321 	if ((save->efer & EFER_LME) && (save->cr0 & X86_CR0_PG)) {
322 		if (CC(!(save->cr4 & X86_CR4_PAE)) ||
323 		    CC(!(save->cr0 & X86_CR0_PE)) ||
324 		    CC(kvm_vcpu_is_illegal_gpa(vcpu, save->cr3)))
325 			return false;
326 	}
327 
328 	/* Note, SVM doesn't have any additional restrictions on CR4. */
329 	if (CC(!__kvm_is_valid_cr4(vcpu, save->cr4)))
330 		return false;
331 
332 	if (CC(!kvm_valid_efer(vcpu, save->efer)))
333 		return false;
334 
335 	return true;
336 }
337 
338 static bool nested_vmcb_check_save(struct kvm_vcpu *vcpu)
339 {
340 	struct vcpu_svm *svm = to_svm(vcpu);
341 	struct vmcb_save_area_cached *save = &svm->nested.save;
342 
343 	return __nested_vmcb_check_save(vcpu, save);
344 }
345 
346 static bool nested_vmcb_check_controls(struct kvm_vcpu *vcpu)
347 {
348 	struct vcpu_svm *svm = to_svm(vcpu);
349 	struct vmcb_ctrl_area_cached *ctl = &svm->nested.ctl;
350 
351 	return __nested_vmcb_check_controls(vcpu, ctl);
352 }
353 
354 static
355 void __nested_copy_vmcb_control_to_cache(struct kvm_vcpu *vcpu,
356 					 struct vmcb_ctrl_area_cached *to,
357 					 struct vmcb_control_area *from)
358 {
359 	unsigned int i;
360 
361 	for (i = 0; i < MAX_INTERCEPT; i++)
362 		to->intercepts[i] = from->intercepts[i];
363 
364 	to->iopm_base_pa        = from->iopm_base_pa;
365 	to->msrpm_base_pa       = from->msrpm_base_pa;
366 	to->tsc_offset          = from->tsc_offset;
367 	to->tlb_ctl             = from->tlb_ctl;
368 	to->int_ctl             = from->int_ctl;
369 	to->int_vector          = from->int_vector;
370 	to->int_state           = from->int_state;
371 	to->exit_code           = from->exit_code;
372 	to->exit_code_hi        = from->exit_code_hi;
373 	to->exit_info_1         = from->exit_info_1;
374 	to->exit_info_2         = from->exit_info_2;
375 	to->exit_int_info       = from->exit_int_info;
376 	to->exit_int_info_err   = from->exit_int_info_err;
377 	to->nested_ctl          = from->nested_ctl;
378 	to->event_inj           = from->event_inj;
379 	to->event_inj_err       = from->event_inj_err;
380 	to->next_rip            = from->next_rip;
381 	to->nested_cr3          = from->nested_cr3;
382 	to->virt_ext            = from->virt_ext;
383 	to->pause_filter_count  = from->pause_filter_count;
384 	to->pause_filter_thresh = from->pause_filter_thresh;
385 
386 	/* Copy asid here because nested_vmcb_check_controls will check it.  */
387 	to->asid           = from->asid;
388 	to->msrpm_base_pa &= ~0x0fffULL;
389 	to->iopm_base_pa  &= ~0x0fffULL;
390 
391 	/* Hyper-V extensions (Enlightened VMCB) */
392 	if (kvm_hv_hypercall_enabled(vcpu)) {
393 		to->clean = from->clean;
394 		memcpy(to->reserved_sw, from->reserved_sw,
395 		       sizeof(struct hv_enlightenments));
396 	}
397 }
398 
399 void nested_copy_vmcb_control_to_cache(struct vcpu_svm *svm,
400 				       struct vmcb_control_area *control)
401 {
402 	__nested_copy_vmcb_control_to_cache(&svm->vcpu, &svm->nested.ctl, control);
403 }
404 
405 static void __nested_copy_vmcb_save_to_cache(struct vmcb_save_area_cached *to,
406 					     struct vmcb_save_area *from)
407 {
408 	/*
409 	 * Copy only fields that are validated, as we need them
410 	 * to avoid TOC/TOU races.
411 	 */
412 	to->efer = from->efer;
413 	to->cr0 = from->cr0;
414 	to->cr3 = from->cr3;
415 	to->cr4 = from->cr4;
416 
417 	to->dr6 = from->dr6;
418 	to->dr7 = from->dr7;
419 }
420 
421 void nested_copy_vmcb_save_to_cache(struct vcpu_svm *svm,
422 				    struct vmcb_save_area *save)
423 {
424 	__nested_copy_vmcb_save_to_cache(&svm->nested.save, save);
425 }
426 
427 /*
428  * Synchronize fields that are written by the processor, so that
429  * they can be copied back into the vmcb12.
430  */
431 void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
432 {
433 	u32 mask;
434 	svm->nested.ctl.event_inj      = svm->vmcb->control.event_inj;
435 	svm->nested.ctl.event_inj_err  = svm->vmcb->control.event_inj_err;
436 
437 	/* Only a few fields of int_ctl are written by the processor.  */
438 	mask = V_IRQ_MASK | V_TPR_MASK;
439 	if (!(svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) &&
440 	    svm_is_intercept(svm, INTERCEPT_VINTR)) {
441 		/*
442 		 * In order to request an interrupt window, L0 is usurping
443 		 * svm->vmcb->control.int_ctl and possibly setting V_IRQ
444 		 * even if it was clear in L1's VMCB.  Restoring it would be
445 		 * wrong.  However, in this case V_IRQ will remain true until
446 		 * interrupt_window_interception calls svm_clear_vintr and
447 		 * restores int_ctl.  We can just leave it aside.
448 		 */
449 		mask &= ~V_IRQ_MASK;
450 	}
451 
452 	if (nested_vgif_enabled(svm))
453 		mask |= V_GIF_MASK;
454 
455 	svm->nested.ctl.int_ctl        &= ~mask;
456 	svm->nested.ctl.int_ctl        |= svm->vmcb->control.int_ctl & mask;
457 }
458 
459 /*
460  * Transfer any event that L0 or L1 wanted to inject into L2 to
461  * EXIT_INT_INFO.
462  */
463 static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
464 						struct vmcb *vmcb12)
465 {
466 	struct kvm_vcpu *vcpu = &svm->vcpu;
467 	u32 exit_int_info = 0;
468 	unsigned int nr;
469 
470 	if (vcpu->arch.exception.injected) {
471 		nr = vcpu->arch.exception.nr;
472 		exit_int_info = nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT;
473 
474 		if (vcpu->arch.exception.has_error_code) {
475 			exit_int_info |= SVM_EVTINJ_VALID_ERR;
476 			vmcb12->control.exit_int_info_err =
477 				vcpu->arch.exception.error_code;
478 		}
479 
480 	} else if (vcpu->arch.nmi_injected) {
481 		exit_int_info = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
482 
483 	} else if (vcpu->arch.interrupt.injected) {
484 		nr = vcpu->arch.interrupt.nr;
485 		exit_int_info = nr | SVM_EVTINJ_VALID;
486 
487 		if (vcpu->arch.interrupt.soft)
488 			exit_int_info |= SVM_EVTINJ_TYPE_SOFT;
489 		else
490 			exit_int_info |= SVM_EVTINJ_TYPE_INTR;
491 	}
492 
493 	vmcb12->control.exit_int_info = exit_int_info;
494 }
495 
496 static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu)
497 {
498 	/*
499 	 * TODO: optimize unconditional TLB flush/MMU sync.  A partial list of
500 	 * things to fix before this can be conditional:
501 	 *
502 	 *  - Flush TLBs for both L1 and L2 remote TLB flush
503 	 *  - Honor L1's request to flush an ASID on nested VMRUN
504 	 *  - Sync nested NPT MMU on VMRUN that flushes L2's ASID[*]
505 	 *  - Don't crush a pending TLB flush in vmcb02 on nested VMRUN
506 	 *  - Flush L1's ASID on KVM_REQ_TLB_FLUSH_GUEST
507 	 *
508 	 * [*] Unlike nested EPT, SVM's ASID management can invalidate nested
509 	 *     NPT guest-physical mappings on VMRUN.
510 	 */
511 	kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
512 	kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
513 }
514 
515 /*
516  * Load guest's/host's cr3 on nested vmentry or vmexit. @nested_npt is true
517  * if we are emulating VM-Entry into a guest with NPT enabled.
518  */
519 static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
520 			       bool nested_npt, bool reload_pdptrs)
521 {
522 	if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3)))
523 		return -EINVAL;
524 
525 	if (reload_pdptrs && !nested_npt && is_pae_paging(vcpu) &&
526 	    CC(!load_pdptrs(vcpu, cr3)))
527 		return -EINVAL;
528 
529 	vcpu->arch.cr3 = cr3;
530 
531 	/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
532 	kvm_init_mmu(vcpu);
533 
534 	if (!nested_npt)
535 		kvm_mmu_new_pgd(vcpu, cr3);
536 
537 	return 0;
538 }
539 
540 void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
541 {
542 	if (!svm->nested.vmcb02.ptr)
543 		return;
544 
545 	/* FIXME: merge g_pat from vmcb01 and vmcb12.  */
546 	svm->nested.vmcb02.ptr->save.g_pat = svm->vmcb01.ptr->save.g_pat;
547 }
548 
549 static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
550 {
551 	bool new_vmcb12 = false;
552 	struct vmcb *vmcb01 = svm->vmcb01.ptr;
553 	struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
554 
555 	nested_vmcb02_compute_g_pat(svm);
556 
557 	/* Load the nested guest state */
558 	if (svm->nested.vmcb12_gpa != svm->nested.last_vmcb12_gpa) {
559 		new_vmcb12 = true;
560 		svm->nested.last_vmcb12_gpa = svm->nested.vmcb12_gpa;
561 		svm->nested.force_msr_bitmap_recalc = true;
562 	}
563 
564 	if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
565 		vmcb02->save.es = vmcb12->save.es;
566 		vmcb02->save.cs = vmcb12->save.cs;
567 		vmcb02->save.ss = vmcb12->save.ss;
568 		vmcb02->save.ds = vmcb12->save.ds;
569 		vmcb02->save.cpl = vmcb12->save.cpl;
570 		vmcb_mark_dirty(vmcb02, VMCB_SEG);
571 	}
572 
573 	if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) {
574 		vmcb02->save.gdtr = vmcb12->save.gdtr;
575 		vmcb02->save.idtr = vmcb12->save.idtr;
576 		vmcb_mark_dirty(vmcb02, VMCB_DT);
577 	}
578 
579 	kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
580 
581 	svm_set_efer(&svm->vcpu, svm->nested.save.efer);
582 
583 	svm_set_cr0(&svm->vcpu, svm->nested.save.cr0);
584 	svm_set_cr4(&svm->vcpu, svm->nested.save.cr4);
585 
586 	svm->vcpu.arch.cr2 = vmcb12->save.cr2;
587 
588 	kvm_rax_write(&svm->vcpu, vmcb12->save.rax);
589 	kvm_rsp_write(&svm->vcpu, vmcb12->save.rsp);
590 	kvm_rip_write(&svm->vcpu, vmcb12->save.rip);
591 
592 	/* In case we don't even reach vcpu_run, the fields are not updated */
593 	vmcb02->save.rax = vmcb12->save.rax;
594 	vmcb02->save.rsp = vmcb12->save.rsp;
595 	vmcb02->save.rip = vmcb12->save.rip;
596 
597 	/* These bits will be set properly on the first execution when new_vmc12 is true */
598 	if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
599 		vmcb02->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1;
600 		svm->vcpu.arch.dr6  = svm->nested.save.dr6 | DR6_ACTIVE_LOW;
601 		vmcb_mark_dirty(vmcb02, VMCB_DR);
602 	}
603 
604 	if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
605 		/*
606 		 * Reserved bits of DEBUGCTL are ignored.  Be consistent with
607 		 * svm_set_msr's definition of reserved bits.
608 		 */
609 		svm_copy_lbrs(vmcb02, vmcb12);
610 		vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS;
611 		svm_update_lbrv(&svm->vcpu);
612 
613 	} else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
614 		svm_copy_lbrs(vmcb02, vmcb01);
615 	}
616 }
617 
618 static inline bool is_evtinj_soft(u32 evtinj)
619 {
620 	u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;
621 	u8 vector = evtinj & SVM_EVTINJ_VEC_MASK;
622 
623 	if (!(evtinj & SVM_EVTINJ_VALID))
624 		return false;
625 
626 	if (type == SVM_EVTINJ_TYPE_SOFT)
627 		return true;
628 
629 	return type == SVM_EVTINJ_TYPE_EXEPT && kvm_exception_is_soft(vector);
630 }
631 
632 static bool is_evtinj_nmi(u32 evtinj)
633 {
634 	u32 type = evtinj & SVM_EVTINJ_TYPE_MASK;
635 
636 	if (!(evtinj & SVM_EVTINJ_VALID))
637 		return false;
638 
639 	return type == SVM_EVTINJ_TYPE_NMI;
640 }
641 
642 static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
643 					  unsigned long vmcb12_rip,
644 					  unsigned long vmcb12_csbase)
645 {
646 	u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK;
647 	u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;
648 
649 	struct kvm_vcpu *vcpu = &svm->vcpu;
650 	struct vmcb *vmcb01 = svm->vmcb01.ptr;
651 	struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
652 	u32 pause_count12;
653 	u32 pause_thresh12;
654 
655 	/*
656 	 * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
657 	 * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
658 	 */
659 
660 	if (svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK))
661 		int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
662 	else
663 		int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
664 
665 	/* Copied from vmcb01.  msrpm_base can be overwritten later.  */
666 	vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
667 	vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
668 	vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
669 
670 	/* Done at vmrun: asid.  */
671 
672 	/* Also overwritten later if necessary.  */
673 	vmcb02->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
674 
675 	/* nested_cr3.  */
676 	if (nested_npt_enabled(svm))
677 		nested_svm_init_mmu_context(vcpu);
678 
679 	vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset(
680 			vcpu->arch.l1_tsc_offset,
681 			svm->nested.ctl.tsc_offset,
682 			svm->tsc_ratio_msr);
683 
684 	vmcb02->control.tsc_offset = vcpu->arch.tsc_offset;
685 
686 	if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) {
687 		WARN_ON(!svm->tsc_scaling_enabled);
688 		nested_svm_update_tsc_ratio_msr(vcpu);
689 	}
690 
691 	vmcb02->control.int_ctl             =
692 		(svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) |
693 		(vmcb01->control.int_ctl & int_ctl_vmcb01_bits);
694 
695 	vmcb02->control.int_vector          = svm->nested.ctl.int_vector;
696 	vmcb02->control.int_state           = svm->nested.ctl.int_state;
697 	vmcb02->control.event_inj           = svm->nested.ctl.event_inj;
698 	vmcb02->control.event_inj_err       = svm->nested.ctl.event_inj_err;
699 
700 	/*
701 	 * next_rip is consumed on VMRUN as the return address pushed on the
702 	 * stack for injected soft exceptions/interrupts.  If nrips is exposed
703 	 * to L1, take it verbatim from vmcb12.  If nrips is supported in
704 	 * hardware but not exposed to L1, stuff the actual L2 RIP to emulate
705 	 * what a nrips=0 CPU would do (L1 is responsible for advancing RIP
706 	 * prior to injecting the event).
707 	 */
708 	if (svm->nrips_enabled)
709 		vmcb02->control.next_rip    = svm->nested.ctl.next_rip;
710 	else if (boot_cpu_has(X86_FEATURE_NRIPS))
711 		vmcb02->control.next_rip    = vmcb12_rip;
712 
713 	svm->nmi_l1_to_l2 = is_evtinj_nmi(vmcb02->control.event_inj);
714 	if (is_evtinj_soft(vmcb02->control.event_inj)) {
715 		svm->soft_int_injected = true;
716 		svm->soft_int_csbase = vmcb12_csbase;
717 		svm->soft_int_old_rip = vmcb12_rip;
718 		if (svm->nrips_enabled)
719 			svm->soft_int_next_rip = svm->nested.ctl.next_rip;
720 		else
721 			svm->soft_int_next_rip = vmcb12_rip;
722 	}
723 
724 	vmcb02->control.virt_ext            = vmcb01->control.virt_ext &
725 					      LBR_CTL_ENABLE_MASK;
726 	if (svm->lbrv_enabled)
727 		vmcb02->control.virt_ext  |=
728 			(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK);
729 
730 	if (!nested_vmcb_needs_vls_intercept(svm))
731 		vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
732 
733 	pause_count12 = svm->pause_filter_enabled ? svm->nested.ctl.pause_filter_count : 0;
734 	pause_thresh12 = svm->pause_threshold_enabled ? svm->nested.ctl.pause_filter_thresh : 0;
735 	if (kvm_pause_in_guest(svm->vcpu.kvm)) {
736 		/* use guest values since host doesn't intercept PAUSE */
737 		vmcb02->control.pause_filter_count = pause_count12;
738 		vmcb02->control.pause_filter_thresh = pause_thresh12;
739 
740 	} else {
741 		/* start from host values otherwise */
742 		vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count;
743 		vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh;
744 
745 		/* ... but ensure filtering is disabled if so requested.  */
746 		if (vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
747 			if (!pause_count12)
748 				vmcb02->control.pause_filter_count = 0;
749 			if (!pause_thresh12)
750 				vmcb02->control.pause_filter_thresh = 0;
751 		}
752 	}
753 
754 	nested_svm_transition_tlb_flush(vcpu);
755 
756 	/* Enter Guest-Mode */
757 	enter_guest_mode(vcpu);
758 
759 	/*
760 	 * Merge guest and host intercepts - must be called with vcpu in
761 	 * guest-mode to take effect.
762 	 */
763 	recalc_intercepts(svm);
764 }
765 
766 static void nested_svm_copy_common_state(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
767 {
768 	/*
769 	 * Some VMCB state is shared between L1 and L2 and thus has to be
770 	 * moved at the time of nested vmrun and vmexit.
771 	 *
772 	 * VMLOAD/VMSAVE state would also belong in this category, but KVM
773 	 * always performs VMLOAD and VMSAVE from the VMCB01.
774 	 */
775 	to_vmcb->save.spec_ctrl = from_vmcb->save.spec_ctrl;
776 }
777 
778 int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
779 			 struct vmcb *vmcb12, bool from_vmrun)
780 {
781 	struct vcpu_svm *svm = to_svm(vcpu);
782 	int ret;
783 
784 	trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
785 			       vmcb12->save.rip,
786 			       vmcb12->control.int_ctl,
787 			       vmcb12->control.event_inj,
788 			       vmcb12->control.nested_ctl);
789 
790 	trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff,
791 				    vmcb12->control.intercepts[INTERCEPT_CR] >> 16,
792 				    vmcb12->control.intercepts[INTERCEPT_EXCEPTION],
793 				    vmcb12->control.intercepts[INTERCEPT_WORD3],
794 				    vmcb12->control.intercepts[INTERCEPT_WORD4],
795 				    vmcb12->control.intercepts[INTERCEPT_WORD5]);
796 
797 
798 	svm->nested.vmcb12_gpa = vmcb12_gpa;
799 
800 	WARN_ON(svm->vmcb == svm->nested.vmcb02.ptr);
801 
802 	nested_svm_copy_common_state(svm->vmcb01.ptr, svm->nested.vmcb02.ptr);
803 
804 	svm_switch_vmcb(svm, &svm->nested.vmcb02);
805 	nested_vmcb02_prepare_control(svm, vmcb12->save.rip, vmcb12->save.cs.base);
806 	nested_vmcb02_prepare_save(svm, vmcb12);
807 
808 	ret = nested_svm_load_cr3(&svm->vcpu, svm->nested.save.cr3,
809 				  nested_npt_enabled(svm), from_vmrun);
810 	if (ret)
811 		return ret;
812 
813 	if (!from_vmrun)
814 		kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
815 
816 	svm_set_gif(svm, true);
817 
818 	if (kvm_vcpu_apicv_active(vcpu))
819 		kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
820 
821 	return 0;
822 }
823 
824 int nested_svm_vmrun(struct kvm_vcpu *vcpu)
825 {
826 	struct vcpu_svm *svm = to_svm(vcpu);
827 	int ret;
828 	struct vmcb *vmcb12;
829 	struct kvm_host_map map;
830 	u64 vmcb12_gpa;
831 	struct vmcb *vmcb01 = svm->vmcb01.ptr;
832 
833 	if (!svm->nested.hsave_msr) {
834 		kvm_inject_gp(vcpu, 0);
835 		return 1;
836 	}
837 
838 	if (is_smm(vcpu)) {
839 		kvm_queue_exception(vcpu, UD_VECTOR);
840 		return 1;
841 	}
842 
843 	vmcb12_gpa = svm->vmcb->save.rax;
844 	ret = kvm_vcpu_map(vcpu, gpa_to_gfn(vmcb12_gpa), &map);
845 	if (ret == -EINVAL) {
846 		kvm_inject_gp(vcpu, 0);
847 		return 1;
848 	} else if (ret) {
849 		return kvm_skip_emulated_instruction(vcpu);
850 	}
851 
852 	ret = kvm_skip_emulated_instruction(vcpu);
853 
854 	vmcb12 = map.hva;
855 
856 	if (WARN_ON_ONCE(!svm->nested.initialized))
857 		return -EINVAL;
858 
859 	nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
860 	nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
861 
862 	if (!nested_vmcb_check_save(vcpu) ||
863 	    !nested_vmcb_check_controls(vcpu)) {
864 		vmcb12->control.exit_code    = SVM_EXIT_ERR;
865 		vmcb12->control.exit_code_hi = 0;
866 		vmcb12->control.exit_info_1  = 0;
867 		vmcb12->control.exit_info_2  = 0;
868 		goto out;
869 	}
870 
871 	/*
872 	 * Since vmcb01 is not in use, we can use it to store some of the L1
873 	 * state.
874 	 */
875 	vmcb01->save.efer   = vcpu->arch.efer;
876 	vmcb01->save.cr0    = kvm_read_cr0(vcpu);
877 	vmcb01->save.cr4    = vcpu->arch.cr4;
878 	vmcb01->save.rflags = kvm_get_rflags(vcpu);
879 	vmcb01->save.rip    = kvm_rip_read(vcpu);
880 
881 	if (!npt_enabled)
882 		vmcb01->save.cr3 = kvm_read_cr3(vcpu);
883 
884 	svm->nested.nested_run_pending = 1;
885 
886 	if (enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, true))
887 		goto out_exit_err;
888 
889 	if (nested_svm_vmrun_msrpm(svm))
890 		goto out;
891 
892 out_exit_err:
893 	svm->nested.nested_run_pending = 0;
894 	svm->nmi_l1_to_l2 = false;
895 	svm->soft_int_injected = false;
896 
897 	svm->vmcb->control.exit_code    = SVM_EXIT_ERR;
898 	svm->vmcb->control.exit_code_hi = 0;
899 	svm->vmcb->control.exit_info_1  = 0;
900 	svm->vmcb->control.exit_info_2  = 0;
901 
902 	nested_svm_vmexit(svm);
903 
904 out:
905 	kvm_vcpu_unmap(vcpu, &map, true);
906 
907 	return ret;
908 }
909 
910 /* Copy state save area fields which are handled by VMRUN */
911 void svm_copy_vmrun_state(struct vmcb_save_area *to_save,
912 			  struct vmcb_save_area *from_save)
913 {
914 	to_save->es = from_save->es;
915 	to_save->cs = from_save->cs;
916 	to_save->ss = from_save->ss;
917 	to_save->ds = from_save->ds;
918 	to_save->gdtr = from_save->gdtr;
919 	to_save->idtr = from_save->idtr;
920 	to_save->rflags = from_save->rflags | X86_EFLAGS_FIXED;
921 	to_save->efer = from_save->efer;
922 	to_save->cr0 = from_save->cr0;
923 	to_save->cr3 = from_save->cr3;
924 	to_save->cr4 = from_save->cr4;
925 	to_save->rax = from_save->rax;
926 	to_save->rsp = from_save->rsp;
927 	to_save->rip = from_save->rip;
928 	to_save->cpl = 0;
929 }
930 
931 void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
932 {
933 	to_vmcb->save.fs = from_vmcb->save.fs;
934 	to_vmcb->save.gs = from_vmcb->save.gs;
935 	to_vmcb->save.tr = from_vmcb->save.tr;
936 	to_vmcb->save.ldtr = from_vmcb->save.ldtr;
937 	to_vmcb->save.kernel_gs_base = from_vmcb->save.kernel_gs_base;
938 	to_vmcb->save.star = from_vmcb->save.star;
939 	to_vmcb->save.lstar = from_vmcb->save.lstar;
940 	to_vmcb->save.cstar = from_vmcb->save.cstar;
941 	to_vmcb->save.sfmask = from_vmcb->save.sfmask;
942 	to_vmcb->save.sysenter_cs = from_vmcb->save.sysenter_cs;
943 	to_vmcb->save.sysenter_esp = from_vmcb->save.sysenter_esp;
944 	to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
945 }
946 
/*
 * Emulate a nested #VMEXIT.
 *
 * Maps vmcb12 (located at svm->nested.vmcb12_gpa), leaves guest mode, writes
 * the current state and the exit information held in vmcb02 back into
 * vmcb12, switches the vCPU back to vmcb01 and restores the state that was
 * saved there.
 *
 * Returns 0 on success.  Returns 1 if vmcb12 could not be mapped (a #GP is
 * injected when the mapping fails with -EINVAL) or if reloading CR3 from
 * vmcb01 fails.
 */
int nested_svm_vmexit(struct vcpu_svm *svm)
{
	struct kvm_vcpu *vcpu = &svm->vcpu;
	struct vmcb *vmcb01 = svm->vmcb01.ptr;
	struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
	struct vmcb *vmcb12;
	struct kvm_host_map map;
	int rc;

	/* Map the guest page holding vmcb12 so that it can be filled in below. */
	rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
	if (rc) {
		if (rc == -EINVAL)
			kvm_inject_gp(vcpu, 0);
		return 1;
	}

	vmcb12 = map.hva;

	/* Exit Guest-Mode */
	leave_guest_mode(vcpu);
	svm->nested.vmcb12_gpa = 0;
	WARN_ON_ONCE(svm->nested.nested_run_pending);

	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);

	/* in case we halted in L2 */
	svm->vcpu.arch.mp_state = KVM_MP_STATE_RUNNABLE;

	/* Give the current vmcb to the guest */

	vmcb12->save.es     = vmcb02->save.es;
	vmcb12->save.cs     = vmcb02->save.cs;
	vmcb12->save.ss     = vmcb02->save.ss;
	vmcb12->save.ds     = vmcb02->save.ds;
	vmcb12->save.gdtr   = vmcb02->save.gdtr;
	vmcb12->save.idtr   = vmcb02->save.idtr;
	vmcb12->save.efer   = svm->vcpu.arch.efer;
	vmcb12->save.cr0    = kvm_read_cr0(vcpu);
	vmcb12->save.cr3    = kvm_read_cr3(vcpu);
	vmcb12->save.cr2    = vmcb02->save.cr2;
	vmcb12->save.cr4    = svm->vcpu.arch.cr4;
	vmcb12->save.rflags = kvm_get_rflags(vcpu);
	vmcb12->save.rip    = kvm_rip_read(vcpu);
	vmcb12->save.rsp    = kvm_rsp_read(vcpu);
	vmcb12->save.rax    = kvm_rax_read(vcpu);
	vmcb12->save.dr7    = vmcb02->save.dr7;
	vmcb12->save.dr6    = svm->vcpu.arch.dr6;
	vmcb12->save.cpl    = vmcb02->save.cpl;

	/* Report the exit reason recorded in vmcb02. */
	vmcb12->control.int_state         = vmcb02->control.int_state;
	vmcb12->control.exit_code         = vmcb02->control.exit_code;
	vmcb12->control.exit_code_hi      = vmcb02->control.exit_code_hi;
	vmcb12->control.exit_info_1       = vmcb02->control.exit_info_1;
	vmcb12->control.exit_info_2       = vmcb02->control.exit_info_2;

	/* Pending event information is not saved for SVM_EXIT_ERR exits. */
	if (vmcb12->control.exit_code != SVM_EXIT_ERR)
		nested_save_pending_event_to_vmcb12(svm, vmcb12);

	if (svm->nrips_enabled)
		vmcb12->control.next_rip  = vmcb02->control.next_rip;

	/* These control fields are written back from the cached vmcb12 controls. */
	vmcb12->control.int_ctl           = svm->nested.ctl.int_ctl;
	vmcb12->control.tlb_ctl           = svm->nested.ctl.tlb_ctl;
	vmcb12->control.event_inj         = svm->nested.ctl.event_inj;
	vmcb12->control.event_inj_err     = svm->nested.ctl.event_inj_err;

	/* Carry the pause filter count from vmcb02 over to vmcb01. */
	if (!kvm_pause_in_guest(vcpu->kvm)) {
		vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count;
		vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);

	}

	nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);

	svm_switch_vmcb(svm, &svm->vmcb01);

	/*
	 * Copy the LBR state out of vmcb02: into vmcb12 when the cached
	 * vmcb12 controls enable LBR virtualization (and lbrv_enabled is
	 * set), otherwise into vmcb01 when vmcb01 itself has it enabled.
	 */
	if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
		svm_copy_lbrs(vmcb12, vmcb02);
		svm_update_lbrv(vcpu);
	} else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
		svm_copy_lbrs(vmcb01, vmcb02);
		svm_update_lbrv(vcpu);
	}

	/*
	 * On vmexit the GIF is set to false and
	 * no event can be injected in L1.
	 */
	svm_set_gif(svm, false);
	vmcb01->control.exit_int_info = 0;

	/* Go back to the L1 TSC offset, updating vmcb01 only if it changed. */
	svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset;
	if (vmcb01->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
		vmcb01->control.tsc_offset = svm->vcpu.arch.tsc_offset;
		vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
	}

	/* Go back to the L1 TSC scaling ratio if a non-default ratio MSR value is set. */
	if (svm->tsc_ratio_msr != kvm_caps.default_tsc_scaling_ratio) {
		WARN_ON(!svm->tsc_scaling_enabled);
		vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
		__svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
	}

	svm->nested.ctl.nested_cr3 = 0;

	/*
	 * Restore processor state that had been saved in vmcb01
	 */
	kvm_set_rflags(vcpu, vmcb01->save.rflags);
	svm_set_efer(vcpu, vmcb01->save.efer);
	svm_set_cr0(vcpu, vmcb01->save.cr0 | X86_CR0_PE);
	svm_set_cr4(vcpu, vmcb01->save.cr4);
	kvm_rax_write(vcpu, vmcb01->save.rax);
	kvm_rsp_write(vcpu, vmcb01->save.rsp);
	kvm_rip_write(vcpu, vmcb01->save.rip);

	/* DR7 is reset to its fixed bits only. */
	svm->vcpu.arch.dr7 = DR7_FIXED_1;
	kvm_update_dr7(&svm->vcpu);

	trace_kvm_nested_vmexit_inject(vmcb12->control.exit_code,
				       vmcb12->control.exit_info_1,
				       vmcb12->control.exit_info_2,
				       vmcb12->control.exit_int_info,
				       vmcb12->control.exit_int_info_err,
				       KVM_ISA_SVM);

	/* vmcb12 is not accessed past this point; release the mapping. */
	kvm_vcpu_unmap(vcpu, &map, true);

	nested_svm_transition_tlb_flush(vcpu);

	nested_svm_uninit_mmu_context(vcpu);

	rc = nested_svm_load_cr3(vcpu, vmcb01->save.cr3, false, true);
	if (rc)
		return 1;

	/*
	 * Drop what we picked up for L2 via svm_complete_interrupts() so it
	 * doesn't end up in L1.
	 */
	svm->vcpu.arch.nmi_injected = false;
	kvm_clear_exception_queue(vcpu);
	kvm_clear_interrupt_queue(vcpu);

	/*
	 * If we are here following the completion of a VMRUN that
	 * is being single-stepped, queue the pending #DB intercept
	 * right now so that it can be accounted for before we execute
	 * L1's next instruction.
	 */
	if (unlikely(vmcb01->save.rflags & X86_EFLAGS_TF))
		kvm_queue_exception(&(svm->vcpu), DB_VECTOR);

	/*
	 * Un-inhibit the AVIC right away, so that other vCPUs can start
	 * to benefit from it right away.
	 */
	if (kvm_apicv_activated(vcpu->kvm))
		kvm_vcpu_update_apicv(vcpu);

	return 0;
}
1109 
1110 static void nested_svm_triple_fault(struct kvm_vcpu *vcpu)
1111 {
1112 	nested_svm_simple_vmexit(to_svm(vcpu), SVM_EXIT_SHUTDOWN);
1113 }
1114 
1115 int svm_allocate_nested(struct vcpu_svm *svm)
1116 {
1117 	struct page *vmcb02_page;
1118 
1119 	if (svm->nested.initialized)
1120 		return 0;
1121 
1122 	vmcb02_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
1123 	if (!vmcb02_page)
1124 		return -ENOMEM;
1125 	svm->nested.vmcb02.ptr = page_address(vmcb02_page);
1126 	svm->nested.vmcb02.pa = __sme_set(page_to_pfn(vmcb02_page) << PAGE_SHIFT);
1127 
1128 	svm->nested.msrpm = svm_vcpu_alloc_msrpm();
1129 	if (!svm->nested.msrpm)
1130 		goto err_free_vmcb02;
1131 	svm_vcpu_init_msrpm(&svm->vcpu, svm->nested.msrpm);
1132 
1133 	svm->nested.initialized = true;
1134 	return 0;
1135 
1136 err_free_vmcb02:
1137 	__free_page(vmcb02_page);
1138 	return -ENOMEM;
1139 }
1140 
1141 void svm_free_nested(struct vcpu_svm *svm)
1142 {
1143 	if (!svm->nested.initialized)
1144 		return;
1145 
1146 	svm_vcpu_free_msrpm(svm->nested.msrpm);
1147 	svm->nested.msrpm = NULL;
1148 
1149 	__free_page(virt_to_page(svm->nested.vmcb02.ptr));
1150 	svm->nested.vmcb02.ptr = NULL;
1151 
1152 	/*
1153 	 * When last_vmcb12_gpa matches the current vmcb12 gpa,
1154 	 * some vmcb12 fields are not loaded if they are marked clean
1155 	 * in the vmcb12, since in this case they are up to date already.
1156 	 *
1157 	 * When the vmcb02 is freed, this optimization becomes invalid.
1158 	 */
1159 	svm->nested.last_vmcb12_gpa = INVALID_GPA;
1160 
1161 	svm->nested.initialized = false;
1162 }
1163 
1164 /*
1165  * Forcibly leave nested mode in order to be able to reset the VCPU later on.
1166  */
1167 void svm_leave_nested(struct kvm_vcpu *vcpu)
1168 {
1169 	struct vcpu_svm *svm = to_svm(vcpu);
1170 
1171 	if (is_guest_mode(vcpu)) {
1172 		svm->nested.nested_run_pending = 0;
1173 		svm->nested.vmcb12_gpa = INVALID_GPA;
1174 
1175 		leave_guest_mode(vcpu);
1176 
1177 		svm_switch_vmcb(svm, &svm->vmcb01);
1178 
1179 		nested_svm_uninit_mmu_context(vcpu);
1180 		vmcb_mark_all_dirty(svm->vmcb);
1181 	}
1182 
1183 	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
1184 }
1185 
1186 static int nested_svm_exit_handled_msr(struct vcpu_svm *svm)
1187 {
1188 	u32 offset, msr, value;
1189 	int write, mask;
1190 
1191 	if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)))
1192 		return NESTED_EXIT_HOST;
1193 
1194 	msr    = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1195 	offset = svm_msrpm_offset(msr);
1196 	write  = svm->vmcb->control.exit_info_1 & 1;
1197 	mask   = 1 << ((2 * (msr & 0xf)) + write);
1198 
1199 	if (offset == MSR_INVALID)
1200 		return NESTED_EXIT_DONE;
1201 
1202 	/* Offset is in 32 bit units but need in 8 bit units */
1203 	offset *= 4;
1204 
1205 	if (kvm_vcpu_read_guest(&svm->vcpu, svm->nested.ctl.msrpm_base_pa + offset, &value, 4))
1206 		return NESTED_EXIT_DONE;
1207 
1208 	return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
1209 }
1210 
1211 static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
1212 {
1213 	unsigned port, size, iopm_len;
1214 	u16 val, mask;
1215 	u8 start_bit;
1216 	u64 gpa;
1217 
1218 	if (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_IOIO_PROT)))
1219 		return NESTED_EXIT_HOST;
1220 
1221 	port = svm->vmcb->control.exit_info_1 >> 16;
1222 	size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >>
1223 		SVM_IOIO_SIZE_SHIFT;
1224 	gpa  = svm->nested.ctl.iopm_base_pa + (port / 8);
1225 	start_bit = port % 8;
1226 	iopm_len = (start_bit + size > 8) ? 2 : 1;
1227 	mask = (0xf >> (4 - size)) << start_bit;
1228 	val = 0;
1229 
1230 	if (kvm_vcpu_read_guest(&svm->vcpu, gpa, &val, iopm_len))
1231 		return NESTED_EXIT_DONE;
1232 
1233 	return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST;
1234 }
1235 
1236 static int nested_svm_intercept(struct vcpu_svm *svm)
1237 {
1238 	u32 exit_code = svm->vmcb->control.exit_code;
1239 	int vmexit = NESTED_EXIT_HOST;
1240 
1241 	switch (exit_code) {
1242 	case SVM_EXIT_MSR:
1243 		vmexit = nested_svm_exit_handled_msr(svm);
1244 		break;
1245 	case SVM_EXIT_IOIO:
1246 		vmexit = nested_svm_intercept_ioio(svm);
1247 		break;
1248 	case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
1249 		if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
1250 			vmexit = NESTED_EXIT_DONE;
1251 		break;
1252 	}
1253 	case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
1254 		if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
1255 			vmexit = NESTED_EXIT_DONE;
1256 		break;
1257 	}
1258 	case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
1259 		/*
1260 		 * Host-intercepted exceptions have been checked already in
1261 		 * nested_svm_exit_special.  There is nothing to do here,
1262 		 * the vmexit is injected by svm_check_nested_events.
1263 		 */
1264 		vmexit = NESTED_EXIT_DONE;
1265 		break;
1266 	}
1267 	case SVM_EXIT_ERR: {
1268 		vmexit = NESTED_EXIT_DONE;
1269 		break;
1270 	}
1271 	default: {
1272 		if (vmcb12_is_intercept(&svm->nested.ctl, exit_code))
1273 			vmexit = NESTED_EXIT_DONE;
1274 	}
1275 	}
1276 
1277 	return vmexit;
1278 }
1279 
1280 int nested_svm_exit_handled(struct vcpu_svm *svm)
1281 {
1282 	int vmexit;
1283 
1284 	vmexit = nested_svm_intercept(svm);
1285 
1286 	if (vmexit == NESTED_EXIT_DONE)
1287 		nested_svm_vmexit(svm);
1288 
1289 	return vmexit;
1290 }
1291 
1292 int nested_svm_check_permissions(struct kvm_vcpu *vcpu)
1293 {
1294 	if (!(vcpu->arch.efer & EFER_SVME) || !is_paging(vcpu)) {
1295 		kvm_queue_exception(vcpu, UD_VECTOR);
1296 		return 1;
1297 	}
1298 
1299 	if (to_svm(vcpu)->vmcb->save.cpl) {
1300 		kvm_inject_gp(vcpu, 0);
1301 		return 1;
1302 	}
1303 
1304 	return 0;
1305 }
1306 
1307 static bool nested_exit_on_exception(struct vcpu_svm *svm)
1308 {
1309 	unsigned int nr = svm->vcpu.arch.exception.nr;
1310 
1311 	return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(nr));
1312 }
1313 
1314 static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
1315 {
1316 	unsigned int nr = svm->vcpu.arch.exception.nr;
1317 	struct vmcb *vmcb = svm->vmcb;
1318 
1319 	vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
1320 	vmcb->control.exit_code_hi = 0;
1321 
1322 	if (svm->vcpu.arch.exception.has_error_code)
1323 		vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code;
1324 
1325 	/*
1326 	 * EXITINFO2 is undefined for all exception intercepts other
1327 	 * than #PF.
1328 	 */
1329 	if (nr == PF_VECTOR) {
1330 		if (svm->vcpu.arch.exception.nested_apf)
1331 			vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
1332 		else if (svm->vcpu.arch.exception.has_payload)
1333 			vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
1334 		else
1335 			vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
1336 	} else if (nr == DB_VECTOR) {
1337 		/* See inject_pending_event.  */
1338 		kvm_deliver_exception_payload(&svm->vcpu);
1339 		if (svm->vcpu.arch.dr7 & DR7_GD) {
1340 			svm->vcpu.arch.dr7 &= ~DR7_GD;
1341 			kvm_update_dr7(&svm->vcpu);
1342 		}
1343 	} else
1344 		WARN_ON(svm->vcpu.arch.exception.has_payload);
1345 
1346 	nested_svm_vmexit(svm);
1347 }
1348 
1349 static inline bool nested_exit_on_init(struct vcpu_svm *svm)
1350 {
1351 	return vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_INIT);
1352 }
1353 
1354 static int svm_check_nested_events(struct kvm_vcpu *vcpu)
1355 {
1356 	struct vcpu_svm *svm = to_svm(vcpu);
1357 	bool block_nested_events =
1358 		kvm_event_needs_reinjection(vcpu) || svm->nested.nested_run_pending;
1359 	struct kvm_lapic *apic = vcpu->arch.apic;
1360 
1361 	if (lapic_in_kernel(vcpu) &&
1362 	    test_bit(KVM_APIC_INIT, &apic->pending_events)) {
1363 		if (block_nested_events)
1364 			return -EBUSY;
1365 		if (!nested_exit_on_init(svm))
1366 			return 0;
1367 		nested_svm_simple_vmexit(svm, SVM_EXIT_INIT);
1368 		return 0;
1369 	}
1370 
1371 	if (vcpu->arch.exception.pending) {
1372 		/*
1373 		 * Only a pending nested run can block a pending exception.
1374 		 * Otherwise an injected NMI/interrupt should either be
1375 		 * lost or delivered to the nested hypervisor in the EXITINTINFO
1376 		 * vmcb field, while delivering the pending exception.
1377 		 */
1378 		if (svm->nested.nested_run_pending)
1379                         return -EBUSY;
1380 		if (!nested_exit_on_exception(svm))
1381 			return 0;
1382 		nested_svm_inject_exception_vmexit(svm);
1383 		return 0;
1384 	}
1385 
1386 	if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) {
1387 		if (block_nested_events)
1388 			return -EBUSY;
1389 		if (!nested_exit_on_smi(svm))
1390 			return 0;
1391 		nested_svm_simple_vmexit(svm, SVM_EXIT_SMI);
1392 		return 0;
1393 	}
1394 
1395 	if (vcpu->arch.nmi_pending && !svm_nmi_blocked(vcpu)) {
1396 		if (block_nested_events)
1397 			return -EBUSY;
1398 		if (!nested_exit_on_nmi(svm))
1399 			return 0;
1400 		nested_svm_simple_vmexit(svm, SVM_EXIT_NMI);
1401 		return 0;
1402 	}
1403 
1404 	if (kvm_cpu_has_interrupt(vcpu) && !svm_interrupt_blocked(vcpu)) {
1405 		if (block_nested_events)
1406 			return -EBUSY;
1407 		if (!nested_exit_on_intr(svm))
1408 			return 0;
1409 		trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
1410 		nested_svm_simple_vmexit(svm, SVM_EXIT_INTR);
1411 		return 0;
1412 	}
1413 
1414 	return 0;
1415 }
1416 
1417 int nested_svm_exit_special(struct vcpu_svm *svm)
1418 {
1419 	u32 exit_code = svm->vmcb->control.exit_code;
1420 
1421 	switch (exit_code) {
1422 	case SVM_EXIT_INTR:
1423 	case SVM_EXIT_NMI:
1424 	case SVM_EXIT_NPF:
1425 		return NESTED_EXIT_HOST;
1426 	case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: {
1427 		u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
1428 
1429 		if (svm->vmcb01.ptr->control.intercepts[INTERCEPT_EXCEPTION] &
1430 		    excp_bits)
1431 			return NESTED_EXIT_HOST;
1432 		else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR &&
1433 			 svm->vcpu.arch.apf.host_apf_flags)
1434 			/* Trap async PF even if not shadowing */
1435 			return NESTED_EXIT_HOST;
1436 		break;
1437 	}
1438 	default:
1439 		break;
1440 	}
1441 
1442 	return NESTED_EXIT_CONTINUE;
1443 }
1444 
1445 void nested_svm_update_tsc_ratio_msr(struct kvm_vcpu *vcpu)
1446 {
1447 	struct vcpu_svm *svm = to_svm(vcpu);
1448 
1449 	vcpu->arch.tsc_scaling_ratio =
1450 		kvm_calc_nested_tsc_multiplier(vcpu->arch.l1_tsc_scaling_ratio,
1451 					       svm->tsc_ratio_msr);
1452 	__svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
1453 }
1454 
1455 /* Inverse operation of nested_copy_vmcb_control_to_cache(). asid is copied too. */
1456 static void nested_copy_vmcb_cache_to_control(struct vmcb_control_area *dst,
1457 					      struct vmcb_ctrl_area_cached *from)
1458 {
1459 	unsigned int i;
1460 
1461 	memset(dst, 0, sizeof(struct vmcb_control_area));
1462 
1463 	for (i = 0; i < MAX_INTERCEPT; i++)
1464 		dst->intercepts[i] = from->intercepts[i];
1465 
1466 	dst->iopm_base_pa         = from->iopm_base_pa;
1467 	dst->msrpm_base_pa        = from->msrpm_base_pa;
1468 	dst->tsc_offset           = from->tsc_offset;
1469 	dst->asid                 = from->asid;
1470 	dst->tlb_ctl              = from->tlb_ctl;
1471 	dst->int_ctl              = from->int_ctl;
1472 	dst->int_vector           = from->int_vector;
1473 	dst->int_state            = from->int_state;
1474 	dst->exit_code            = from->exit_code;
1475 	dst->exit_code_hi         = from->exit_code_hi;
1476 	dst->exit_info_1          = from->exit_info_1;
1477 	dst->exit_info_2          = from->exit_info_2;
1478 	dst->exit_int_info        = from->exit_int_info;
1479 	dst->exit_int_info_err    = from->exit_int_info_err;
1480 	dst->nested_ctl           = from->nested_ctl;
1481 	dst->event_inj            = from->event_inj;
1482 	dst->event_inj_err        = from->event_inj_err;
1483 	dst->next_rip             = from->next_rip;
1484 	dst->nested_cr3           = from->nested_cr3;
1485 	dst->virt_ext              = from->virt_ext;
1486 	dst->pause_filter_count   = from->pause_filter_count;
1487 	dst->pause_filter_thresh  = from->pause_filter_thresh;
1488 	/* 'clean' and 'reserved_sw' are not changed by KVM */
1489 }
1490 
1491 static int svm_get_nested_state(struct kvm_vcpu *vcpu,
1492 				struct kvm_nested_state __user *user_kvm_nested_state,
1493 				u32 user_data_size)
1494 {
1495 	struct vcpu_svm *svm;
1496 	struct vmcb_control_area *ctl;
1497 	unsigned long r;
1498 	struct kvm_nested_state kvm_state = {
1499 		.flags = 0,
1500 		.format = KVM_STATE_NESTED_FORMAT_SVM,
1501 		.size = sizeof(kvm_state),
1502 	};
1503 	struct vmcb __user *user_vmcb = (struct vmcb __user *)
1504 		&user_kvm_nested_state->data.svm[0];
1505 
1506 	if (!vcpu)
1507 		return kvm_state.size + KVM_STATE_NESTED_SVM_VMCB_SIZE;
1508 
1509 	svm = to_svm(vcpu);
1510 
1511 	if (user_data_size < kvm_state.size)
1512 		goto out;
1513 
1514 	/* First fill in the header and copy it out.  */
1515 	if (is_guest_mode(vcpu)) {
1516 		kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb12_gpa;
1517 		kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE;
1518 		kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
1519 
1520 		if (svm->nested.nested_run_pending)
1521 			kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
1522 	}
1523 
1524 	if (gif_set(svm))
1525 		kvm_state.flags |= KVM_STATE_NESTED_GIF_SET;
1526 
1527 	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
1528 		return -EFAULT;
1529 
1530 	if (!is_guest_mode(vcpu))
1531 		goto out;
1532 
1533 	/*
1534 	 * Copy over the full size of the VMCB rather than just the size
1535 	 * of the structs.
1536 	 */
1537 	if (clear_user(user_vmcb, KVM_STATE_NESTED_SVM_VMCB_SIZE))
1538 		return -EFAULT;
1539 
1540 	ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);
1541 	if (!ctl)
1542 		return -ENOMEM;
1543 
1544 	nested_copy_vmcb_cache_to_control(ctl, &svm->nested.ctl);
1545 	r = copy_to_user(&user_vmcb->control, ctl,
1546 			 sizeof(user_vmcb->control));
1547 	kfree(ctl);
1548 	if (r)
1549 		return -EFAULT;
1550 
1551 	if (copy_to_user(&user_vmcb->save, &svm->vmcb01.ptr->save,
1552 			 sizeof(user_vmcb->save)))
1553 		return -EFAULT;
1554 out:
1555 	return kvm_state.size;
1556 }
1557 
/*
 * Load nested state supplied by userspace into @vcpu.
 *
 * @vcpu:                  vCPU to update.
 * @user_kvm_nested_state: userspace buffer; the VMCB area (control area
 *                         followed by save area) is read from its data.svm
 *                         member.
 * @kvm_state:             kernel copy of the state header.
 *
 * Without KVM_STATE_NESTED_GUEST_MODE the vCPU simply leaves nested mode
 * and GIF is set from the flags.  Otherwise the control and save areas are
 * copied in and validated, and the vCPU is switched to vmcb02.
 *
 * Returns 0 on success, -EINVAL if the header or the VMCB contents fail
 * validation, -ENOMEM if the temporary buffers cannot be allocated, -EFAULT
 * if the userspace buffer cannot be read, or the error returned by
 * nested_svm_load_cr3().
 */
static int svm_set_nested_state(struct kvm_vcpu *vcpu,
				struct kvm_nested_state __user *user_kvm_nested_state,
				struct kvm_nested_state *kvm_state)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb __user *user_vmcb = (struct vmcb __user *)
		&user_kvm_nested_state->data.svm[0];
	struct vmcb_control_area *ctl;
	struct vmcb_save_area *save;
	struct vmcb_save_area_cached save_cached;
	struct vmcb_ctrl_area_cached ctl_cached;
	unsigned long cr0;
	int ret;

	/* The control and save areas together must fit in the VMCB area. */
	BUILD_BUG_ON(sizeof(struct vmcb_control_area) + sizeof(struct vmcb_save_area) >
		     KVM_STATE_NESTED_SVM_VMCB_SIZE);

	if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM)
		return -EINVAL;

	/* Reject unknown flags. */
	if (kvm_state->flags & ~(KVM_STATE_NESTED_GUEST_MODE |
				 KVM_STATE_NESTED_RUN_PENDING |
				 KVM_STATE_NESTED_GIF_SET))
		return -EINVAL;

	/*
	 * If in guest mode, vcpu->arch.efer actually refers to the L2 guest's
	 * EFER.SVME, but EFER.SVME still has to be 1 for VMRUN to succeed.
	 */
	if (!(vcpu->arch.efer & EFER_SVME)) {
		/* GIF=1 and no guest mode are required if SVME=0.  */
		if (kvm_state->flags != KVM_STATE_NESTED_GIF_SET)
			return -EINVAL;
	}

	/* SMM temporarily disables SVM, so we cannot be in guest mode.  */
	if (is_smm(vcpu) && (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
		return -EINVAL;

	/* Not in guest mode: leave nested mode, set GIF, and we are done. */
	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) {
		svm_leave_nested(vcpu);
		svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
		return 0;
	}

	if (!page_address_valid(vcpu, kvm_state->hdr.svm.vmcb_pa))
		return -EINVAL;
	if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE)
		return -EINVAL;

	/* From here on, 'ret' holds the error reported by the next failing step. */
	ret  = -ENOMEM;
	ctl  = kzalloc(sizeof(*ctl),  GFP_KERNEL_ACCOUNT);
	save = kzalloc(sizeof(*save), GFP_KERNEL_ACCOUNT);
	if (!ctl || !save)
		goto out_free;

	ret = -EFAULT;
	if (copy_from_user(ctl, &user_vmcb->control, sizeof(*ctl)))
		goto out_free;
	if (copy_from_user(save, &user_vmcb->save, sizeof(*save)))
		goto out_free;

	ret = -EINVAL;
	__nested_copy_vmcb_control_to_cache(vcpu, &ctl_cached, ctl);
	if (!__nested_vmcb_check_controls(vcpu, &ctl_cached))
		goto out_free;

	/*
	 * Processor state contains L2 state.  Check that it is
	 * valid for guest mode (see nested_vmcb_check_save).
	 */
	cr0 = kvm_read_cr0(vcpu);
        if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW))
		goto out_free;

	/*
	 * Validate host state saved from before VMRUN (see
	 * nested_svm_check_permissions).
	 */
	__nested_copy_vmcb_save_to_cache(&save_cached, save);
	if (!(save->cr0 & X86_CR0_PG) ||
	    !(save->cr0 & X86_CR0_PE) ||
	    (save->rflags & X86_EFLAGS_VM) ||
	    !__nested_vmcb_check_save(vcpu, &save_cached))
		goto out_free;


	/*
	 * All checks done, we can enter guest mode. Userspace provides
	 * vmcb12.control, which will be combined with L1 and stored into
	 * vmcb02, and the L1 save state which we store in vmcb01.
	 * L2 registers if needed are moved from the current VMCB to VMCB02.
	 */

	if (is_guest_mode(vcpu))
		svm_leave_nested(vcpu);
	else
		svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;

	svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));

	svm->nested.nested_run_pending =
		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);

	svm->nested.vmcb12_gpa = kvm_state->hdr.svm.vmcb_pa;

	svm_copy_vmrun_state(&svm->vmcb01.ptr->save, save);
	nested_copy_vmcb_control_to_cache(svm, ctl);

	svm_switch_vmcb(svm, &svm->nested.vmcb02);
	nested_vmcb02_prepare_control(svm, svm->vmcb->save.rip, svm->vmcb->save.cs.base);

	/*
	 * While the nested guest CR3 is already checked and set by
	 * KVM_SET_SREGS, it was set when nested state was not yet loaded,
	 * thus MMU might not be initialized correctly.
	 * Set it again to fix this.
	 */

	ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3,
				  nested_npt_enabled(svm), false);
	if (WARN_ON_ONCE(ret))
		goto out_free;

	svm->nested.force_msr_bitmap_recalc = true;

	kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
	ret = 0;
out_free:
	/* The temporary copies are released on every path, success included. */
	kfree(save);
	kfree(ctl);

	return ret;
}
1692 
1693 static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
1694 {
1695 	struct vcpu_svm *svm = to_svm(vcpu);
1696 
1697 	if (WARN_ON(!is_guest_mode(vcpu)))
1698 		return true;
1699 
1700 	if (!vcpu->arch.pdptrs_from_userspace &&
1701 	    !nested_npt_enabled(svm) && is_pae_paging(vcpu))
1702 		/*
1703 		 * Reload the guest's PDPTRs since after a migration
1704 		 * the guest CR3 might be restored prior to setting the nested
1705 		 * state which can lead to a load of wrong PDPTRs.
1706 		 */
1707 		if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3)))
1708 			return false;
1709 
1710 	if (!nested_svm_vmrun_msrpm(svm)) {
1711 		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
1712 		vcpu->run->internal.suberror =
1713 			KVM_INTERNAL_ERROR_EMULATION;
1714 		vcpu->run->internal.ndata = 0;
1715 		return false;
1716 	}
1717 
1718 	return true;
1719 }
1720 
1721 struct kvm_x86_nested_ops svm_nested_ops = {
1722 	.leave_nested = svm_leave_nested,
1723 	.check_events = svm_check_nested_events,
1724 	.handle_page_fault_workaround = nested_svm_handle_page_fault_workaround,
1725 	.triple_fault = nested_svm_triple_fault,
1726 	.get_nested_state_pages = svm_get_nested_state_pages,
1727 	.get_state = svm_get_nested_state,
1728 	.set_state = svm_set_nested_state,
1729 };
1730