// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/objtool.h>
#include <linux/percpu.h>

#include <asm/debugreg.h>
#include <asm/mmu_context.h>

#include "cpuid.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
#include "pmu.h"
#include "posted_intr.h"
#include "sgx.h"
#include "trace.h"
#include "vmx.h"
#include "x86.h"
#include "smm.h"

static bool __read_mostly enable_shadow_vmcs = 1;
module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);

static bool __read_mostly nested_early_check = 0;
module_param(nested_early_check, bool, S_IRUGO);

#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK

/*
 * Hyper-V requires all of these, so mark them as supported even though
 * they are just treated the same as all-context.
 */
#define VMX_VPID_EXTENT_SUPPORTED_MASK		\
	(VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT |	\
	VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT |	\
	VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT)

#define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5

enum {
	VMX_VMREAD_BITMAP,
	VMX_VMWRITE_BITMAP,
	VMX_BITMAP_NR
};
static unsigned long *vmx_bitmap[VMX_BITMAP_NR];

#define vmx_vmread_bitmap	(vmx_bitmap[VMX_VMREAD_BITMAP])
#define vmx_vmwrite_bitmap	(vmx_bitmap[VMX_VMWRITE_BITMAP])

struct shadow_vmcs_field {
	u16	encoding;
	u16	offset;
};
static struct shadow_vmcs_field shadow_read_only_fields[] = {
#define SHADOW_FIELD_RO(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_only_fields =
	ARRAY_SIZE(shadow_read_only_fields);

static struct shadow_vmcs_field shadow_read_write_fields[] = {
#define SHADOW_FIELD_RW(x, y) { x, offsetof(struct vmcs12, y) },
#include "vmcs_shadow_fields.h"
};
static int max_shadow_read_write_fields =
	ARRAY_SIZE(shadow_read_write_fields);

static void init_vmcs_shadow_fields(void)
{
	int i, j;

	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);

	for (i = j = 0; i < max_shadow_read_only_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_only_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_only_fields ||
		     shadow_read_only_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_only_field %x\n",
			       field + 1);

		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_only_fields[j++] = entry;
	}
	max_shadow_read_only_fields = j;

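	/*
	 * Note: for 64-bit fields, an odd encoding is the field's high half.
	 * On 64-bit kernels the high-half entry is dropped from the copy
	 * arrays because the full field is read/written in a single access;
	 * on 32-bit kernels it is kept, with the offset bumped by sizeof(u32)
	 * so that it maps to the upper dword in struct vmcs12.
	 */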
	for (i = j = 0; i < max_shadow_read_write_fields; i++) {
		struct shadow_vmcs_field entry = shadow_read_write_fields[i];
		u16 field = entry.encoding;

		if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 &&
		    (i + 1 == max_shadow_read_write_fields ||
		     shadow_read_write_fields[i + 1].encoding != field + 1))
			pr_err("Missing field from shadow_read_write_field %x\n",
			       field + 1);

		WARN_ONCE(field >= GUEST_ES_AR_BYTES &&
			  field <= GUEST_TR_AR_BYTES,
			  "Update vmcs12_write_any() to drop reserved bits from AR_BYTES");

		/*
		 * PML and the preemption timer can be emulated, but the
		 * processor cannot vmwrite to fields that don't exist
		 * on bare metal.
		 */
		switch (field) {
		case GUEST_PML_INDEX:
			if (!cpu_has_vmx_pml())
				continue;
			break;
		case VMX_PREEMPTION_TIMER_VALUE:
			if (!cpu_has_vmx_preemption_timer())
				continue;
			break;
		case GUEST_INTR_STATUS:
			if (!cpu_has_vmx_apicv())
				continue;
			break;
		default:
			break;
		}

		clear_bit(field, vmx_vmwrite_bitmap);
		clear_bit(field, vmx_vmread_bitmap);
		if (field & 1)
#ifdef CONFIG_X86_64
			continue;
#else
			entry.offset += sizeof(u32);
#endif
		shadow_read_write_fields[j++] = entry;
	}
	max_shadow_read_write_fields = j;
}

/*
 * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
 * set the success or error code of an emulated VMX instruction (as specified
 * by Vol 2B, VMX Instruction Reference, "Conventions"), and skip the emulated
 * instruction.
 */
static int nested_vmx_succeed(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF));
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_CF);
	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
				u32 vm_instruction_error)
{
	vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
			& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
			    X86_EFLAGS_SF | X86_EFLAGS_OF))
			| X86_EFLAGS_ZF);
	get_vmcs12(vcpu)->vm_instruction_error = vm_instruction_error;
	/*
	 * We don't need to force sync to shadow VMCS because
	 * VM_INSTRUCTION_ERROR is not shadowed. Enlightened VMCS 'shadows' all
	 * fields and thus must be synced.
	 */
	if (to_vmx(vcpu)->nested.hv_evmcs_vmptr != EVMPTR_INVALID)
		to_vmx(vcpu)->nested.need_vmcs12_to_shadow_sync = true;

	return kvm_skip_emulated_instruction(vcpu);
}

static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * failValid writes the error number to the current VMCS, which
	 * can't be done if there isn't a current VMCS.
	 */
	if (vmx->nested.current_vmptr == INVALID_GPA &&
	    !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
		return nested_vmx_failInvalid(vcpu);

	return nested_vmx_failValid(vcpu, vm_instruction_error);
}

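/*
 * Summary of the RFLAGS conventions used above (SDM Vol 2B, VMX Instruction
 * Reference, "Conventions"):
 *   VMsucceed:     all of CF/PF/AF/ZF/SF/OF cleared
 *   VMfailInvalid: CF set, the other arithmetic flags cleared (no current VMCS)
 *   VMfailValid:   ZF set, the other arithmetic flags cleared, and the error
 *                  number written to the current VMCS's VM_INSTRUCTION_ERROR
 */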
static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
	/* TODO: not to reset guest simply here. */
	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
	pr_debug_ratelimited("nested vmx abort, indicator %d\n", indicator);
}

static inline bool vmx_control_verify(u32 control, u32 low, u32 high)
{
	return fixed_bits_valid(control, low, high);
}

static inline u64 vmx_control_msr(u32 low, u32 high)
{
	return low | ((u64)high << 32);
}

static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
{
	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
	vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA);
	vmx->nested.need_vmcs12_to_shadow_sync = false;
}

static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
{
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
		kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
		vmx->nested.hv_evmcs = NULL;
	}

	vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;

	if (hv_vcpu) {
		hv_vcpu->nested.pa_page_gpa = INVALID_GPA;
		hv_vcpu->nested.vm_id = 0;
		hv_vcpu->nested.vp_id = 0;
	}
}

static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
				     struct loaded_vmcs *prev)
{
	struct vmcs_host_state *dest, *src;

	if (unlikely(!vmx->guest_state_loaded))
		return;

	src = &prev->host_state;
	dest = &vmx->loaded_vmcs->host_state;

	vmx_set_host_fs_gs(dest, src->fs_sel, src->gs_sel, src->fs_base, src->gs_base);
	dest->ldt_sel = src->ldt_sel;
#ifdef CONFIG_X86_64
	dest->ds_sel = src->ds_sel;
	dest->es_sel = src->es_sel;
#endif
}

static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct loaded_vmcs *prev;
	int cpu;

	if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
		return;

	cpu = get_cpu();
	prev = vmx->loaded_vmcs;
	vmx->loaded_vmcs = vmcs;
	vmx_vcpu_load_vmcs(vcpu, cpu, prev);
	vmx_sync_vmcs_host_state(vmx, prev);
	put_cpu();

	vcpu->arch.regs_avail = ~VMX_REGS_LAZY_LOAD_SET;

	/*
	 * All lazily updated registers will be reloaded from VMCS12 on both
	 * vmentry and vmexit.
	 */
	vcpu->arch.regs_dirty = 0;
}

/*
 * Free whatever needs to be freed from vmx->nested when L1 goes down, or
 * just stops using VMX.
 */
static void free_nested(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
		vmx_switch_vmcs(vcpu, &vmx->vmcs01);

	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
		return;

	kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);

	vmx->nested.vmxon = false;
	vmx->nested.smm.vmxon = false;
	vmx->nested.vmxon_ptr = INVALID_GPA;
	free_vpid(vmx->nested.vpid02);
	vmx->nested.posted_intr_nv = -1;
	vmx->nested.current_vmptr = INVALID_GPA;
	if (enable_shadow_vmcs) {
		vmx_disable_shadow_vmcs(vmx);
		vmcs_clear(vmx->vmcs01.shadow_vmcs);
		free_vmcs(vmx->vmcs01.shadow_vmcs);
		vmx->vmcs01.shadow_vmcs = NULL;
	}
	kfree(vmx->nested.cached_vmcs12);
	vmx->nested.cached_vmcs12 = NULL;
	kfree(vmx->nested.cached_shadow_vmcs12);
	vmx->nested.cached_shadow_vmcs12 = NULL;
	/*
	 * Unpin physical memory we referred to in the vmcs02. The APIC access
	 * page's backing page (yeah, confusing) shouldn't actually be accessed,
	 * and if it is written, the contents are irrelevant.
	 */
	kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false);
	kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
	kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
	vmx->nested.pi_desc = NULL;

	kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);

	nested_release_evmcs(vcpu);

	free_loaded_vmcs(&vmx->nested.vmcs02);
}

/*
 * Ensure that the current vmcs of the logical processor is the
 * vmcs01 of the vcpu before calling free_nested().
 */
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	vmx_leave_nested(vcpu);
	vcpu_put(vcpu);
}

#define EPTP_PA_MASK	GENMASK_ULL(51, 12)

static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
{
	return VALID_PAGE(root_hpa) &&
	       ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
}

static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
				       gpa_t addr)
{
	unsigned long roots = 0;
	uint i;
	struct kvm_mmu_root_info *cached_root;

	WARN_ON_ONCE(!mmu_is_nested(vcpu));

	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
		cached_root = &vcpu->arch.mmu->prev_roots[i];

		if (nested_ept_root_matches(cached_root->hpa, cached_root->pgd,
					    eptp))
			roots |= KVM_MMU_ROOT_PREVIOUS(i);
	}
	if (roots)
		kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots);
}

static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
					 struct x86_exception *fault)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u32 vm_exit_reason;
	unsigned long exit_qualification = vcpu->arch.exit_qualification;

	if (vmx->nested.pml_full) {
		vm_exit_reason = EXIT_REASON_PML_FULL;
		vmx->nested.pml_full = false;
		exit_qualification &= INTR_INFO_UNBLOCK_NMI;
	} else {
		if (fault->error_code & PFERR_RSVD_MASK)
			vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
		else
			vm_exit_reason = EXIT_REASON_EPT_VIOLATION;

		/*
		 * Although the caller (kvm_inject_emulated_page_fault) would
		 * have already synced the faulting address in the shadow EPT
		 * tables for the current EPTP12, we also need to sync it for
		 * any other cached EPTP02s based on the same EP4TA, since the
		 * TLB associates mappings to the EP4TA rather than the full EPTP.
		 */
		nested_ept_invalidate_addr(vcpu, vmcs12->ept_pointer,
					   fault->address);
	}

	nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
	vmcs12->guest_physical_address = fault->address;
}

static void nested_ept_new_eptp(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool execonly = vmx->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT;
	int ept_lpage_level = ept_caps_to_lpage_level(vmx->nested.msrs.ept_caps);

	kvm_init_shadow_ept_mmu(vcpu, execonly, ept_lpage_level,
				nested_ept_ad_enabled(vcpu),
				nested_ept_get_eptp(vcpu));
}

static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
{
	WARN_ON(mmu_is_nested(vcpu));

	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
	nested_ept_new_eptp(vcpu);
	vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
	vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
	vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;

	vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}

static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
	vcpu->arch.mmu = &vcpu->arch.root_mmu;
	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}

static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
					    u16 error_code)
{
	bool inequality, bit;

	bit = (vmcs12->exception_bitmap & (1u << PF_VECTOR)) != 0;
	inequality =
		(error_code & vmcs12->page_fault_error_code_mask) !=
		 vmcs12->page_fault_error_code_match;
	return inequality ^ bit;
}

static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
					   u32 error_code)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	/*
	 * Drop bits 31:16 of the error code when performing the #PF mask+match
	 * check.  All VMCS fields involved are 32 bits, but Intel CPUs never
	 * set bits 31:16 and VMX disallows setting bits 31:16 in the injected
	 * error code.  Including the to-be-dropped bits in the check might
	 * result in an "impossible" or missed exit from L1's perspective.
	 */
	if (vector == PF_VECTOR)
		return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code);

	return (vmcs12->exception_bitmap & (1u << vector));
}

static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
					       struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->io_bitmap_a)) ||
	    CC(!page_address_valid(vcpu, vmcs12->io_bitmap_b)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_bitmap_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->msr_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu,
						struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->virtual_apic_page_addr)))
		return -EINVAL;

	return 0;
}

/*
 * For x2APIC MSRs, ignore the vmcs01 bitmap.  L1 can enable x2APIC without L1
 * itself utilizing x2APIC.  All MSRs were previously set to be intercepted,
 * only the "disable intercept" case needs to be handled.
 */
static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1,
							unsigned long *msr_bitmap_l0,
							u32 msr, int type)
{
	if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr))
		vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr);

	if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr))
		vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr);
}

static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
{
	int msr;

	for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
		unsigned word = msr / BITS_PER_LONG;

		msr_bitmap[word] = ~0;
		msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
	}
}

#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw)					\
static inline									\
void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx,			\
					 unsigned long *msr_bitmap_l1,		\
					 unsigned long *msr_bitmap_l0, u32 msr)	\
{										\
	if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) ||		\
	    vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr))			\
		vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr);			\
	else									\
		vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr);			\
}
BUILD_NVMX_MSR_INTERCEPT_HELPER(read)
BUILD_NVMX_MSR_INTERCEPT_HELPER(write)

static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx,
						    unsigned long *msr_bitmap_l1,
						    unsigned long *msr_bitmap_l0,
						    u32 msr, int types)
{
	if (types & MSR_TYPE_R)
		nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1,
						  msr_bitmap_l0, msr);
	if (types & MSR_TYPE_W)
		nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1,
						   msr_bitmap_l0, msr);
}

/*
 * Merge L0's and L1's MSR bitmap, return false to indicate that
 * we do not use the hardware.
 */
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	int msr;
	unsigned long *msr_bitmap_l1;
	unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap;
	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
	struct kvm_host_map *map = &vmx->nested.msr_bitmap_map;

	/* Nothing to do if the MSR bitmap is not in use. */
	if (!cpu_has_vmx_msr_bitmap() ||
	    !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
		return false;

	/*
	 * MSR bitmap update can be skipped when:
	 * - MSR bitmap for L1 hasn't changed.
	 * - Nested hypervisor (L1) is attempting to launch the same L2 as
	 *   before.
	 * - Nested hypervisor (L1) has enabled 'Enlightened MSR Bitmap' feature
	 *   and tells KVM (L0) there were no changes in MSR bitmap for L2.
	 */
	if (!vmx->nested.force_msr_bitmap_recalc && evmcs &&
	    evmcs->hv_enlightenments_control.msr_bitmap &&
	    evmcs->hv_clean_fields & HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP)
		return true;

	if (kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->msr_bitmap), map))
		return false;

	msr_bitmap_l1 = (unsigned long *)map->hva;

	/*
	 * To keep the control flow simple, pay eight 8-byte writes (sixteen
	 * 4-byte writes on 32-bit systems) up front to enable intercepts for
	 * the x2APIC MSR range and selectively toggle those relevant to L2.
	 */
	enable_x2apic_msr_intercepts(msr_bitmap_l0);

	if (nested_cpu_has_virt_x2apic_mode(vmcs12)) {
		if (nested_cpu_has_apic_reg_virt(vmcs12)) {
			/*
			 * L0 need not intercept reads for MSRs between 0x800
			 * and 0x8ff, it just lets the processor take the value
			 * from the virtual-APIC page; take those 256 bits
			 * directly from the L1 bitmap.
			 */
			for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
				unsigned word = msr / BITS_PER_LONG;

				msr_bitmap_l0[word] = msr_bitmap_l1[word];
			}
		}

		nested_vmx_disable_intercept_for_x2apic_msr(
			msr_bitmap_l1, msr_bitmap_l0,
			X2APIC_MSR(APIC_TASKPRI),
			MSR_TYPE_R | MSR_TYPE_W);

		if (nested_cpu_has_vid(vmcs12)) {
			nested_vmx_disable_intercept_for_x2apic_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_EOI),
				MSR_TYPE_W);
			nested_vmx_disable_intercept_for_x2apic_msr(
				msr_bitmap_l1, msr_bitmap_l0,
				X2APIC_MSR(APIC_SELF_IPI),
				MSR_TYPE_W);
		}
	}

	/*
	 * Always check vmcs01's bitmap to honor userspace MSR filters and any
	 * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through.
	 */
#ifdef CONFIG_X86_64
	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_FS_BASE, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_GS_BASE, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
#endif
	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_SPEC_CTRL, MSR_TYPE_RW);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_PRED_CMD, MSR_TYPE_W);

	nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0,
					 MSR_IA32_FLUSH_CMD, MSR_TYPE_W);

	kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false);

	vmx->nested.force_msr_bitmap_recalc = false;

	return true;
}

static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
				       struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == INVALID_GPA)
		return;

	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_read_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
			      VMCS12_SIZE);
}

static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
					      struct vmcs12 *vmcs12)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache;

	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
	    vmcs12->vmcs_link_pointer == INVALID_GPA)
		return;

	if (ghc->gpa != vmcs12->vmcs_link_pointer &&
	    kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc,
				      vmcs12->vmcs_link_pointer, VMCS12_SIZE))
		return;

	kvm_write_guest_cached(vmx->vcpu.kvm, ghc, get_shadow_vmcs12(vcpu),
			       VMCS12_SIZE);
}

/*
 * In nested virtualization, check if L1 has set
 * VM_EXIT_ACK_INTR_ON_EXIT
 */
static bool nested_exit_intr_ack_set(struct kvm_vcpu *vcpu)
{
	return get_vmcs12(vcpu)->vm_exit_controls &
		VM_EXIT_ACK_INTR_ON_EXIT;
}

static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) &&
	    CC(!page_address_valid(vcpu, vmcs12->apic_access_addr)))
		return -EINVAL;
	else
		return 0;
}

static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
					   struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	    !nested_cpu_has_apic_reg_virt(vmcs12) &&
	    !nested_cpu_has_vid(vmcs12) &&
	    !nested_cpu_has_posted_intr(vmcs12))
		return 0;

	/*
	 * If virtualize x2apic mode is enabled,
	 * virtualize apic access must be disabled.
	 */
	if (CC(nested_cpu_has_virt_x2apic_mode(vmcs12) &&
	       nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)))
		return -EINVAL;

	/*
	 * If virtual interrupt delivery is enabled,
	 * we must exit on external interrupts.
	 */
	if (CC(nested_cpu_has_vid(vmcs12) && !nested_exit_on_intr(vcpu)))
		return -EINVAL;

	/*
	 * bits 15:8 should be zero in posted_intr_nv,
	 * the descriptor address has been already checked
	 * in nested_get_vmcs12_pages.
	 *
	 * bits 5:0 of posted_intr_desc_addr should be zero.
	 */
	if (nested_cpu_has_posted_intr(vmcs12) &&
	   (CC(!nested_cpu_has_vid(vmcs12)) ||
	    CC(!nested_exit_intr_ack_set(vcpu)) ||
	    CC((vmcs12->posted_intr_nv & 0xff00)) ||
	    CC(!kvm_vcpu_is_legal_aligned_gpa(vcpu, vmcs12->posted_intr_desc_addr, 64))))
		return -EINVAL;

	/* tpr shadow is needed by all apicv features. */
	if (CC(!nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
				       u32 count, u64 addr)
{
	if (count == 0)
		return 0;

	if (!kvm_vcpu_is_legal_aligned_gpa(vcpu, addr, 16) ||
	    !kvm_vcpu_is_legal_gpa(vcpu, (addr + count * sizeof(struct vmx_msr_entry) - 1)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_exit_msr_switch_controls(struct kvm_vcpu *vcpu,
						     struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_load_count,
					   vmcs12->vm_exit_msr_load_addr)) ||
	    CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_exit_msr_store_count,
					   vmcs12->vm_exit_msr_store_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_entry_msr_switch_controls(struct kvm_vcpu *vcpu,
						      struct vmcs12 *vmcs12)
{
	if (CC(nested_vmx_check_msr_switch(vcpu,
					   vmcs12->vm_entry_msr_load_count,
					   vmcs12->vm_entry_msr_load_addr)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
					 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_pml(vmcs12))
		return 0;

	if (CC(!nested_cpu_has_ept(vmcs12)) ||
	    CC(!page_address_valid(vcpu, vmcs12->pml_address)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_check_unrestricted_guest_controls(struct kvm_vcpu *vcpu,
							struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

static int nested_vmx_check_mode_based_ept_exec_controls(struct kvm_vcpu *vcpu,
							 struct vmcs12 *vmcs12)
{
	if (CC(nested_cpu_has2(vmcs12, SECONDARY_EXEC_MODE_BASED_EPT_EXEC) &&
	       !nested_cpu_has_ept(vmcs12)))
		return -EINVAL;
	return 0;
}

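/*
 * Note: the nested_vmx_check_*_controls() helpers above and below share a
 * common pattern: return 0 when the relevant control is not enabled by L1 or
 * the referenced addresses/values pass validation, and -EINVAL (with a CC()
 * tracepoint identifying the failed consistency check) otherwise.
 */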
static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
						 struct vmcs12 *vmcs12)
{
	if (!nested_cpu_has_shadow_vmcs(vmcs12))
		return 0;

	if (CC(!page_address_valid(vcpu, vmcs12->vmread_bitmap)) ||
	    CC(!page_address_valid(vcpu, vmcs12->vmwrite_bitmap)))
		return -EINVAL;

	return 0;
}

static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
				       struct vmx_msr_entry *e)
{
	/* x2APIC MSR accesses are not allowed */
	if (CC(vcpu->arch.apic_base & X2APIC_ENABLE && e->index >> 8 == 0x8))
		return -EINVAL;
	if (CC(e->index == MSR_IA32_UCODE_WRITE) || /* SDM Table 35-2 */
	    CC(e->index == MSR_IA32_UCODE_REV))
		return -EINVAL;
	if (CC(e->reserved != 0))
		return -EINVAL;
	return 0;
}

static int nested_vmx_load_msr_check(struct kvm_vcpu *vcpu,
				     struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_FS_BASE) ||
	    CC(e->index == MSR_GS_BASE) ||
	    CC(e->index == MSR_IA32_SMM_MONITOR_CTL) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static int nested_vmx_store_msr_check(struct kvm_vcpu *vcpu,
				      struct vmx_msr_entry *e)
{
	if (CC(e->index == MSR_IA32_SMBASE) || /* SMM is not supported */
	    nested_vmx_msr_check_common(vcpu, e))
		return -EINVAL;
	return 0;
}

static u32 nested_vmx_max_atomic_switch_msrs(struct kvm_vcpu *vcpu)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	u64 vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
				       vmx->nested.msrs.misc_high);

	return (vmx_misc_max_msr(vmx_misc) + 1) * VMX_MISC_MSR_LIST_MULTIPLIER;
}

/*
 * Load guest's/host's msr at nested entry/exit.
 * return 0 for success, entry index for failure.
 *
 * One of the failure modes for MSR load/store is when a list exceeds the
 * virtual hardware's capacity. To maintain compatibility with hardware inasmuch
 * as possible, process all valid entries before failing rather than precheck
 * for a capacity violation.
 */
static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			goto fail;

		if (kvm_vcpu_read_guest(vcpu, gpa + i * sizeof(e),
					&e, sizeof(e))) {
			pr_debug_ratelimited(
				"%s cannot read MSR entry (%u, 0x%08llx)\n",
				__func__, i, gpa + i * sizeof(e));
			goto fail;
		}
		if (nested_vmx_load_msr_check(vcpu, &e)) {
			pr_debug_ratelimited(
				"%s check failed (%u, 0x%x, 0x%x)\n",
				__func__, i, e.index, e.reserved);
			goto fail;
		}
		if (kvm_set_msr(vcpu, e.index, e.value)) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, e.value);
			goto fail;
		}
	}
	return 0;
fail:
	/* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
	return i + 1;
}

static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
					    u32 msr_index,
					    u64 *data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * If the L0 hypervisor stored a more accurate value for the TSC that
	 * does not include the time taken for emulation of the L2->L1
	 * VM-exit in L0, use the more accurate value.
	 */
	if (msr_index == MSR_IA32_TSC) {
		int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
						    MSR_IA32_TSC);

		if (i >= 0) {
			u64 val = vmx->msr_autostore.guest.val[i].value;

			*data = kvm_read_l1_tsc(vcpu, val);
			return true;
		}
	}

	if (kvm_get_msr(vcpu, msr_index, data)) {
		pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
				     msr_index);
		return false;
	}
	return true;
}

static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
				     struct vmx_msr_entry *e)
{
	if (kvm_vcpu_read_guest(vcpu,
				gpa + i * sizeof(*e),
				e, 2 * sizeof(u32))) {
		pr_debug_ratelimited(
			"%s cannot read MSR entry (%u, 0x%08llx)\n",
			__func__, i, gpa + i * sizeof(*e));
		return false;
	}
	if (nested_vmx_store_msr_check(vcpu, e)) {
		pr_debug_ratelimited(
			"%s check failed (%u, 0x%x, 0x%x)\n",
			__func__, i, e->index, e->reserved);
		return false;
	}
	return true;
}

static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
	u64 data;
	u32 i;
	struct vmx_msr_entry e;
	u32 max_msr_list_size = nested_vmx_max_atomic_switch_msrs(vcpu);

	for (i = 0; i < count; i++) {
		if (unlikely(i >= max_msr_list_size))
			return -EINVAL;

		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return -EINVAL;

		if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
			return -EINVAL;

		if (kvm_vcpu_write_guest(vcpu,
					 gpa + i * sizeof(e) +
					     offsetof(struct vmx_msr_entry, value),
					 &data, sizeof(data))) {
			pr_debug_ratelimited(
				"%s cannot write MSR (%u, 0x%x, 0x%llx)\n",
				__func__, i, e.index, data);
			return -EINVAL;
		}
	}
	return 0;
}

static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
	u32 count = vmcs12->vm_exit_msr_store_count;
	u64 gpa = vmcs12->vm_exit_msr_store_addr;
	struct vmx_msr_entry e;
	u32 i;

	for (i = 0; i < count; i++) {
		if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
			return false;

		if (e.index == msr_index)
			return true;
	}
	return false;
}

static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
					   u32 msr_index)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
	bool in_vmcs12_store_list;
	int msr_autostore_slot;
	bool in_autostore_list;
	int last;

	msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
	in_autostore_list = msr_autostore_slot >= 0;
	in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);

	if (in_vmcs12_store_list && !in_autostore_list) {
		if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
			/*
			 * Emulated VMEntry does not fail here.  Instead a less
			 * accurate value will be returned by
			 * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
			 * instead of reading the value from the vmcs02 VMExit
			 * MSR-store area.
			 */
			pr_warn_ratelimited(
				"Not enough msr entries in msr_autostore.  Can't add msr %x\n",
				msr_index);
			return;
		}
		last = autostore->nr++;
		autostore->val[last].index = msr_index;
	} else if (!in_vmcs12_store_list && in_autostore_list) {
		last = --autostore->nr;
		autostore->val[msr_autostore_slot] = autostore->val[last];
	}
}

/*
 * Load guest's/host's cr3 at nested entry/exit.  @nested_ept is true if we are
 * emulating VM-Entry into a guest with EPT enabled.  On failure, the expected
 * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
 * @entry_failure_code.
 */
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3,
			       bool nested_ept, bool reload_pdptrs,
			       enum vm_entry_failure_code *entry_failure_code)
{
	if (CC(kvm_vcpu_is_illegal_gpa(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/*
	 * If PAE paging and EPT are both on, CR3 is not used by the CPU and
	 * must not be dereferenced.
	 */
	if (reload_pdptrs && !nested_ept && is_pae_paging(vcpu) &&
	    CC(!load_pdptrs(vcpu, cr3))) {
		*entry_failure_code = ENTRY_FAIL_PDPTE;
		return -EINVAL;
	}

	vcpu->arch.cr3 = cr3;
	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);

	/* Re-initialize the MMU, e.g. to pick up CR4 MMU role changes. */
	kvm_init_mmu(vcpu);

	if (!nested_ept)
		kvm_mmu_new_pgd(vcpu, cr3);

	return 0;
}

/*
 * Returns if KVM is able to config CPU to tag TLB entries
 * populated by L2 differently than TLB entries populated
 * by L1.
 *
 * If L0 uses EPT, L1 and L2 run with different EPTP because
 * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
 * are tagged with different EPTP.
 *
 * If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
 * with different VPID (L1 entries are tagged with vmx->vpid
 * while L2 entries are tagged with vmx->nested.vpid02).
 */
static bool nested_has_guest_tlb_tag(struct kvm_vcpu *vcpu)
{
	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);

	return enable_ept ||
	       (nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}

static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
					    struct vmcs12 *vmcs12,
					    bool is_vmenter)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * KVM_REQ_HV_TLB_FLUSH flushes entries from either L1's VP_ID or
	 * L2's VP_ID upon request from the guest. Make sure we check for
	 * pending entries in the right FIFO upon L1/L2 transition as these
	 * requests are put by other vCPUs asynchronously.
	 */
	if (to_hv_vcpu(vcpu) && enable_ept)
		kvm_make_request(KVM_REQ_HV_TLB_FLUSH, vcpu);

	/*
	 * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
	 * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a
	 * full TLB flush from the guest's perspective.  This is required even
	 * if VPID is disabled in the host as KVM may need to synchronize the
	 * MMU in response to the guest TLB flush.
	 *
	 * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use.
	 * EPT is a special snowflake, as guest-physical mappings aren't
	 * flushed on VPID invalidations, including VM-Enter or VM-Exit with
	 * VPID disabled.  As a result, KVM _never_ needs to sync nEPT
	 * entries on VM-Enter because L1 can't rely on VM-Enter to flush
	 * those mappings.
	 */
	if (!nested_cpu_has_vpid(vmcs12)) {
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/* L2 should never have a VPID if VPID is disabled. */
	WARN_ON(!enable_vpid);

	/*
	 * VPID is enabled and in use by vmcs12.  If vpid12 is changing, then
	 * emulate a guest TLB flush as KVM does not track vpid12 history nor
	 * is the VPID incorporated into the MMU context.  I.e. KVM must assume
	 * that the new vpid12 has never been used and thus represents a new
	 * guest ASID that cannot have entries in the TLB.
	 */
	if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
		vmx->nested.last_vpid = vmcs12->virtual_processor_id;
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/*
	 * If VPID is enabled, used by vmc12, and vpid12 is not changing but
	 * does not have a unique TLB tag (ASID), i.e. EPT is disabled and
	 * KVM was unable to allocate a VPID for L2, flush the current context
	 * as the effective ASID is common to both L1 and L2.
	 */
	if (!nested_has_guest_tlb_tag(vcpu))
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}

static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
{
	superset &= mask;
	subset &= mask;

	return (superset | subset) == superset;
}

static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved =
		/* feature (except bit 48; see below) */
		BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
		/* reserved */
		BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
	u64 vmx_basic = vmcs_config.nested.basic;

	if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
		return -EINVAL;

	/*
	 * KVM does not emulate a version of VMX that constrains physical
	 * addresses of VMX structures (e.g. VMCS) to 32-bits.
	 */
	if (data & BIT_ULL(48))
		return -EINVAL;

	if (vmx_basic_vmcs_revision_id(vmx_basic) !=
	    vmx_basic_vmcs_revision_id(data))
		return -EINVAL;

	if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data))
		return -EINVAL;

	vmx->nested.msrs.basic = data;
	return 0;
}

static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
				u32 **low, u32 **high)
{
	switch (msr_index) {
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
		*low = &msrs->pinbased_ctls_low;
		*high = &msrs->pinbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
		*low = &msrs->procbased_ctls_low;
		*high = &msrs->procbased_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
		*low = &msrs->exit_ctls_low;
		*high = &msrs->exit_ctls_high;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
		*low = &msrs->entry_ctls_low;
		*high = &msrs->entry_ctls_high;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*low = &msrs->secondary_ctls_low;
		*high = &msrs->secondary_ctls_high;
		break;
	default:
		BUG();
	}
}

static int
vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	u32 *lowp, *highp;
	u64 supported;

	vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);

	supported = vmx_control_msr(*lowp, *highp);

	/* Check must-be-1 bits are still 1. */
	if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0)))
		return -EINVAL;

	/* Check must-be-0 bits are still 0. */
	if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
		return -EINVAL;

	vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
	*lowp = data;
	*highp = data >> 32;
	return 0;
}

static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data)
{
	const u64 feature_and_reserved_bits =
		/* feature */
		BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) |
		BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
		/* reserved */
		GENMASK_ULL(13, 9) | BIT_ULL(31);
	u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
				       vmcs_config.nested.misc_high);

	if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
		return -EINVAL;

	if ((vmx->nested.msrs.pinbased_ctls_high &
	     PIN_BASED_VMX_PREEMPTION_TIMER) &&
	    vmx_misc_preemption_timer_rate(data) !=
	    vmx_misc_preemption_timer_rate(vmx_misc))
		return -EINVAL;

	if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc))
		return -EINVAL;

	if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc))
		return -EINVAL;

	if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc))
		return -EINVAL;

	vmx->nested.msrs.misc_low = data;
	vmx->nested.msrs.misc_high = data >> 32;

	return 0;
}

static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
	u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps,
					       vmcs_config.nested.vpid_caps);

	/* Every bit is either reserved or a feature bit. */
	if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
		return -EINVAL;

	vmx->nested.msrs.ept_caps = data;
	vmx->nested.msrs.vpid_caps = data >> 32;
	return 0;
}

static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index)
{
	switch (msr_index) {
	case MSR_IA32_VMX_CR0_FIXED0:
		return &msrs->cr0_fixed0;
	case MSR_IA32_VMX_CR4_FIXED0:
		return &msrs->cr4_fixed0;
	default:
		BUG();
	}
}

static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
{
	const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index);

	/*
	 * 1 bits (which indicates bits which "must-be-1" during VMX operation)
	 * must be 1 in the restored value.
	 */
	if (!is_bitwise_subset(data, *msr, -1ULL))
		return -EINVAL;

	*vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data;
	return 0;
}

/*
 * Called when userspace is restoring VMX MSRs.
 *
 * Returns 0 on success, non-0 otherwise.
 */
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/*
	 * Don't allow changes to the VMX capability MSRs while the vCPU
	 * is in VMX operation.
	 */
	if (vmx->nested.vmxon)
		return -EBUSY;

	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		return vmx_restore_vmx_basic(vmx, data);
	case MSR_IA32_VMX_PINBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		/*
		 * The "non-true" VMX capability MSRs are generated from the
		 * "true" MSRs, so we do not support restoring them directly.
		 *
		 * If userspace wants to emulate VMX_BASIC[55]=0, userspace
		 * should restore the "true" MSRs with the must-be-1 bits
		 * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND
		 * DEFAULT SETTINGS".
		 */
		return -EINVAL;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		return vmx_restore_control_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_MISC:
		return vmx_restore_vmx_misc(vmx, data);
	case MSR_IA32_VMX_CR0_FIXED0:
	case MSR_IA32_VMX_CR4_FIXED0:
		return vmx_restore_fixed0_msr(vmx, msr_index, data);
	case MSR_IA32_VMX_CR0_FIXED1:
	case MSR_IA32_VMX_CR4_FIXED1:
		/*
		 * These MSRs are generated based on the vCPU's CPUID, so we
		 * do not support restoring them directly.
		 */
		return -EINVAL;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		return vmx_restore_vmx_ept_vpid_cap(vmx, data);
	case MSR_IA32_VMX_VMCS_ENUM:
		vmx->nested.msrs.vmcs_enum = data;
		return 0;
	case MSR_IA32_VMX_VMFUNC:
		if (data & ~vmcs_config.nested.vmfunc_controls)
			return -EINVAL;
		vmx->nested.msrs.vmfunc_controls = data;
		return 0;
	default:
		/*
		 * The rest of the VMX capability MSRs do not support restore.
		 */
		return -EINVAL;
	}
}

/* Returns 0 on success, non-0 otherwise. */
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata)
{
	switch (msr_index) {
	case MSR_IA32_VMX_BASIC:
		*pdata = msrs->basic;
		break;
	case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
	case MSR_IA32_VMX_PINBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->pinbased_ctls_low,
			msrs->pinbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PINBASED_CTLS)
			*pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
	case MSR_IA32_VMX_PROCBASED_CTLS:
		*pdata = vmx_control_msr(
			msrs->procbased_ctls_low,
			msrs->procbased_ctls_high);
		if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS)
			*pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_EXIT_CTLS:
	case MSR_IA32_VMX_EXIT_CTLS:
		*pdata = vmx_control_msr(
			msrs->exit_ctls_low,
			msrs->exit_ctls_high);
		if (msr_index == MSR_IA32_VMX_EXIT_CTLS)
			*pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
	case MSR_IA32_VMX_ENTRY_CTLS:
		*pdata = vmx_control_msr(
			msrs->entry_ctls_low,
			msrs->entry_ctls_high);
		if (msr_index == MSR_IA32_VMX_ENTRY_CTLS)
			*pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
		break;
	case MSR_IA32_VMX_MISC:
		*pdata = vmx_control_msr(
			msrs->misc_low,
			msrs->misc_high);
		break;
	case MSR_IA32_VMX_CR0_FIXED0:
		*pdata = msrs->cr0_fixed0;
		break;
	case MSR_IA32_VMX_CR0_FIXED1:
		*pdata = msrs->cr0_fixed1;
		break;
	case MSR_IA32_VMX_CR4_FIXED0:
		*pdata = msrs->cr4_fixed0;
		break;
	case MSR_IA32_VMX_CR4_FIXED1:
		*pdata = msrs->cr4_fixed1;
		break;
	case MSR_IA32_VMX_VMCS_ENUM:
		*pdata = msrs->vmcs_enum;
		break;
	case MSR_IA32_VMX_PROCBASED_CTLS2:
		*pdata = vmx_control_msr(
			msrs->secondary_ctls_low,
			msrs->secondary_ctls_high);
		break;
	case MSR_IA32_VMX_EPT_VPID_CAP:
		*pdata = msrs->ept_caps |
			((u64)msrs->vpid_caps << 32);
		break;
	case MSR_IA32_VMX_VMFUNC:
		*pdata = msrs->vmfunc_controls;
		break;
	default:
		return 1;
	}

	return 0;
}

/*
 * Copy the writable VMCS shadow fields back to the VMCS12, in case they have
 * been modified by the L1 guest.  Note, "writable" in this context means
 * "writable by the guest", i.e. tagged SHADOW_FIELD_RW; the set of
 * fields tagged SHADOW_FIELD_RO may or may not align with the "read-only"
 * VM-exit information fields (which are actually writable if the vCPU is
 * configured to support "VMWRITE to any supported field in the VMCS").
 */
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
{
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i;

	if (WARN_ON(!shadow_vmcs))
		return;

	preempt_disable();

	vmcs_load(shadow_vmcs);

	for (i = 0; i < max_shadow_read_write_fields; i++) {
		field = shadow_read_write_fields[i];
		val = __vmcs_readl(field.encoding);
		vmcs12_write_any(vmcs12, field.encoding, field.offset, val);
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);

	preempt_enable();
}

static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
{
	const struct shadow_vmcs_field *fields[] = {
		shadow_read_write_fields,
		shadow_read_only_fields
	};
	const int max_fields[] = {
		max_shadow_read_write_fields,
		max_shadow_read_only_fields
	};
	struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs;
	struct vmcs12 *vmcs12 = get_vmcs12(&vmx->vcpu);
	struct shadow_vmcs_field field;
	unsigned long val;
	int i, q;

	if (WARN_ON(!shadow_vmcs))
		return;

	vmcs_load(shadow_vmcs);

	for (q = 0; q < ARRAY_SIZE(fields); q++) {
		for (i = 0; i < max_fields[q]; i++) {
			field = fields[q][i];
			val = vmcs12_read_any(vmcs12, field.encoding,
					      field.offset);
			__vmcs_writel(field.encoding, val);
		}
	}

	vmcs_clear(shadow_vmcs);
	vmcs_load(vmx->loaded_vmcs->vmcs);
}

static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields)
{
	struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12;
	struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs;
	struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(&vmx->vcpu);

	/* HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE */
	vmcs12->tpr_threshold = evmcs->tpr_threshold;
	vmcs12->guest_rip = evmcs->guest_rip;

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_ENLIGHTENMENTSCONTROL))) {
		hv_vcpu->nested.pa_page_gpa = evmcs->partition_assist_page;
		hv_vcpu->nested.vm_id = evmcs->hv_vm_id;
		hv_vcpu->nested.vp_id = evmcs->hv_vp_id;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC))) {
		vmcs12->guest_rsp = evmcs->guest_rsp;
		vmcs12->guest_rflags = evmcs->guest_rflags;
		vmcs12->guest_interruptibility_info =
			evmcs->guest_interruptibility_info;
		/*
		 * Not present in struct vmcs12:
		 * vmcs12->guest_ssp = evmcs->guest_ssp;
		 */
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_PROC))) {
		vmcs12->cpu_based_vm_exec_control =
			evmcs->cpu_based_vm_exec_control;
	}

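	/*
	 * Each if-block above and below is guarded by a Hyper-V "clean field"
	 * bit: a set bit means L1 has not modified that group of fields since
	 * the last sync, so copying it into the cached vmcs12 can be skipped.
	 */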
	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EXCPN))) {
		vmcs12->exception_bitmap = evmcs->exception_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_ENTRY))) {
		vmcs12->vm_entry_controls = evmcs->vm_entry_controls;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_EVENT))) {
		vmcs12->vm_entry_intr_info_field =
			evmcs->vm_entry_intr_info_field;
		vmcs12->vm_entry_exception_error_code =
			evmcs->vm_entry_exception_error_code;
		vmcs12->vm_entry_instruction_len =
			evmcs->vm_entry_instruction_len;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1))) {
		vmcs12->host_ia32_pat = evmcs->host_ia32_pat;
		vmcs12->host_ia32_efer = evmcs->host_ia32_efer;
		vmcs12->host_cr0 = evmcs->host_cr0;
		vmcs12->host_cr3 = evmcs->host_cr3;
		vmcs12->host_cr4 = evmcs->host_cr4;
		vmcs12->host_ia32_sysenter_esp = evmcs->host_ia32_sysenter_esp;
		vmcs12->host_ia32_sysenter_eip = evmcs->host_ia32_sysenter_eip;
		vmcs12->host_rip = evmcs->host_rip;
		vmcs12->host_ia32_sysenter_cs = evmcs->host_ia32_sysenter_cs;
		vmcs12->host_es_selector = evmcs->host_es_selector;
		vmcs12->host_cs_selector = evmcs->host_cs_selector;
		vmcs12->host_ss_selector = evmcs->host_ss_selector;
		vmcs12->host_ds_selector = evmcs->host_ds_selector;
		vmcs12->host_fs_selector = evmcs->host_fs_selector;
		vmcs12->host_gs_selector = evmcs->host_gs_selector;
		vmcs12->host_tr_selector = evmcs->host_tr_selector;
		vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl;
		/*
		 * Not present in struct vmcs12:
		 * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet;
		 * vmcs12->host_ssp = evmcs->host_ssp;
		 * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr;
		 */
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP1))) {
		vmcs12->pin_based_vm_exec_control =
			evmcs->pin_based_vm_exec_control;
		vmcs12->vm_exit_controls = evmcs->vm_exit_controls;
		vmcs12->secondary_vm_exec_control =
			evmcs->secondary_vm_exec_control;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_IO_BITMAP))) {
		vmcs12->io_bitmap_a = evmcs->io_bitmap_a;
		vmcs12->io_bitmap_b = evmcs->io_bitmap_b;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP))) {
		vmcs12->msr_bitmap = evmcs->msr_bitmap;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2))) {
		vmcs12->guest_es_base = evmcs->guest_es_base;
		vmcs12->guest_cs_base = evmcs->guest_cs_base;
		vmcs12->guest_ss_base = evmcs->guest_ss_base;
		vmcs12->guest_ds_base = evmcs->guest_ds_base;
		vmcs12->guest_fs_base = evmcs->guest_fs_base;
		vmcs12->guest_gs_base = evmcs->guest_gs_base;
		vmcs12->guest_ldtr_base = evmcs->guest_ldtr_base;
		vmcs12->guest_tr_base = evmcs->guest_tr_base;
		vmcs12->guest_gdtr_base = evmcs->guest_gdtr_base;
		vmcs12->guest_idtr_base = evmcs->guest_idtr_base;
		vmcs12->guest_es_limit = evmcs->guest_es_limit;
		vmcs12->guest_cs_limit = evmcs->guest_cs_limit;
		vmcs12->guest_ss_limit = evmcs->guest_ss_limit;
		vmcs12->guest_ds_limit = evmcs->guest_ds_limit;
		vmcs12->guest_fs_limit = evmcs->guest_fs_limit;
		vmcs12->guest_gs_limit = evmcs->guest_gs_limit;
		vmcs12->guest_ldtr_limit = evmcs->guest_ldtr_limit;
		vmcs12->guest_tr_limit = evmcs->guest_tr_limit;
		vmcs12->guest_gdtr_limit = evmcs->guest_gdtr_limit;
		vmcs12->guest_idtr_limit = evmcs->guest_idtr_limit;
		vmcs12->guest_es_ar_bytes = evmcs->guest_es_ar_bytes;
		vmcs12->guest_cs_ar_bytes = evmcs->guest_cs_ar_bytes;
		vmcs12->guest_ss_ar_bytes = evmcs->guest_ss_ar_bytes;
		vmcs12->guest_ds_ar_bytes = evmcs->guest_ds_ar_bytes;
		vmcs12->guest_fs_ar_bytes = evmcs->guest_fs_ar_bytes;
		vmcs12->guest_gs_ar_bytes = evmcs->guest_gs_ar_bytes;
		vmcs12->guest_ldtr_ar_bytes = evmcs->guest_ldtr_ar_bytes;
		vmcs12->guest_tr_ar_bytes = evmcs->guest_tr_ar_bytes;
		vmcs12->guest_es_selector = evmcs->guest_es_selector;
		vmcs12->guest_cs_selector = evmcs->guest_cs_selector;
		vmcs12->guest_ss_selector = evmcs->guest_ss_selector;
		vmcs12->guest_ds_selector = evmcs->guest_ds_selector;
		vmcs12->guest_fs_selector = evmcs->guest_fs_selector;
		vmcs12->guest_gs_selector = evmcs->guest_gs_selector;
		vmcs12->guest_ldtr_selector = evmcs->guest_ldtr_selector;
		vmcs12->guest_tr_selector = evmcs->guest_tr_selector;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2))) {
		vmcs12->tsc_offset = evmcs->tsc_offset;
		vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
		vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
		vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap;
		vmcs12->tsc_multiplier = evmcs->tsc_multiplier;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CRDR))) {
		vmcs12->cr0_guest_host_mask = evmcs->cr0_guest_host_mask;
		vmcs12->cr4_guest_host_mask = evmcs->cr4_guest_host_mask;
		vmcs12->cr0_read_shadow = evmcs->cr0_read_shadow;
		vmcs12->cr4_read_shadow = evmcs->cr4_read_shadow;
		vmcs12->guest_cr0 = evmcs->guest_cr0;
		vmcs12->guest_cr3 = evmcs->guest_cr3;
		vmcs12->guest_cr4 = evmcs->guest_cr4;
		vmcs12->guest_dr7 = evmcs->guest_dr7;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_POINTER))) {
		vmcs12->host_fs_base = evmcs->host_fs_base;
		vmcs12->host_gs_base = evmcs->host_gs_base;
		vmcs12->host_tr_base = evmcs->host_tr_base;
		vmcs12->host_gdtr_base = evmcs->host_gdtr_base;
		vmcs12->host_idtr_base = evmcs->host_idtr_base;
		vmcs12->host_rsp = evmcs->host_rsp;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_XLAT))) {
		vmcs12->ept_pointer = evmcs->ept_pointer;
		vmcs12->virtual_processor_id = evmcs->virtual_processor_id;
	}

	if (unlikely(!(hv_clean_fields &
		       HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1))) {
		vmcs12->vmcs_link_pointer = evmcs->vmcs_link_pointer;
		vmcs12->guest_ia32_debugctl = evmcs->guest_ia32_debugctl;
		vmcs12->guest_ia32_pat = evmcs->guest_ia32_pat;
		vmcs12->guest_ia32_efer = evmcs->guest_ia32_efer;
		vmcs12->guest_pdptr0 = evmcs->guest_pdptr0;
		vmcs12->guest_pdptr1 = evmcs->guest_pdptr1;
		vmcs12->guest_pdptr2 = evmcs->guest_pdptr2;
		vmcs12->guest_pdptr3 = evmcs->guest_pdptr3;
		vmcs12->guest_pending_dbg_exceptions =
			evmcs->guest_pending_dbg_exceptions;
		vmcs12->guest_sysenter_esp = evmcs->guest_sysenter_esp;
		vmcs12->guest_sysenter_eip = evmcs->guest_sysenter_eip;
		vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
		vmcs12->guest_activity_state = evmcs->guest_activity_state;
		vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
evmcs->guest_sysenter_cs; 1776 vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl; 1777 /* 1778 * Not present in struct vmcs12: 1779 * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet; 1780 * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl; 1781 * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr; 1782 */ 1783 } 1784 1785 /* 1786 * Not used? 1787 * vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr; 1788 * vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr; 1789 * vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr; 1790 * vmcs12->page_fault_error_code_mask = 1791 * evmcs->page_fault_error_code_mask; 1792 * vmcs12->page_fault_error_code_match = 1793 * evmcs->page_fault_error_code_match; 1794 * vmcs12->cr3_target_count = evmcs->cr3_target_count; 1795 * vmcs12->vm_exit_msr_store_count = evmcs->vm_exit_msr_store_count; 1796 * vmcs12->vm_exit_msr_load_count = evmcs->vm_exit_msr_load_count; 1797 * vmcs12->vm_entry_msr_load_count = evmcs->vm_entry_msr_load_count; 1798 */ 1799 1800 /* 1801 * Read only fields: 1802 * vmcs12->guest_physical_address = evmcs->guest_physical_address; 1803 * vmcs12->vm_instruction_error = evmcs->vm_instruction_error; 1804 * vmcs12->vm_exit_reason = evmcs->vm_exit_reason; 1805 * vmcs12->vm_exit_intr_info = evmcs->vm_exit_intr_info; 1806 * vmcs12->vm_exit_intr_error_code = evmcs->vm_exit_intr_error_code; 1807 * vmcs12->idt_vectoring_info_field = evmcs->idt_vectoring_info_field; 1808 * vmcs12->idt_vectoring_error_code = evmcs->idt_vectoring_error_code; 1809 * vmcs12->vm_exit_instruction_len = evmcs->vm_exit_instruction_len; 1810 * vmcs12->vmx_instruction_info = evmcs->vmx_instruction_info; 1811 * vmcs12->exit_qualification = evmcs->exit_qualification; 1812 * vmcs12->guest_linear_address = evmcs->guest_linear_address; 1813 * 1814 * Not present in struct vmcs12: 1815 * vmcs12->exit_io_instruction_ecx = evmcs->exit_io_instruction_ecx; 1816 * vmcs12->exit_io_instruction_esi = evmcs->exit_io_instruction_esi; 1817 * vmcs12->exit_io_instruction_edi = evmcs->exit_io_instruction_edi; 1818 * vmcs12->exit_io_instruction_eip = evmcs->exit_io_instruction_eip; 1819 */ 1820 1821 return; 1822 } 1823 1824 static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) 1825 { 1826 struct vmcs12 *vmcs12 = vmx->nested.cached_vmcs12; 1827 struct hv_enlightened_vmcs *evmcs = vmx->nested.hv_evmcs; 1828 1829 /* 1830 * Should not be changed by KVM: 1831 * 1832 * evmcs->host_es_selector = vmcs12->host_es_selector; 1833 * evmcs->host_cs_selector = vmcs12->host_cs_selector; 1834 * evmcs->host_ss_selector = vmcs12->host_ss_selector; 1835 * evmcs->host_ds_selector = vmcs12->host_ds_selector; 1836 * evmcs->host_fs_selector = vmcs12->host_fs_selector; 1837 * evmcs->host_gs_selector = vmcs12->host_gs_selector; 1838 * evmcs->host_tr_selector = vmcs12->host_tr_selector; 1839 * evmcs->host_ia32_pat = vmcs12->host_ia32_pat; 1840 * evmcs->host_ia32_efer = vmcs12->host_ia32_efer; 1841 * evmcs->host_cr0 = vmcs12->host_cr0; 1842 * evmcs->host_cr3 = vmcs12->host_cr3; 1843 * evmcs->host_cr4 = vmcs12->host_cr4; 1844 * evmcs->host_ia32_sysenter_esp = vmcs12->host_ia32_sysenter_esp; 1845 * evmcs->host_ia32_sysenter_eip = vmcs12->host_ia32_sysenter_eip; 1846 * evmcs->host_rip = vmcs12->host_rip; 1847 * evmcs->host_ia32_sysenter_cs = vmcs12->host_ia32_sysenter_cs; 1848 * evmcs->host_fs_base = vmcs12->host_fs_base; 1849 * evmcs->host_gs_base = vmcs12->host_gs_base; 1850 * evmcs->host_tr_base = vmcs12->host_tr_base; 1851 * 
evmcs->host_gdtr_base = vmcs12->host_gdtr_base; 1852 * evmcs->host_idtr_base = vmcs12->host_idtr_base; 1853 * evmcs->host_rsp = vmcs12->host_rsp; 1854 * sync_vmcs02_to_vmcs12() doesn't read these: 1855 * evmcs->io_bitmap_a = vmcs12->io_bitmap_a; 1856 * evmcs->io_bitmap_b = vmcs12->io_bitmap_b; 1857 * evmcs->msr_bitmap = vmcs12->msr_bitmap; 1858 * evmcs->ept_pointer = vmcs12->ept_pointer; 1859 * evmcs->xss_exit_bitmap = vmcs12->xss_exit_bitmap; 1860 * evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr; 1861 * evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr; 1862 * evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr; 1863 * evmcs->tpr_threshold = vmcs12->tpr_threshold; 1864 * evmcs->virtual_processor_id = vmcs12->virtual_processor_id; 1865 * evmcs->exception_bitmap = vmcs12->exception_bitmap; 1866 * evmcs->vmcs_link_pointer = vmcs12->vmcs_link_pointer; 1867 * evmcs->pin_based_vm_exec_control = vmcs12->pin_based_vm_exec_control; 1868 * evmcs->vm_exit_controls = vmcs12->vm_exit_controls; 1869 * evmcs->secondary_vm_exec_control = vmcs12->secondary_vm_exec_control; 1870 * evmcs->page_fault_error_code_mask = 1871 * vmcs12->page_fault_error_code_mask; 1872 * evmcs->page_fault_error_code_match = 1873 * vmcs12->page_fault_error_code_match; 1874 * evmcs->cr3_target_count = vmcs12->cr3_target_count; 1875 * evmcs->virtual_apic_page_addr = vmcs12->virtual_apic_page_addr; 1876 * evmcs->tsc_offset = vmcs12->tsc_offset; 1877 * evmcs->guest_ia32_debugctl = vmcs12->guest_ia32_debugctl; 1878 * evmcs->cr0_guest_host_mask = vmcs12->cr0_guest_host_mask; 1879 * evmcs->cr4_guest_host_mask = vmcs12->cr4_guest_host_mask; 1880 * evmcs->cr0_read_shadow = vmcs12->cr0_read_shadow; 1881 * evmcs->cr4_read_shadow = vmcs12->cr4_read_shadow; 1882 * evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count; 1883 * evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count; 1884 * evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count; 1885 * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl; 1886 * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl; 1887 * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap; 1888 * evmcs->tsc_multiplier = vmcs12->tsc_multiplier; 1889 * 1890 * Not present in struct vmcs12: 1891 * evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx; 1892 * evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi; 1893 * evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi; 1894 * evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip; 1895 * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet; 1896 * evmcs->host_ssp = vmcs12->host_ssp; 1897 * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr; 1898 * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet; 1899 * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl; 1900 * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr; 1901 * evmcs->guest_ssp = vmcs12->guest_ssp; 1902 */ 1903 1904 evmcs->guest_es_selector = vmcs12->guest_es_selector; 1905 evmcs->guest_cs_selector = vmcs12->guest_cs_selector; 1906 evmcs->guest_ss_selector = vmcs12->guest_ss_selector; 1907 evmcs->guest_ds_selector = vmcs12->guest_ds_selector; 1908 evmcs->guest_fs_selector = vmcs12->guest_fs_selector; 1909 evmcs->guest_gs_selector = vmcs12->guest_gs_selector; 1910 evmcs->guest_ldtr_selector = vmcs12->guest_ldtr_selector; 1911 evmcs->guest_tr_selector = vmcs12->guest_tr_selector; 1912 1913 
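	/*
	 * The guest segment limits, AR bytes and bases that follow, like the
	 * selectors above, may have been refreshed by sync_vmcs02_to_vmcs12()
	 * on a nested VM-Exit and must be reflected back into L1's eVMCS.
	 */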
evmcs->guest_es_limit = vmcs12->guest_es_limit; 1914 evmcs->guest_cs_limit = vmcs12->guest_cs_limit; 1915 evmcs->guest_ss_limit = vmcs12->guest_ss_limit; 1916 evmcs->guest_ds_limit = vmcs12->guest_ds_limit; 1917 evmcs->guest_fs_limit = vmcs12->guest_fs_limit; 1918 evmcs->guest_gs_limit = vmcs12->guest_gs_limit; 1919 evmcs->guest_ldtr_limit = vmcs12->guest_ldtr_limit; 1920 evmcs->guest_tr_limit = vmcs12->guest_tr_limit; 1921 evmcs->guest_gdtr_limit = vmcs12->guest_gdtr_limit; 1922 evmcs->guest_idtr_limit = vmcs12->guest_idtr_limit; 1923 1924 evmcs->guest_es_ar_bytes = vmcs12->guest_es_ar_bytes; 1925 evmcs->guest_cs_ar_bytes = vmcs12->guest_cs_ar_bytes; 1926 evmcs->guest_ss_ar_bytes = vmcs12->guest_ss_ar_bytes; 1927 evmcs->guest_ds_ar_bytes = vmcs12->guest_ds_ar_bytes; 1928 evmcs->guest_fs_ar_bytes = vmcs12->guest_fs_ar_bytes; 1929 evmcs->guest_gs_ar_bytes = vmcs12->guest_gs_ar_bytes; 1930 evmcs->guest_ldtr_ar_bytes = vmcs12->guest_ldtr_ar_bytes; 1931 evmcs->guest_tr_ar_bytes = vmcs12->guest_tr_ar_bytes; 1932 1933 evmcs->guest_es_base = vmcs12->guest_es_base; 1934 evmcs->guest_cs_base = vmcs12->guest_cs_base; 1935 evmcs->guest_ss_base = vmcs12->guest_ss_base; 1936 evmcs->guest_ds_base = vmcs12->guest_ds_base; 1937 evmcs->guest_fs_base = vmcs12->guest_fs_base; 1938 evmcs->guest_gs_base = vmcs12->guest_gs_base; 1939 evmcs->guest_ldtr_base = vmcs12->guest_ldtr_base; 1940 evmcs->guest_tr_base = vmcs12->guest_tr_base; 1941 evmcs->guest_gdtr_base = vmcs12->guest_gdtr_base; 1942 evmcs->guest_idtr_base = vmcs12->guest_idtr_base; 1943 1944 evmcs->guest_ia32_pat = vmcs12->guest_ia32_pat; 1945 evmcs->guest_ia32_efer = vmcs12->guest_ia32_efer; 1946 1947 evmcs->guest_pdptr0 = vmcs12->guest_pdptr0; 1948 evmcs->guest_pdptr1 = vmcs12->guest_pdptr1; 1949 evmcs->guest_pdptr2 = vmcs12->guest_pdptr2; 1950 evmcs->guest_pdptr3 = vmcs12->guest_pdptr3; 1951 1952 evmcs->guest_pending_dbg_exceptions = 1953 vmcs12->guest_pending_dbg_exceptions; 1954 evmcs->guest_sysenter_esp = vmcs12->guest_sysenter_esp; 1955 evmcs->guest_sysenter_eip = vmcs12->guest_sysenter_eip; 1956 1957 evmcs->guest_activity_state = vmcs12->guest_activity_state; 1958 evmcs->guest_sysenter_cs = vmcs12->guest_sysenter_cs; 1959 1960 evmcs->guest_cr0 = vmcs12->guest_cr0; 1961 evmcs->guest_cr3 = vmcs12->guest_cr3; 1962 evmcs->guest_cr4 = vmcs12->guest_cr4; 1963 evmcs->guest_dr7 = vmcs12->guest_dr7; 1964 1965 evmcs->guest_physical_address = vmcs12->guest_physical_address; 1966 1967 evmcs->vm_instruction_error = vmcs12->vm_instruction_error; 1968 evmcs->vm_exit_reason = vmcs12->vm_exit_reason; 1969 evmcs->vm_exit_intr_info = vmcs12->vm_exit_intr_info; 1970 evmcs->vm_exit_intr_error_code = vmcs12->vm_exit_intr_error_code; 1971 evmcs->idt_vectoring_info_field = vmcs12->idt_vectoring_info_field; 1972 evmcs->idt_vectoring_error_code = vmcs12->idt_vectoring_error_code; 1973 evmcs->vm_exit_instruction_len = vmcs12->vm_exit_instruction_len; 1974 evmcs->vmx_instruction_info = vmcs12->vmx_instruction_info; 1975 1976 evmcs->exit_qualification = vmcs12->exit_qualification; 1977 1978 evmcs->guest_linear_address = vmcs12->guest_linear_address; 1979 evmcs->guest_rsp = vmcs12->guest_rsp; 1980 evmcs->guest_rflags = vmcs12->guest_rflags; 1981 1982 evmcs->guest_interruptibility_info = 1983 vmcs12->guest_interruptibility_info; 1984 evmcs->cpu_based_vm_exec_control = vmcs12->cpu_based_vm_exec_control; 1985 evmcs->vm_entry_controls = vmcs12->vm_entry_controls; 1986 evmcs->vm_entry_intr_info_field = vmcs12->vm_entry_intr_info_field; 1987 evmcs->vm_entry_exception_error_code = 
		vmcs12->vm_entry_exception_error_code;
	evmcs->vm_entry_instruction_len = vmcs12->vm_entry_instruction_len;

	evmcs->guest_rip = vmcs12->guest_rip;

	evmcs->guest_bndcfgs = vmcs12->guest_bndcfgs;

	return;
}

/*
 * This is an equivalent of the nested hypervisor executing the vmptrld
 * instruction.
 */
static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
	struct kvm_vcpu *vcpu, bool from_launch)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);
	bool evmcs_gpa_changed = false;
	u64 evmcs_gpa;

	if (likely(!guest_cpuid_has_evmcs(vcpu)))
		return EVMPTRLD_DISABLED;

	evmcs_gpa = nested_get_evmptr(vcpu);
	if (!evmptr_is_valid(evmcs_gpa)) {
		nested_release_evmcs(vcpu);
		return EVMPTRLD_DISABLED;
	}

	if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
		vmx->nested.current_vmptr = INVALID_GPA;

		nested_release_evmcs(vcpu);

		if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
				 &vmx->nested.hv_evmcs_map))
			return EVMPTRLD_ERROR;

		vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;

		/*
		 * Currently, KVM only supports eVMCS version 1
		 * (== KVM_EVMCS_VERSION) and thus we expect the guest to set
		 * this value in the first u32 field of the eVMCS, which should
		 * specify the eVMCS VersionNumber.
		 *
		 * The guest should learn the eVMCS versions supported by the
		 * host by examining CPUID.0x4000000A.EAX[0:15]. The host
		 * userspace VMM is expected to set this CPUID leaf according
		 * to the value returned in vmcs_version from
		 * nested_enable_evmcs().
		 *
		 * However, it turns out that Microsoft Hyper-V fails to comply
		 * with its own invented interface: when Hyper-V uses eVMCS, it
		 * just sets the first u32 field of the eVMCS to the revision_id
		 * specified in MSR_IA32_VMX_BASIC, instead of the eVMCS version
		 * number, which should be one of the supported versions
		 * specified in CPUID.0x4000000A.EAX[0:15].
		 *
		 * To work around this Hyper-V bug, accept either a supported
		 * eVMCS version or the VMCS12 revision_id as valid values for
		 * the first u32 field of the eVMCS.
		 */
		if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
		    (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
			nested_release_evmcs(vcpu);
			return EVMPTRLD_VMFAIL;
		}

		vmx->nested.hv_evmcs_vmptr = evmcs_gpa;

		evmcs_gpa_changed = true;
		/*
		 * Unlike a normal vmcs12, an enlightened vmcs12 is not fully
		 * reloaded from the guest's memory (read-only fields, fields
		 * not present in struct hv_enlightened_vmcs, ...). Make sure
		 * there are no leftovers.
		 */
		if (from_launch) {
			struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
			memset(vmcs12, 0, sizeof(*vmcs12));
			vmcs12->hdr.revision_id = VMCS12_REVISION;
		}

	}

	/*
	 * Clean fields data can't be used on VMLAUNCH or when we switch
	 * between different L2 guests, as KVM keeps a single VMCS12 per L1.
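	 * E.g. copy_enlightened_to_vmcs12() skips a field group only when its
	 * HV_VMX_ENLIGHTENED_CLEAN_FIELD_* bit is set, so clearing all of the
	 * bits below forces a full reload on the next sync.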
2077 */ 2078 if (from_launch || evmcs_gpa_changed) { 2079 vmx->nested.hv_evmcs->hv_clean_fields &= 2080 ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; 2081 2082 vmx->nested.force_msr_bitmap_recalc = true; 2083 } 2084 2085 return EVMPTRLD_SUCCEEDED; 2086 } 2087 2088 void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) 2089 { 2090 struct vcpu_vmx *vmx = to_vmx(vcpu); 2091 2092 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 2093 copy_vmcs12_to_enlightened(vmx); 2094 else 2095 copy_vmcs12_to_shadow(vmx); 2096 2097 vmx->nested.need_vmcs12_to_shadow_sync = false; 2098 } 2099 2100 static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) 2101 { 2102 struct vcpu_vmx *vmx = 2103 container_of(timer, struct vcpu_vmx, nested.preemption_timer); 2104 2105 vmx->nested.preemption_timer_expired = true; 2106 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu); 2107 kvm_vcpu_kick(&vmx->vcpu); 2108 2109 return HRTIMER_NORESTART; 2110 } 2111 2112 static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) 2113 { 2114 struct vcpu_vmx *vmx = to_vmx(vcpu); 2115 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 2116 2117 u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> 2118 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2119 2120 if (!vmx->nested.has_preemption_timer_deadline) { 2121 vmx->nested.preemption_timer_deadline = 2122 vmcs12->vmx_preemption_timer_value + l1_scaled_tsc; 2123 vmx->nested.has_preemption_timer_deadline = true; 2124 } 2125 return vmx->nested.preemption_timer_deadline - l1_scaled_tsc; 2126 } 2127 2128 static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, 2129 u64 preemption_timeout) 2130 { 2131 struct vcpu_vmx *vmx = to_vmx(vcpu); 2132 2133 /* 2134 * A timer value of zero is architecturally guaranteed to cause 2135 * a VMExit prior to executing any instructions in the guest. 2136 */ 2137 if (preemption_timeout == 0) { 2138 vmx_preemption_timer_fn(&vmx->nested.preemption_timer); 2139 return; 2140 } 2141 2142 if (vcpu->arch.virtual_tsc_khz == 0) 2143 return; 2144 2145 preemption_timeout <<= VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 2146 preemption_timeout *= 1000000; 2147 do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz); 2148 hrtimer_start(&vmx->nested.preemption_timer, 2149 ktime_add_ns(ktime_get(), preemption_timeout), 2150 HRTIMER_MODE_ABS_PINNED); 2151 } 2152 2153 static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2154 { 2155 if (vmx->nested.nested_run_pending && 2156 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) 2157 return vmcs12->guest_ia32_efer; 2158 else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) 2159 return vmx->vcpu.arch.efer | (EFER_LMA | EFER_LME); 2160 else 2161 return vmx->vcpu.arch.efer & ~(EFER_LMA | EFER_LME); 2162 } 2163 2164 static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) 2165 { 2166 struct kvm *kvm = vmx->vcpu.kvm; 2167 2168 /* 2169 * If vmcs02 hasn't been initialized, set the constant vmcs02 state 2170 * according to L0's settings (vmcs12 is irrelevant here). Host 2171 * fields that come from L0 and are not constant, e.g. HOST_CR3, 2172 * will be set as needed prior to VMLAUNCH/VMRESUME. 2173 */ 2174 if (vmx->nested.vmcs02_initialized) 2175 return; 2176 vmx->nested.vmcs02_initialized = true; 2177 2178 /* 2179 * We don't care what the EPTP value is we just need to guarantee 2180 * it's valid so we don't get a false positive when doing early 2181 * consistency checks. 
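	 * construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL) below builds a
	 * well-formed 4-level EPTP rooted at physical address 0 purely to
	 * satisfy those checks; the real EPTP is installed later when the MMU
	 * root is loaded.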
2182 */ 2183 if (enable_ept && nested_early_check) 2184 vmcs_write64(EPT_POINTER, 2185 construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); 2186 2187 /* All VMFUNCs are currently emulated through L0 vmexits. */ 2188 if (cpu_has_vmx_vmfunc()) 2189 vmcs_write64(VM_FUNCTION_CONTROL, 0); 2190 2191 if (cpu_has_vmx_posted_intr()) 2192 vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); 2193 2194 if (cpu_has_vmx_msr_bitmap()) 2195 vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap)); 2196 2197 /* 2198 * PML is emulated for L2, but never enabled in hardware as the MMU 2199 * handles A/D emulation. Disabling PML for L2 also avoids having to 2200 * deal with filtering out L2 GPAs from the buffer. 2201 */ 2202 if (enable_pml) { 2203 vmcs_write64(PML_ADDRESS, 0); 2204 vmcs_write16(GUEST_PML_INDEX, -1); 2205 } 2206 2207 if (cpu_has_vmx_encls_vmexit()) 2208 vmcs_write64(ENCLS_EXITING_BITMAP, INVALID_GPA); 2209 2210 if (kvm_notify_vmexit_enabled(kvm)) 2211 vmcs_write32(NOTIFY_WINDOW, kvm->arch.notify_window); 2212 2213 /* 2214 * Set the MSR load/store lists to match L0's settings. Only the 2215 * addresses are constant (for vmcs02), the counts can change based 2216 * on L2's behavior, e.g. switching to/from long mode. 2217 */ 2218 vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val)); 2219 vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val)); 2220 vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val)); 2221 2222 vmx_set_constant_host_state(vmx); 2223 } 2224 2225 static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, 2226 struct vmcs12 *vmcs12) 2227 { 2228 prepare_vmcs02_constant_state(vmx); 2229 2230 vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); 2231 2232 if (enable_vpid) { 2233 if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) 2234 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); 2235 else 2236 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2237 } 2238 } 2239 2240 static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, 2241 struct vmcs12 *vmcs12) 2242 { 2243 u32 exec_control; 2244 u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); 2245 2246 if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 2247 prepare_vmcs02_early_rare(vmx, vmcs12); 2248 2249 /* 2250 * PIN CONTROLS 2251 */ 2252 exec_control = __pin_controls_get(vmcs01); 2253 exec_control |= (vmcs12->pin_based_vm_exec_control & 2254 ~PIN_BASED_VMX_PREEMPTION_TIMER); 2255 2256 /* Posted interrupts setting is only taken from vmcs12. */ 2257 vmx->nested.pi_pending = false; 2258 if (nested_cpu_has_posted_intr(vmcs12)) 2259 vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; 2260 else 2261 exec_control &= ~PIN_BASED_POSTED_INTR; 2262 pin_controls_set(vmx, exec_control); 2263 2264 /* 2265 * EXEC CONTROLS 2266 */ 2267 exec_control = __exec_controls_get(vmcs01); /* L0's desires */ 2268 exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; 2269 exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; 2270 exec_control &= ~CPU_BASED_TPR_SHADOW; 2271 exec_control |= vmcs12->cpu_based_vm_exec_control; 2272 2273 vmx->nested.l1_tpr_threshold = -1; 2274 if (exec_control & CPU_BASED_TPR_SHADOW) 2275 vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); 2276 #ifdef CONFIG_X86_64 2277 else 2278 exec_control |= CPU_BASED_CR8_LOAD_EXITING | 2279 CPU_BASED_CR8_STORE_EXITING; 2280 #endif 2281 2282 /* 2283 * A vmexit (to either L1 hypervisor or L0 userspace) is always needed 2284 * for I/O port accesses. 
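	 * Hence CPU_BASED_UNCOND_IO_EXITING is forced on and the hardware I/O
	 * bitmaps are left unused; whether an exit is reflected to L1 is
	 * decided at exit time based on vmcs12's I/O settings.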
2285 */ 2286 exec_control |= CPU_BASED_UNCOND_IO_EXITING; 2287 exec_control &= ~CPU_BASED_USE_IO_BITMAPS; 2288 2289 /* 2290 * This bit will be computed in nested_get_vmcs12_pages, because 2291 * we do not have access to L1's MSR bitmap yet. For now, keep 2292 * the same bit as before, hoping to avoid multiple VMWRITEs that 2293 * only set/clear this bit. 2294 */ 2295 exec_control &= ~CPU_BASED_USE_MSR_BITMAPS; 2296 exec_control |= exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS; 2297 2298 exec_controls_set(vmx, exec_control); 2299 2300 /* 2301 * SECONDARY EXEC CONTROLS 2302 */ 2303 if (cpu_has_secondary_exec_ctrls()) { 2304 exec_control = __secondary_exec_controls_get(vmcs01); 2305 2306 /* Take the following fields only from vmcs12 */ 2307 exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | 2308 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 2309 SECONDARY_EXEC_ENABLE_INVPCID | 2310 SECONDARY_EXEC_ENABLE_RDTSCP | 2311 SECONDARY_EXEC_ENABLE_XSAVES | 2312 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | 2313 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 2314 SECONDARY_EXEC_APIC_REGISTER_VIRT | 2315 SECONDARY_EXEC_ENABLE_VMFUNC | 2316 SECONDARY_EXEC_DESC); 2317 2318 if (nested_cpu_has(vmcs12, 2319 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) 2320 exec_control |= vmcs12->secondary_vm_exec_control; 2321 2322 /* PML is emulated and never enabled in hardware for L2. */ 2323 exec_control &= ~SECONDARY_EXEC_ENABLE_PML; 2324 2325 /* VMCS shadowing for L2 is emulated for now */ 2326 exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; 2327 2328 /* 2329 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() 2330 * will not have to rewrite the controls just for this bit. 2331 */ 2332 if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP)) 2333 exec_control |= SECONDARY_EXEC_DESC; 2334 2335 if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) 2336 vmcs_write16(GUEST_INTR_STATUS, 2337 vmcs12->guest_intr_status); 2338 2339 if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) 2340 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2341 2342 if (exec_control & SECONDARY_EXEC_ENCLS_EXITING) 2343 vmx_write_encls_bitmap(&vmx->vcpu, vmcs12); 2344 2345 secondary_exec_controls_set(vmx, exec_control); 2346 } 2347 2348 /* 2349 * ENTRY CONTROLS 2350 * 2351 * vmcs12's VM_{ENTRY,EXIT}_LOAD_IA32_EFER and VM_ENTRY_IA32E_MODE 2352 * are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate 2353 * on the related bits (if supported by the CPU) in the hope that 2354 * we can avoid VMWrites during vmx_set_efer(). 2355 * 2356 * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is 2357 * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to 2358 * do the same for L2. 2359 */ 2360 exec_control = __vm_entry_controls_get(vmcs01); 2361 exec_control |= (vmcs12->vm_entry_controls & 2362 ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); 2363 exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); 2364 if (cpu_has_load_ia32_efer()) { 2365 if (guest_efer & EFER_LMA) 2366 exec_control |= VM_ENTRY_IA32E_MODE; 2367 if (guest_efer != host_efer) 2368 exec_control |= VM_ENTRY_LOAD_IA32_EFER; 2369 } 2370 vm_entry_controls_set(vmx, exec_control); 2371 2372 /* 2373 * EXIT CONTROLS 2374 * 2375 * L2->L1 exit controls are emulated - the hardware exit is to L0 so 2376 * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER 2377 * bits may be modified by vmx_set_efer() in prepare_vmcs02(). 
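	 * In practice vmcs02's exit controls match vmcs01's, with
	 * VM_EXIT_LOAD_IA32_EFER toggled below based on whether guest_efer
	 * differs from host_efer.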
2378 */ 2379 exec_control = __vm_exit_controls_get(vmcs01); 2380 if (cpu_has_load_ia32_efer() && guest_efer != host_efer) 2381 exec_control |= VM_EXIT_LOAD_IA32_EFER; 2382 else 2383 exec_control &= ~VM_EXIT_LOAD_IA32_EFER; 2384 vm_exit_controls_set(vmx, exec_control); 2385 2386 /* 2387 * Interrupt/Exception Fields 2388 */ 2389 if (vmx->nested.nested_run_pending) { 2390 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2391 vmcs12->vm_entry_intr_info_field); 2392 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 2393 vmcs12->vm_entry_exception_error_code); 2394 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 2395 vmcs12->vm_entry_instruction_len); 2396 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 2397 vmcs12->guest_interruptibility_info); 2398 vmx->loaded_vmcs->nmi_known_unmasked = 2399 !(vmcs12->guest_interruptibility_info & GUEST_INTR_STATE_NMI); 2400 } else { 2401 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); 2402 } 2403 } 2404 2405 static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) 2406 { 2407 struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; 2408 2409 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2410 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP2)) { 2411 vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); 2412 vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); 2413 vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); 2414 vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); 2415 vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); 2416 vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector); 2417 vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); 2418 vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); 2419 vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); 2420 vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); 2421 vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); 2422 vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); 2423 vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); 2424 vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit); 2425 vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit); 2426 vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit); 2427 vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); 2428 vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); 2429 vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); 2430 vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); 2431 vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); 2432 vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); 2433 vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); 2434 vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); 2435 vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); 2436 vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); 2437 vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); 2438 vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); 2439 vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); 2440 vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); 2441 vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); 2442 vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base); 2443 vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base); 2444 vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); 2445 vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); 2446 vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); 2447 2448 vmx->segment_cache.bitmask = 0; 2449 } 2450 2451 if (!hv_evmcs || !(hv_evmcs->hv_clean_fields & 2452 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1)) { 
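		/*
		 * SYSENTER state, pending debug exceptions, PDPTRs and
		 * BNDCFGS all belong to the GUEST_GRP1 clean-field group, so
		 * they only need to be rewritten when L1 marked that group
		 * dirty (or when there is no eVMCS at all).
		 */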
2453 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); 2454 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 2455 vmcs12->guest_pending_dbg_exceptions); 2456 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); 2457 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); 2458 2459 /* 2460 * L1 may access the L2's PDPTR, so save them to construct 2461 * vmcs12 2462 */ 2463 if (enable_ept) { 2464 vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); 2465 vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); 2466 vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); 2467 vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); 2468 } 2469 2470 if (kvm_mpx_supported() && vmx->nested.nested_run_pending && 2471 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)) 2472 vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); 2473 } 2474 2475 if (nested_cpu_has_xsaves(vmcs12)) 2476 vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); 2477 2478 /* 2479 * Whether page-faults are trapped is determined by a combination of 2480 * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0 2481 * doesn't care about page faults then we should set all of these to 2482 * L1's desires. However, if L0 does care about (some) page faults, it 2483 * is not easy (if at all possible?) to merge L0 and L1's desires, we 2484 * simply ask to exit on each and every L2 page fault. This is done by 2485 * setting MASK=MATCH=0 and (see below) EB.PF=1. 2486 * Note that below we don't need special code to set EB.PF beyond the 2487 * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, 2488 * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when 2489 * !enable_ept, EB.PF is 1, so the "or" will always be 1. 2490 */ 2491 if (vmx_need_pf_intercept(&vmx->vcpu)) { 2492 /* 2493 * TODO: if both L0 and L1 need the same MASK and MATCH, 2494 * go ahead and use it? 2495 */ 2496 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); 2497 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); 2498 } else { 2499 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask); 2500 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match); 2501 } 2502 2503 if (cpu_has_vmx_apicv()) { 2504 vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); 2505 vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); 2506 vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); 2507 vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); 2508 } 2509 2510 /* 2511 * Make sure the msr_autostore list is up to date before we set the 2512 * count in the vmcs02. 2513 */ 2514 prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC); 2515 2516 vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr); 2517 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 2518 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 2519 2520 set_cr4_guest_host_mask(vmx); 2521 } 2522 2523 /* 2524 * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested 2525 * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it 2526 * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 2527 * guest in a way that will both be appropriate to L1's requests, and our 2528 * needs. In addition to modifying the active vmcs (which is vmcs02), this 2529 * function also has additional necessary side-effects, like setting various 2530 * vcpu->arch fields. 2531 * Returns 0 on success, 1 on failure. 
Invalid state exit qualification code 2532 * is assigned to entry_failure_code on failure. 2533 */ 2534 static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 2535 bool from_vmentry, 2536 enum vm_entry_failure_code *entry_failure_code) 2537 { 2538 struct vcpu_vmx *vmx = to_vmx(vcpu); 2539 bool load_guest_pdptrs_vmcs12 = false; 2540 2541 if (vmx->nested.dirty_vmcs12 || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { 2542 prepare_vmcs02_rare(vmx, vmcs12); 2543 vmx->nested.dirty_vmcs12 = false; 2544 2545 load_guest_pdptrs_vmcs12 = !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) || 2546 !(vmx->nested.hv_evmcs->hv_clean_fields & 2547 HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1); 2548 } 2549 2550 if (vmx->nested.nested_run_pending && 2551 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { 2552 kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); 2553 vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); 2554 } else { 2555 kvm_set_dr(vcpu, 7, vcpu->arch.dr7); 2556 vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.pre_vmenter_debugctl); 2557 } 2558 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || 2559 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 2560 vmcs_write64(GUEST_BNDCFGS, vmx->nested.pre_vmenter_bndcfgs); 2561 vmx_set_rflags(vcpu, vmcs12->guest_rflags); 2562 2563 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the 2564 * bitwise-or of what L1 wants to trap for L2, and what we want to 2565 * trap. Note that CR0.TS also needs updating - we do this later. 2566 */ 2567 vmx_update_exception_bitmap(vcpu); 2568 vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask; 2569 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); 2570 2571 if (vmx->nested.nested_run_pending && 2572 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) { 2573 vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat); 2574 vcpu->arch.pat = vmcs12->guest_ia32_pat; 2575 } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 2576 vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); 2577 } 2578 2579 vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( 2580 vcpu->arch.l1_tsc_offset, 2581 vmx_get_l2_tsc_offset(vcpu), 2582 vmx_get_l2_tsc_multiplier(vcpu)); 2583 2584 vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( 2585 vcpu->arch.l1_tsc_scaling_ratio, 2586 vmx_get_l2_tsc_multiplier(vcpu)); 2587 2588 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 2589 if (kvm_caps.has_tsc_control) 2590 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 2591 2592 nested_vmx_transition_tlb_flush(vcpu, vmcs12, true); 2593 2594 if (nested_cpu_has_ept(vmcs12)) 2595 nested_ept_init_mmu_context(vcpu); 2596 2597 /* 2598 * Override the CR0/CR4 read shadows after setting the effective guest 2599 * CR0/CR4. The common helpers also set the shadows, but they don't 2600 * account for vmcs12's cr0/4_guest_host_mask. 2601 */ 2602 vmx_set_cr0(vcpu, vmcs12->guest_cr0); 2603 vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12)); 2604 2605 vmx_set_cr4(vcpu, vmcs12->guest_cr4); 2606 vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); 2607 2608 vcpu->arch.efer = nested_vmx_calc_efer(vmx, vmcs12); 2609 /* Note: may modify VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ 2610 vmx_set_efer(vcpu, vcpu->arch.efer); 2611 2612 /* 2613 * Guest state is invalid and unrestricted guest is disabled, 2614 * which means L1 attempted VMEntry to L2 with invalid state. 2615 * Fail the VMEntry. 
	 *
	 * However, when force loading the guest state (SMM exit or
	 * loading nested state after migration), it is possible to
	 * have invalid guest state now, which will be fixed later by
	 * restoring L2 register state.
	 */
	if (CC(from_vmentry && !vmx_guest_state_valid(vcpu))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	/* Load vmcs12->guest_cr3, consumed by either nested EPT or shadow paging. */
	if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12),
				from_vmentry, entry_failure_code))
		return -EINVAL;

	/*
	 * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12
	 * on nested VM-Exit, which can occur without actually running L2 and
	 * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
	 * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
	 * transition to HLT instead of running L2.
	 */
	if (enable_ept)
		vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);

	/* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
	if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
	    is_pae_paging(vcpu)) {
		vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0);
		vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1);
		vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2);
		vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
	}

	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
	    kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) &&
	    WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
				     vmcs12->guest_ia32_perf_global_ctrl))) {
		*entry_failure_code = ENTRY_FAIL_DEFAULT;
		return -EINVAL;
	}

	kvm_rsp_write(vcpu, vmcs12->guest_rsp);
	kvm_rip_write(vcpu, vmcs12->guest_rip);

	/*
	 * It was observed that genuine Hyper-V running in L1 doesn't reset
	 * 'hv_clean_fields' by itself; it only sets the corresponding dirty
	 * bits when it changes a field in eVMCS. Mark all fields as clean
	 * here.
	 */
	if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))
		vmx->nested.hv_evmcs->hv_clean_fields |=
			HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;

	return 0;
}

static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12)
{
	if (CC(!nested_cpu_has_nmi_exiting(vmcs12) &&
	       nested_cpu_has_virtual_nmis(vmcs12)))
		return -EINVAL;

	if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
	       nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
		return -EINVAL;

	return 0;
}

static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
{
	struct vcpu_vmx *vmx = to_vmx(vcpu);

	/* Check for memory type validity */
	switch (new_eptp & VMX_EPTP_MT_MASK) {
	case VMX_EPTP_MT_UC:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
			return false;
		break;
	case VMX_EPTP_MT_WB:
		if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_WB_BIT)))
			return false;
		break;
	default:
		return false;
	}

	/* Page-walk levels validity.
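	 * (EPTP bits 5:3 encode the page-walk length minus 1, so a value of 3
	 *  selects a 4-level walk and 4 selects a 5-level walk.)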
*/ 2707 switch (new_eptp & VMX_EPTP_PWL_MASK) { 2708 case VMX_EPTP_PWL_5: 2709 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) 2710 return false; 2711 break; 2712 case VMX_EPTP_PWL_4: 2713 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) 2714 return false; 2715 break; 2716 default: 2717 return false; 2718 } 2719 2720 /* Reserved bits should not be set */ 2721 if (CC(kvm_vcpu_is_illegal_gpa(vcpu, new_eptp) || ((new_eptp >> 7) & 0x1f))) 2722 return false; 2723 2724 /* AD, if set, should be supported */ 2725 if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { 2726 if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) 2727 return false; 2728 } 2729 2730 return true; 2731 } 2732 2733 /* 2734 * Checks related to VM-Execution Control Fields 2735 */ 2736 static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, 2737 struct vmcs12 *vmcs12) 2738 { 2739 struct vcpu_vmx *vmx = to_vmx(vcpu); 2740 2741 if (CC(!vmx_control_verify(vmcs12->pin_based_vm_exec_control, 2742 vmx->nested.msrs.pinbased_ctls_low, 2743 vmx->nested.msrs.pinbased_ctls_high)) || 2744 CC(!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, 2745 vmx->nested.msrs.procbased_ctls_low, 2746 vmx->nested.msrs.procbased_ctls_high))) 2747 return -EINVAL; 2748 2749 if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && 2750 CC(!vmx_control_verify(vmcs12->secondary_vm_exec_control, 2751 vmx->nested.msrs.secondary_ctls_low, 2752 vmx->nested.msrs.secondary_ctls_high))) 2753 return -EINVAL; 2754 2755 if (CC(vmcs12->cr3_target_count > nested_cpu_vmx_misc_cr3_count(vcpu)) || 2756 nested_vmx_check_io_bitmap_controls(vcpu, vmcs12) || 2757 nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12) || 2758 nested_vmx_check_tpr_shadow_controls(vcpu, vmcs12) || 2759 nested_vmx_check_apic_access_controls(vcpu, vmcs12) || 2760 nested_vmx_check_apicv_controls(vcpu, vmcs12) || 2761 nested_vmx_check_nmi_controls(vmcs12) || 2762 nested_vmx_check_pml_controls(vcpu, vmcs12) || 2763 nested_vmx_check_unrestricted_guest_controls(vcpu, vmcs12) || 2764 nested_vmx_check_mode_based_ept_exec_controls(vcpu, vmcs12) || 2765 nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12) || 2766 CC(nested_cpu_has_vpid(vmcs12) && !vmcs12->virtual_processor_id)) 2767 return -EINVAL; 2768 2769 if (!nested_cpu_has_preemption_timer(vmcs12) && 2770 nested_cpu_has_save_preemption_timer(vmcs12)) 2771 return -EINVAL; 2772 2773 if (nested_cpu_has_ept(vmcs12) && 2774 CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) 2775 return -EINVAL; 2776 2777 if (nested_cpu_has_vmfunc(vmcs12)) { 2778 if (CC(vmcs12->vm_function_control & 2779 ~vmx->nested.msrs.vmfunc_controls)) 2780 return -EINVAL; 2781 2782 if (nested_cpu_has_eptp_switching(vmcs12)) { 2783 if (CC(!nested_cpu_has_ept(vmcs12)) || 2784 CC(!page_address_valid(vcpu, vmcs12->eptp_list_address))) 2785 return -EINVAL; 2786 } 2787 } 2788 2789 return 0; 2790 } 2791 2792 /* 2793 * Checks related to VM-Exit Control Fields 2794 */ 2795 static int nested_check_vm_exit_controls(struct kvm_vcpu *vcpu, 2796 struct vmcs12 *vmcs12) 2797 { 2798 struct vcpu_vmx *vmx = to_vmx(vcpu); 2799 2800 if (CC(!vmx_control_verify(vmcs12->vm_exit_controls, 2801 vmx->nested.msrs.exit_ctls_low, 2802 vmx->nested.msrs.exit_ctls_high)) || 2803 CC(nested_vmx_check_exit_msr_switch_controls(vcpu, vmcs12))) 2804 return -EINVAL; 2805 2806 return 0; 2807 } 2808 2809 /* 2810 * Checks related to VM-Entry Control Fields 2811 */ 2812 static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu, 2813 struct vmcs12 *vmcs12) 2814 { 2815 struct 
vcpu_vmx *vmx = to_vmx(vcpu); 2816 2817 if (CC(!vmx_control_verify(vmcs12->vm_entry_controls, 2818 vmx->nested.msrs.entry_ctls_low, 2819 vmx->nested.msrs.entry_ctls_high))) 2820 return -EINVAL; 2821 2822 /* 2823 * From the Intel SDM, volume 3: 2824 * Fields relevant to VM-entry event injection must be set properly. 2825 * These fields are the VM-entry interruption-information field, the 2826 * VM-entry exception error code, and the VM-entry instruction length. 2827 */ 2828 if (vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) { 2829 u32 intr_info = vmcs12->vm_entry_intr_info_field; 2830 u8 vector = intr_info & INTR_INFO_VECTOR_MASK; 2831 u32 intr_type = intr_info & INTR_INFO_INTR_TYPE_MASK; 2832 bool has_error_code = intr_info & INTR_INFO_DELIVER_CODE_MASK; 2833 bool should_have_error_code; 2834 bool urg = nested_cpu_has2(vmcs12, 2835 SECONDARY_EXEC_UNRESTRICTED_GUEST); 2836 bool prot_mode = !urg || vmcs12->guest_cr0 & X86_CR0_PE; 2837 2838 /* VM-entry interruption-info field: interruption type */ 2839 if (CC(intr_type == INTR_TYPE_RESERVED) || 2840 CC(intr_type == INTR_TYPE_OTHER_EVENT && 2841 !nested_cpu_supports_monitor_trap_flag(vcpu))) 2842 return -EINVAL; 2843 2844 /* VM-entry interruption-info field: vector */ 2845 if (CC(intr_type == INTR_TYPE_NMI_INTR && vector != NMI_VECTOR) || 2846 CC(intr_type == INTR_TYPE_HARD_EXCEPTION && vector > 31) || 2847 CC(intr_type == INTR_TYPE_OTHER_EVENT && vector != 0)) 2848 return -EINVAL; 2849 2850 /* VM-entry interruption-info field: deliver error code */ 2851 should_have_error_code = 2852 intr_type == INTR_TYPE_HARD_EXCEPTION && prot_mode && 2853 x86_exception_has_error_code(vector); 2854 if (CC(has_error_code != should_have_error_code)) 2855 return -EINVAL; 2856 2857 /* VM-entry exception error code */ 2858 if (CC(has_error_code && 2859 vmcs12->vm_entry_exception_error_code & GENMASK(31, 16))) 2860 return -EINVAL; 2861 2862 /* VM-entry interruption-info field: reserved bits */ 2863 if (CC(intr_info & INTR_INFO_RESVD_BITS_MASK)) 2864 return -EINVAL; 2865 2866 /* VM-entry instruction length */ 2867 switch (intr_type) { 2868 case INTR_TYPE_SOFT_EXCEPTION: 2869 case INTR_TYPE_SOFT_INTR: 2870 case INTR_TYPE_PRIV_SW_EXCEPTION: 2871 if (CC(vmcs12->vm_entry_instruction_len > 15) || 2872 CC(vmcs12->vm_entry_instruction_len == 0 && 2873 CC(!nested_cpu_has_zero_length_injection(vcpu)))) 2874 return -EINVAL; 2875 } 2876 } 2877 2878 if (nested_vmx_check_entry_msr_switch_controls(vcpu, vmcs12)) 2879 return -EINVAL; 2880 2881 return 0; 2882 } 2883 2884 static int nested_vmx_check_controls(struct kvm_vcpu *vcpu, 2885 struct vmcs12 *vmcs12) 2886 { 2887 if (nested_check_vm_execution_controls(vcpu, vmcs12) || 2888 nested_check_vm_exit_controls(vcpu, vmcs12) || 2889 nested_check_vm_entry_controls(vcpu, vmcs12)) 2890 return -EINVAL; 2891 2892 if (guest_cpuid_has_evmcs(vcpu)) 2893 return nested_evmcs_check_controls(vmcs12); 2894 2895 return 0; 2896 } 2897 2898 static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu, 2899 struct vmcs12 *vmcs12) 2900 { 2901 #ifdef CONFIG_X86_64 2902 if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) != 2903 !!(vcpu->arch.efer & EFER_LMA))) 2904 return -EINVAL; 2905 #endif 2906 return 0; 2907 } 2908 2909 static int nested_vmx_check_host_state(struct kvm_vcpu *vcpu, 2910 struct vmcs12 *vmcs12) 2911 { 2912 bool ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE); 2913 2914 if (CC(!nested_host_cr0_valid(vcpu, vmcs12->host_cr0)) || 2915 CC(!nested_host_cr4_valid(vcpu, vmcs12->host_cr4)) 
|| 2916 CC(kvm_vcpu_is_illegal_gpa(vcpu, vmcs12->host_cr3))) 2917 return -EINVAL; 2918 2919 if (CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_esp, vcpu)) || 2920 CC(is_noncanonical_address(vmcs12->host_ia32_sysenter_eip, vcpu))) 2921 return -EINVAL; 2922 2923 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) && 2924 CC(!kvm_pat_valid(vmcs12->host_ia32_pat))) 2925 return -EINVAL; 2926 2927 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 2928 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 2929 vmcs12->host_ia32_perf_global_ctrl))) 2930 return -EINVAL; 2931 2932 if (ia32e) { 2933 if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE))) 2934 return -EINVAL; 2935 } else { 2936 if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) || 2937 CC(vmcs12->host_cr4 & X86_CR4_PCIDE) || 2938 CC((vmcs12->host_rip) >> 32)) 2939 return -EINVAL; 2940 } 2941 2942 if (CC(vmcs12->host_cs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2943 CC(vmcs12->host_ss_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2944 CC(vmcs12->host_ds_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2945 CC(vmcs12->host_es_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2946 CC(vmcs12->host_fs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2947 CC(vmcs12->host_gs_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2948 CC(vmcs12->host_tr_selector & (SEGMENT_RPL_MASK | SEGMENT_TI_MASK)) || 2949 CC(vmcs12->host_cs_selector == 0) || 2950 CC(vmcs12->host_tr_selector == 0) || 2951 CC(vmcs12->host_ss_selector == 0 && !ia32e)) 2952 return -EINVAL; 2953 2954 if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) || 2955 CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) || 2956 CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) || 2957 CC(is_noncanonical_address(vmcs12->host_idtr_base, vcpu)) || 2958 CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) || 2959 CC(is_noncanonical_address(vmcs12->host_rip, vcpu))) 2960 return -EINVAL; 2961 2962 /* 2963 * If the load IA32_EFER VM-exit control is 1, bits reserved in the 2964 * IA32_EFER MSR must be 0 in the field for that register. In addition, 2965 * the values of the LMA and LME bits in the field must each be that of 2966 * the host address-space size VM-exit control. 
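	 * E.g. a 64-bit L1 host (VM_EXIT_HOST_ADDR_SPACE_SIZE set) must
	 * provide a host_ia32_efer value with both LMA and LME set; the
	 * checks below enforce exactly that.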
2967 */ 2968 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) { 2969 if (CC(!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer)) || 2970 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA)) || 2971 CC(ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))) 2972 return -EINVAL; 2973 } 2974 2975 return 0; 2976 } 2977 2978 static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu, 2979 struct vmcs12 *vmcs12) 2980 { 2981 struct vcpu_vmx *vmx = to_vmx(vcpu); 2982 struct gfn_to_hva_cache *ghc = &vmx->nested.shadow_vmcs12_cache; 2983 struct vmcs_hdr hdr; 2984 2985 if (vmcs12->vmcs_link_pointer == INVALID_GPA) 2986 return 0; 2987 2988 if (CC(!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))) 2989 return -EINVAL; 2990 2991 if (ghc->gpa != vmcs12->vmcs_link_pointer && 2992 CC(kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, 2993 vmcs12->vmcs_link_pointer, VMCS12_SIZE))) 2994 return -EINVAL; 2995 2996 if (CC(kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 2997 offsetof(struct vmcs12, hdr), 2998 sizeof(hdr)))) 2999 return -EINVAL; 3000 3001 if (CC(hdr.revision_id != VMCS12_REVISION) || 3002 CC(hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))) 3003 return -EINVAL; 3004 3005 return 0; 3006 } 3007 3008 /* 3009 * Checks related to Guest Non-register State 3010 */ 3011 static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) 3012 { 3013 if (CC(vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && 3014 vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT && 3015 vmcs12->guest_activity_state != GUEST_ACTIVITY_WAIT_SIPI)) 3016 return -EINVAL; 3017 3018 return 0; 3019 } 3020 3021 static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, 3022 struct vmcs12 *vmcs12, 3023 enum vm_entry_failure_code *entry_failure_code) 3024 { 3025 bool ia32e = !!(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE); 3026 3027 *entry_failure_code = ENTRY_FAIL_DEFAULT; 3028 3029 if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || 3030 CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) 3031 return -EINVAL; 3032 3033 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) && 3034 CC(!kvm_dr7_valid(vmcs12->guest_dr7))) 3035 return -EINVAL; 3036 3037 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) && 3038 CC(!kvm_pat_valid(vmcs12->guest_ia32_pat))) 3039 return -EINVAL; 3040 3041 if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { 3042 *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; 3043 return -EINVAL; 3044 } 3045 3046 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && 3047 CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu), 3048 vmcs12->guest_ia32_perf_global_ctrl))) 3049 return -EINVAL; 3050 3051 if (CC((vmcs12->guest_cr0 & (X86_CR0_PG | X86_CR0_PE)) == X86_CR0_PG)) 3052 return -EINVAL; 3053 3054 if (CC(ia32e && !(vmcs12->guest_cr4 & X86_CR4_PAE)) || 3055 CC(ia32e && !(vmcs12->guest_cr0 & X86_CR0_PG))) 3056 return -EINVAL; 3057 3058 /* 3059 * If the load IA32_EFER VM-entry control is 1, the following checks 3060 * are performed on the field for the IA32_EFER MSR: 3061 * - Bits reserved in the IA32_EFER MSR must be 0. 3062 * - Bit 10 (corresponding to IA32_EFER.LMA) must equal the value of 3063 * the IA-32e mode guest VM-exit control. It must also be identical 3064 * to bit 8 (LME) if bit 31 in the CR0 field (corresponding to 3065 * CR0.PG) is 1. 
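	 * Note that KVM performs these checks only when the EFER load will
	 * actually be emulated, i.e. on a pending nested VM-Enter with the
	 * VM_ENTRY_LOAD_IA32_EFER control set.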
3066 */ 3067 if (to_vmx(vcpu)->nested.nested_run_pending && 3068 (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) { 3069 if (CC(!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer)) || 3070 CC(ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA)) || 3071 CC(((vmcs12->guest_cr0 & X86_CR0_PG) && 3072 ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))) 3073 return -EINVAL; 3074 } 3075 3076 if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && 3077 (CC(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu)) || 3078 CC((vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))) 3079 return -EINVAL; 3080 3081 if (nested_check_guest_non_reg_state(vmcs12)) 3082 return -EINVAL; 3083 3084 return 0; 3085 } 3086 3087 static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) 3088 { 3089 struct vcpu_vmx *vmx = to_vmx(vcpu); 3090 unsigned long cr3, cr4; 3091 bool vm_fail; 3092 3093 if (!nested_early_check) 3094 return 0; 3095 3096 if (vmx->msr_autoload.host.nr) 3097 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); 3098 if (vmx->msr_autoload.guest.nr) 3099 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); 3100 3101 preempt_disable(); 3102 3103 vmx_prepare_switch_to_guest(vcpu); 3104 3105 /* 3106 * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, 3107 * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to 3108 * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. 3109 * there is no need to preserve other bits or save/restore the field. 3110 */ 3111 vmcs_writel(GUEST_RFLAGS, 0); 3112 3113 cr3 = __get_current_cr3_fast(); 3114 if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) { 3115 vmcs_writel(HOST_CR3, cr3); 3116 vmx->loaded_vmcs->host_state.cr3 = cr3; 3117 } 3118 3119 cr4 = cr4_read_shadow(); 3120 if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) { 3121 vmcs_writel(HOST_CR4, cr4); 3122 vmx->loaded_vmcs->host_state.cr4 = cr4; 3123 } 3124 3125 vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, 3126 __vmx_vcpu_run_flags(vmx)); 3127 3128 if (vmx->msr_autoload.host.nr) 3129 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 3130 if (vmx->msr_autoload.guest.nr) 3131 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 3132 3133 if (vm_fail) { 3134 u32 error = vmcs_read32(VM_INSTRUCTION_ERROR); 3135 3136 preempt_enable(); 3137 3138 trace_kvm_nested_vmenter_failed( 3139 "early hardware check VM-instruction error: ", error); 3140 WARN_ON_ONCE(error != VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3141 return 1; 3142 } 3143 3144 /* 3145 * VMExit clears RFLAGS.IF and DR7, even on a consistency check. 3146 */ 3147 if (hw_breakpoint_active()) 3148 set_debugreg(__this_cpu_read(cpu_dr7), 7); 3149 local_irq_enable(); 3150 preempt_enable(); 3151 3152 /* 3153 * A non-failing VMEntry means we somehow entered guest mode with 3154 * an illegal RIP, and that's just the tip of the iceberg. There 3155 * is no telling what memory has been modified or what state has 3156 * been exposed to unknown code. Hitting this all but guarantees 3157 * a (very critical) hardware issue. 3158 */ 3159 WARN_ON(!(vmcs_read32(VM_EXIT_REASON) & 3160 VMX_EXIT_REASONS_FAILED_VMENTRY)); 3161 3162 return 0; 3163 } 3164 3165 static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu) 3166 { 3167 struct vcpu_vmx *vmx = to_vmx(vcpu); 3168 3169 /* 3170 * hv_evmcs may end up being not mapped after migration (when 3171 * L2 was running), map it here to make sure vmcs12 changes are 3172 * properly reflected. 
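	 * (The restore path leaves hv_evmcs_vmptr set to EVMPTR_MAP_PENDING,
	 * which is what the check below keys off of.)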
3173 */ 3174 if (guest_cpuid_has_evmcs(vcpu) && 3175 vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) { 3176 enum nested_evmptrld_status evmptrld_status = 3177 nested_vmx_handle_enlightened_vmptrld(vcpu, false); 3178 3179 if (evmptrld_status == EVMPTRLD_VMFAIL || 3180 evmptrld_status == EVMPTRLD_ERROR) 3181 return false; 3182 3183 /* 3184 * Post migration VMCS12 always provides the most actual 3185 * information, copy it to eVMCS upon entry. 3186 */ 3187 vmx->nested.need_vmcs12_to_shadow_sync = true; 3188 } 3189 3190 return true; 3191 } 3192 3193 static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) 3194 { 3195 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3196 struct vcpu_vmx *vmx = to_vmx(vcpu); 3197 struct kvm_host_map *map; 3198 3199 if (!vcpu->arch.pdptrs_from_userspace && 3200 !nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 3201 /* 3202 * Reload the guest's PDPTRs since after a migration 3203 * the guest CR3 might be restored prior to setting the nested 3204 * state which can lead to a load of wrong PDPTRs. 3205 */ 3206 if (CC(!load_pdptrs(vcpu, vcpu->arch.cr3))) 3207 return false; 3208 } 3209 3210 3211 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3212 map = &vmx->nested.apic_access_page_map; 3213 3214 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->apic_access_addr), map)) { 3215 vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(map->pfn)); 3216 } else { 3217 pr_debug_ratelimited("%s: no backing for APIC-access address in vmcs12\n", 3218 __func__); 3219 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3220 vcpu->run->internal.suberror = 3221 KVM_INTERNAL_ERROR_EMULATION; 3222 vcpu->run->internal.ndata = 0; 3223 return false; 3224 } 3225 } 3226 3227 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3228 map = &vmx->nested.virtual_apic_map; 3229 3230 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->virtual_apic_page_addr), map)) { 3231 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, pfn_to_hpa(map->pfn)); 3232 } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && 3233 nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && 3234 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { 3235 /* 3236 * The processor will never use the TPR shadow, simply 3237 * clear the bit from the execution control. Such a 3238 * configuration is useless, but it happens in tests. 3239 * For any other configuration, failing the vm entry is 3240 * _not_ what the processor does but it's basically the 3241 * only possibility we have. 3242 */ 3243 exec_controls_clearbit(vmx, CPU_BASED_TPR_SHADOW); 3244 } else { 3245 /* 3246 * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR to 3247 * force VM-Entry to fail. 3248 */ 3249 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, INVALID_GPA); 3250 } 3251 } 3252 3253 if (nested_cpu_has_posted_intr(vmcs12)) { 3254 map = &vmx->nested.pi_desc_map; 3255 3256 if (!kvm_vcpu_map(vcpu, gpa_to_gfn(vmcs12->posted_intr_desc_addr), map)) { 3257 vmx->nested.pi_desc = 3258 (struct pi_desc *)(((void *)map->hva) + 3259 offset_in_page(vmcs12->posted_intr_desc_addr)); 3260 vmcs_write64(POSTED_INTR_DESC_ADDR, 3261 pfn_to_hpa(map->pfn) + offset_in_page(vmcs12->posted_intr_desc_addr)); 3262 } else { 3263 /* 3264 * Defer the KVM_INTERNAL_EXIT until KVM tries to 3265 * access the contents of the VMCS12 posted interrupt 3266 * descriptor. (Note that KVM may do this when it 3267 * should not, per the architectural specification.) 
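			 * Clearing pi_desc and PIN_BASED_POSTED_INTR below
			 * implements that deferral; the failure is surfaced
			 * only if the descriptor is actually consulted.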
3268 */ 3269 vmx->nested.pi_desc = NULL; 3270 pin_controls_clearbit(vmx, PIN_BASED_POSTED_INTR); 3271 } 3272 } 3273 if (nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) 3274 exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3275 else 3276 exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS); 3277 3278 return true; 3279 } 3280 3281 static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu) 3282 { 3283 /* 3284 * Note: nested_get_evmcs_page() also updates 'vp_assist_page' copy 3285 * in 'struct kvm_vcpu_hv' in case eVMCS is in use, this is mandatory 3286 * to make nested_evmcs_l2_tlb_flush_enabled() work correctly post 3287 * migration. 3288 */ 3289 if (!nested_get_evmcs_page(vcpu)) { 3290 pr_debug_ratelimited("%s: enlightened vmptrld failed\n", 3291 __func__); 3292 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3293 vcpu->run->internal.suberror = 3294 KVM_INTERNAL_ERROR_EMULATION; 3295 vcpu->run->internal.ndata = 0; 3296 3297 return false; 3298 } 3299 3300 if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu)) 3301 return false; 3302 3303 return true; 3304 } 3305 3306 static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) 3307 { 3308 struct vmcs12 *vmcs12; 3309 struct vcpu_vmx *vmx = to_vmx(vcpu); 3310 gpa_t dst; 3311 3312 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) 3313 return 0; 3314 3315 if (WARN_ON_ONCE(vmx->nested.pml_full)) 3316 return 1; 3317 3318 /* 3319 * Check if PML is enabled for the nested guest. Whether eptp bit 6 is 3320 * set is already checked as part of A/D emulation. 3321 */ 3322 vmcs12 = get_vmcs12(vcpu); 3323 if (!nested_cpu_has_pml(vmcs12)) 3324 return 0; 3325 3326 if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { 3327 vmx->nested.pml_full = true; 3328 return 1; 3329 } 3330 3331 gpa &= ~0xFFFull; 3332 dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; 3333 3334 if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, 3335 offset_in_page(dst), sizeof(gpa))) 3336 return 0; 3337 3338 vmcs12->guest_pml_index--; 3339 3340 return 0; 3341 } 3342 3343 /* 3344 * Intel's VMX Instruction Reference specifies a common set of prerequisites 3345 * for running VMX instructions (except VMXON, whose prerequisites are 3346 * slightly different). It also specifies what exception to inject otherwise. 3347 * Note that many of these exceptions have priority over VM exits, so they 3348 * don't have to be checked again here. 3349 */ 3350 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) 3351 { 3352 if (!to_vmx(vcpu)->nested.vmxon) { 3353 kvm_queue_exception(vcpu, UD_VECTOR); 3354 return 0; 3355 } 3356 3357 if (vmx_get_cpl(vcpu)) { 3358 kvm_inject_gp(vcpu, 0); 3359 return 0; 3360 } 3361 3362 return 1; 3363 } 3364 3365 static u8 vmx_has_apicv_interrupt(struct kvm_vcpu *vcpu) 3366 { 3367 u8 rvi = vmx_get_rvi(); 3368 u8 vppr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_PROCPRI); 3369 3370 return ((rvi & 0xf0) > (vppr & 0xf0)); 3371 } 3372 3373 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 3374 struct vmcs12 *vmcs12); 3375 3376 /* 3377 * If from_vmentry is false, this is being called from state restore (either RSM 3378 * or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume. 
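 * In the state-restore case the vmcs12 pages are not mapped here; a
 * KVM_REQ_GET_NESTED_STATE_PAGES request is made instead so that they
 * are mapped before the next VM-Enter, and the early hardware checks,
 * the guest-state checks and the VM-Entry MSR loads are all skipped.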
3379 * 3380 * Returns: 3381 * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode 3382 * NVMX_VMENTRY_VMFAIL: Consistency check VMFail 3383 * NVMX_VMENTRY_VMEXIT: Consistency check VMExit 3384 * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error 3385 */ 3386 enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, 3387 bool from_vmentry) 3388 { 3389 struct vcpu_vmx *vmx = to_vmx(vcpu); 3390 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3391 enum vm_entry_failure_code entry_failure_code; 3392 bool evaluate_pending_interrupts; 3393 union vmx_exit_reason exit_reason = { 3394 .basic = EXIT_REASON_INVALID_STATE, 3395 .failed_vmentry = 1, 3396 }; 3397 u32 failed_index; 3398 3399 trace_kvm_nested_vmenter(kvm_rip_read(vcpu), 3400 vmx->nested.current_vmptr, 3401 vmcs12->guest_rip, 3402 vmcs12->guest_intr_status, 3403 vmcs12->vm_entry_intr_info_field, 3404 vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT, 3405 vmcs12->ept_pointer, 3406 vmcs12->guest_cr3, 3407 KVM_ISA_VMX); 3408 3409 kvm_service_local_tlb_flush_requests(vcpu); 3410 3411 evaluate_pending_interrupts = exec_controls_get(vmx) & 3412 (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); 3413 if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) 3414 evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu); 3415 if (!evaluate_pending_interrupts) 3416 evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu); 3417 3418 if (!vmx->nested.nested_run_pending || 3419 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) 3420 vmx->nested.pre_vmenter_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); 3421 if (kvm_mpx_supported() && 3422 (!vmx->nested.nested_run_pending || 3423 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) 3424 vmx->nested.pre_vmenter_bndcfgs = vmcs_read64(GUEST_BNDCFGS); 3425 3426 /* 3427 * Overwrite vmcs01.GUEST_CR3 with L1's CR3 if EPT is disabled *and* 3428 * nested early checks are disabled. In the event of a "late" VM-Fail, 3429 * i.e. a VM-Fail detected by hardware but not KVM, KVM must unwind its 3430 * software model to the pre-VMEntry host state. When EPT is disabled, 3431 * GUEST_CR3 holds KVM's shadow CR3, not L1's "real" CR3, which causes 3432 * nested_vmx_restore_host_state() to corrupt vcpu->arch.cr3. Stuffing 3433 * vmcs01.GUEST_CR3 results in the unwind naturally setting arch.cr3 to 3434 * the correct value. Smashing vmcs01.GUEST_CR3 is safe because nested 3435 * VM-Exits, and the unwind, reset KVM's MMU, i.e. vmcs01.GUEST_CR3 is 3436 * guaranteed to be overwritten with a shadow CR3 prior to re-entering 3437 * L1. Don't stuff vmcs01.GUEST_CR3 when using nested early checks as 3438 * KVM modifies vcpu->arch.cr3 if and only if the early hardware checks 3439 * pass, and early VM-Fails do not reset KVM's MMU, i.e. the VM-Fail 3440 * path would need to manually save/restore vmcs01.GUEST_CR3. 
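	 *
	 * In short: vmcs01.GUEST_CR3 is stuffed if and only if both EPT
	 * and the early consistency checks are disabled, which is exactly
	 * the condition checked below.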
3441 */ 3442 if (!enable_ept && !nested_early_check) 3443 vmcs_writel(GUEST_CR3, vcpu->arch.cr3); 3444 3445 vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); 3446 3447 prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); 3448 3449 if (from_vmentry) { 3450 if (unlikely(!nested_get_vmcs12_pages(vcpu))) { 3451 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3452 return NVMX_VMENTRY_KVM_INTERNAL_ERROR; 3453 } 3454 3455 if (nested_vmx_check_vmentry_hw(vcpu)) { 3456 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3457 return NVMX_VMENTRY_VMFAIL; 3458 } 3459 3460 if (nested_vmx_check_guest_state(vcpu, vmcs12, 3461 &entry_failure_code)) { 3462 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3463 vmcs12->exit_qualification = entry_failure_code; 3464 goto vmentry_fail_vmexit; 3465 } 3466 } 3467 3468 enter_guest_mode(vcpu); 3469 3470 if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &entry_failure_code)) { 3471 exit_reason.basic = EXIT_REASON_INVALID_STATE; 3472 vmcs12->exit_qualification = entry_failure_code; 3473 goto vmentry_fail_vmexit_guest_mode; 3474 } 3475 3476 if (from_vmentry) { 3477 failed_index = nested_vmx_load_msr(vcpu, 3478 vmcs12->vm_entry_msr_load_addr, 3479 vmcs12->vm_entry_msr_load_count); 3480 if (failed_index) { 3481 exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL; 3482 vmcs12->exit_qualification = failed_index; 3483 goto vmentry_fail_vmexit_guest_mode; 3484 } 3485 } else { 3486 /* 3487 * The MMU is not initialized to point at the right entities yet and 3488 * "get pages" would need to read data from the guest (i.e. we will 3489 * need to perform gpa to hpa translation). Request a call 3490 * to nested_get_vmcs12_pages before the next VM-entry. The MSRs 3491 * have already been set at vmentry time and should not be reset. 3492 */ 3493 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 3494 } 3495 3496 /* 3497 * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI 3498 * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can 3499 * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit 3500 * unconditionally. 3501 */ 3502 if (unlikely(evaluate_pending_interrupts)) 3503 kvm_make_request(KVM_REQ_EVENT, vcpu); 3504 3505 /* 3506 * Do not start the preemption timer hrtimer until after we know 3507 * we are successful, so that only nested_vmx_vmexit needs to cancel 3508 * the timer. 3509 */ 3510 vmx->nested.preemption_timer_expired = false; 3511 if (nested_cpu_has_preemption_timer(vmcs12)) { 3512 u64 timer_value = vmx_calc_preemption_timer_value(vcpu); 3513 vmx_start_preemption_timer(vcpu, timer_value); 3514 } 3515 3516 /* 3517 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point 3518 * we are no longer running L1, and VMLAUNCH/VMRESUME has not yet 3519 * returned as far as L1 is concerned. It will only return (and set 3520 * the success flag) when L2 exits (see nested_vmx_vmexit()). 3521 */ 3522 return NVMX_VMENTRY_SUCCESS; 3523 3524 /* 3525 * A failed consistency check that leads to a VMExit during L1's 3526 * VMEnter to L2 is a variation of a normal VMexit, as explained in 3527 * 26.7 "VM-entry failures during or after loading guest state". 
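	 *
	 * The two error labels below unwind only what has already been
	 * committed: vmentry_fail_vmexit_guest_mode additionally undoes
	 * the TSC offset and guest mode, while both paths switch back to
	 * vmcs01 and, for a real VM-Enter, synthesize the failed-entry
	 * VM-Exit into vmcs12.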
3528 */ 3529 vmentry_fail_vmexit_guest_mode: 3530 if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) 3531 vcpu->arch.tsc_offset -= vmcs12->tsc_offset; 3532 leave_guest_mode(vcpu); 3533 3534 vmentry_fail_vmexit: 3535 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 3536 3537 if (!from_vmentry) 3538 return NVMX_VMENTRY_VMEXIT; 3539 3540 load_vmcs12_host_state(vcpu, vmcs12); 3541 vmcs12->vm_exit_reason = exit_reason.full; 3542 if (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 3543 vmx->nested.need_vmcs12_to_shadow_sync = true; 3544 return NVMX_VMENTRY_VMEXIT; 3545 } 3546 3547 /* 3548 * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1 3549 * for running an L2 nested guest. 3550 */ 3551 static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) 3552 { 3553 struct vmcs12 *vmcs12; 3554 enum nvmx_vmentry_status status; 3555 struct vcpu_vmx *vmx = to_vmx(vcpu); 3556 u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); 3557 enum nested_evmptrld_status evmptrld_status; 3558 3559 if (!nested_vmx_check_permission(vcpu)) 3560 return 1; 3561 3562 evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); 3563 if (evmptrld_status == EVMPTRLD_ERROR) { 3564 kvm_queue_exception(vcpu, UD_VECTOR); 3565 return 1; 3566 } 3567 3568 kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_BRANCH_INSTRUCTIONS); 3569 3570 if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) 3571 return nested_vmx_failInvalid(vcpu); 3572 3573 if (CC(!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr) && 3574 vmx->nested.current_vmptr == INVALID_GPA)) 3575 return nested_vmx_failInvalid(vcpu); 3576 3577 vmcs12 = get_vmcs12(vcpu); 3578 3579 /* 3580 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact 3581 * that there *is* a valid VMCS pointer, RFLAGS.CF is set 3582 * rather than RFLAGS.ZF, and no error number is stored to the 3583 * VM-instruction error field. 3584 */ 3585 if (CC(vmcs12->hdr.shadow_vmcs)) 3586 return nested_vmx_failInvalid(vcpu); 3587 3588 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { 3589 copy_enlightened_to_vmcs12(vmx, vmx->nested.hv_evmcs->hv_clean_fields); 3590 /* Enlightened VMCS doesn't have launch state */ 3591 vmcs12->launch_state = !launch; 3592 } else if (enable_shadow_vmcs) { 3593 copy_shadow_to_vmcs12(vmx); 3594 } 3595 3596 /* 3597 * The nested entry process starts with enforcing various prerequisites 3598 * on vmcs12 as required by the Intel SDM, and act appropriately when 3599 * they fail: As the SDM explains, some conditions should cause the 3600 * instruction to fail, while others will cause the instruction to seem 3601 * to succeed, but return an EXIT_REASON_INVALID_STATE. 3602 * To speed up the normal (success) code path, we should avoid checking 3603 * for misconfigurations which will anyway be caught by the processor 3604 * when using the merged vmcs02. 3605 */ 3606 if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)) 3607 return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); 3608 3609 if (CC(vmcs12->launch_state == launch)) 3610 return nested_vmx_fail(vcpu, 3611 launch ? 
VMXERR_VMLAUNCH_NONCLEAR_VMCS 3612 : VMXERR_VMRESUME_NONLAUNCHED_VMCS); 3613 3614 if (nested_vmx_check_controls(vcpu, vmcs12)) 3615 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3616 3617 if (nested_vmx_check_address_space_size(vcpu, vmcs12)) 3618 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3619 3620 if (nested_vmx_check_host_state(vcpu, vmcs12)) 3621 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); 3622 3623 /* 3624 * We're finally done with prerequisite checking, and can start with 3625 * the nested entry. 3626 */ 3627 vmx->nested.nested_run_pending = 1; 3628 vmx->nested.has_preemption_timer_deadline = false; 3629 status = nested_vmx_enter_non_root_mode(vcpu, true); 3630 if (unlikely(status != NVMX_VMENTRY_SUCCESS)) 3631 goto vmentry_failed; 3632 3633 /* Emulate processing of posted interrupts on VM-Enter. */ 3634 if (nested_cpu_has_posted_intr(vmcs12) && 3635 kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) { 3636 vmx->nested.pi_pending = true; 3637 kvm_make_request(KVM_REQ_EVENT, vcpu); 3638 kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv); 3639 } 3640 3641 /* Hide L1D cache contents from the nested guest. */ 3642 vmx->vcpu.arch.l1tf_flush_l1d = true; 3643 3644 /* 3645 * Must happen outside of nested_vmx_enter_non_root_mode() as it will 3646 * also be used as part of restoring nVMX state for 3647 * snapshot restore (migration). 3648 * 3649 * In this flow, it is assumed that vmcs12 cache was 3650 * transferred as part of captured nVMX state and should 3651 * therefore not be read from guest memory (which may not 3652 * exist on destination host yet). 3653 */ 3654 nested_cache_shadow_vmcs12(vcpu, vmcs12); 3655 3656 switch (vmcs12->guest_activity_state) { 3657 case GUEST_ACTIVITY_HLT: 3658 /* 3659 * If we're entering a halted L2 vcpu and the L2 vcpu won't be 3660 * awakened by event injection or by an NMI-window VM-exit or 3661 * by an interrupt-window VM-exit, halt the vcpu. 3662 */ 3663 if (!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) && 3664 !nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING) && 3665 !(nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING) && 3666 (vmcs12->guest_rflags & X86_EFLAGS_IF))) { 3667 vmx->nested.nested_run_pending = 0; 3668 return kvm_emulate_halt_noskip(vcpu); 3669 } 3670 break; 3671 case GUEST_ACTIVITY_WAIT_SIPI: 3672 vmx->nested.nested_run_pending = 0; 3673 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 3674 break; 3675 default: 3676 break; 3677 } 3678 3679 return 1; 3680 3681 vmentry_failed: 3682 vmx->nested.nested_run_pending = 0; 3683 if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR) 3684 return 0; 3685 if (status == NVMX_VMENTRY_VMEXIT) 3686 return 1; 3687 WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); 3688 return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 3689 } 3690 3691 /* 3692 * On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date 3693 * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK). 3694 * This function returns the new value we should put in vmcs12.guest_cr0. 3695 * It's not enough to just return the vmcs02 GUEST_CR0. Rather, 3696 * 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now 3697 * available in vmcs02 GUEST_CR0. (Note: It's enough to check that L0 3698 * didn't trap the bit, because if L1 did, so would L0). 3699 * 2. Bits that L1 asked to trap (and therefore L0 also did) could not have 3700 * been modified by L2, and L1 knows it. 
So just leave the old value of 3701 * the bit from vmcs12.guest_cr0. Note that the bit from vmcs02 GUEST_CR0 3702 * isn't relevant, because if L0 traps this bit it can set it to anything. 3703 * 3. Bits that L1 didn't trap, but L0 did. L1 believes the guest could have 3704 * changed these bits, and therefore they need to be updated, but L0 3705 * didn't necessarily allow them to be changed in GUEST_CR0 - and rather 3706 * put them in vmcs02 CR0_READ_SHADOW. So take these bits from there. 3707 */ 3708 static inline unsigned long 3709 vmcs12_guest_cr0(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3710 { 3711 return 3712 /*1*/ (vmcs_readl(GUEST_CR0) & vcpu->arch.cr0_guest_owned_bits) | 3713 /*2*/ (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask) | 3714 /*3*/ (vmcs_readl(CR0_READ_SHADOW) & ~(vmcs12->cr0_guest_host_mask | 3715 vcpu->arch.cr0_guest_owned_bits)); 3716 } 3717 3718 static inline unsigned long 3719 vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 3720 { 3721 return 3722 /*1*/ (vmcs_readl(GUEST_CR4) & vcpu->arch.cr4_guest_owned_bits) | 3723 /*2*/ (vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask) | 3724 /*3*/ (vmcs_readl(CR4_READ_SHADOW) & ~(vmcs12->cr4_guest_host_mask | 3725 vcpu->arch.cr4_guest_owned_bits)); 3726 } 3727 3728 static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, 3729 struct vmcs12 *vmcs12, 3730 u32 vm_exit_reason, u32 exit_intr_info) 3731 { 3732 u32 idt_vectoring; 3733 unsigned int nr; 3734 3735 /* 3736 * Per the SDM, VM-Exits due to double and triple faults are never 3737 * considered to occur during event delivery, even if the double/triple 3738 * fault is the result of an escalating vectoring issue. 3739 * 3740 * Note, the SDM qualifies the double fault behavior with "The original 3741 * event results in a double-fault exception". It's unclear why the 3742 * qualification exists since exits due to double fault can occur only 3743 * while vectoring a different exception (injected events are never 3744 * subject to interception), i.e. there's _always_ an original event. 3745 * 3746 * The SDM also uses NMI as a confusing example for the "original event 3747 * causes the VM exit directly" clause. NMI isn't special in any way, 3748 * the same rule applies to all events that cause an exit directly. 3749 * NMI is an odd choice for the example because NMIs can only occur on 3750 * instruction boundaries, i.e. they _can't_ occur during vectoring. 
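	 *
	 * The upshot for the code below: triple faults and #DF VM-Exits
	 * report no original event (idt_vectoring_info_field = 0), while
	 * the other branches report whatever exception, NMI or interrupt
	 * was being injected, if any.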
3751 */ 3752 if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT || 3753 ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI && 3754 is_double_fault(exit_intr_info))) { 3755 vmcs12->idt_vectoring_info_field = 0; 3756 } else if (vcpu->arch.exception.injected) { 3757 nr = vcpu->arch.exception.vector; 3758 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3759 3760 if (kvm_exception_is_soft(nr)) { 3761 vmcs12->vm_exit_instruction_len = 3762 vcpu->arch.event_exit_inst_len; 3763 idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION; 3764 } else 3765 idt_vectoring |= INTR_TYPE_HARD_EXCEPTION; 3766 3767 if (vcpu->arch.exception.has_error_code) { 3768 idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK; 3769 vmcs12->idt_vectoring_error_code = 3770 vcpu->arch.exception.error_code; 3771 } 3772 3773 vmcs12->idt_vectoring_info_field = idt_vectoring; 3774 } else if (vcpu->arch.nmi_injected) { 3775 vmcs12->idt_vectoring_info_field = 3776 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR; 3777 } else if (vcpu->arch.interrupt.injected) { 3778 nr = vcpu->arch.interrupt.nr; 3779 idt_vectoring = nr | VECTORING_INFO_VALID_MASK; 3780 3781 if (vcpu->arch.interrupt.soft) { 3782 idt_vectoring |= INTR_TYPE_SOFT_INTR; 3783 vmcs12->vm_entry_instruction_len = 3784 vcpu->arch.event_exit_inst_len; 3785 } else 3786 idt_vectoring |= INTR_TYPE_EXT_INTR; 3787 3788 vmcs12->idt_vectoring_info_field = idt_vectoring; 3789 } else { 3790 vmcs12->idt_vectoring_info_field = 0; 3791 } 3792 } 3793 3794 3795 void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) 3796 { 3797 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3798 gfn_t gfn; 3799 3800 /* 3801 * Don't need to mark the APIC access page dirty; it is never 3802 * written to by the CPU during APIC virtualization. 3803 */ 3804 3805 if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) { 3806 gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT; 3807 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3808 } 3809 3810 if (nested_cpu_has_posted_intr(vmcs12)) { 3811 gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT; 3812 kvm_vcpu_mark_page_dirty(vcpu, gfn); 3813 } 3814 } 3815 3816 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) 3817 { 3818 struct vcpu_vmx *vmx = to_vmx(vcpu); 3819 int max_irr; 3820 void *vapic_page; 3821 u16 status; 3822 3823 if (!vmx->nested.pi_pending) 3824 return 0; 3825 3826 if (!vmx->nested.pi_desc) 3827 goto mmio_needed; 3828 3829 vmx->nested.pi_pending = false; 3830 3831 if (!pi_test_and_clear_on(vmx->nested.pi_desc)) 3832 return 0; 3833 3834 max_irr = pi_find_highest_vector(vmx->nested.pi_desc); 3835 if (max_irr > 0) { 3836 vapic_page = vmx->nested.virtual_apic_map.hva; 3837 if (!vapic_page) 3838 goto mmio_needed; 3839 3840 __kvm_apic_update_irr(vmx->nested.pi_desc->pir, 3841 vapic_page, &max_irr); 3842 status = vmcs_read16(GUEST_INTR_STATUS); 3843 if ((u8)max_irr > ((u8)status & 0xff)) { 3844 status &= ~0xff; 3845 status |= (u8)max_irr; 3846 vmcs_write16(GUEST_INTR_STATUS, status); 3847 } 3848 } 3849 3850 nested_mark_vmcs12_pages_dirty(vcpu); 3851 return 0; 3852 3853 mmio_needed: 3854 kvm_handle_memory_failure(vcpu, X86EMUL_IO_NEEDED, NULL); 3855 return -ENXIO; 3856 } 3857 3858 static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu) 3859 { 3860 struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit; 3861 u32 intr_info = ex->vector | INTR_INFO_VALID_MASK; 3862 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 3863 unsigned long exit_qual; 3864 3865 if (ex->has_payload) { 3866 exit_qual = ex->payload; 3867 } else if (ex->vector == PF_VECTOR) { 3868 
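		/*
		 * No payload was provided, so fall back to the architectural
		 * location of the fault information: CR2 for #PF here, DR6
		 * for #DB below.
		 */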
exit_qual = vcpu->arch.cr2; 3869 } else if (ex->vector == DB_VECTOR) { 3870 exit_qual = vcpu->arch.dr6; 3871 exit_qual &= ~DR6_BT; 3872 exit_qual ^= DR6_ACTIVE_LOW; 3873 } else { 3874 exit_qual = 0; 3875 } 3876 3877 /* 3878 * Unlike AMD's Paged Real Mode, which reports an error code on #PF 3879 * VM-Exits even if the CPU is in Real Mode, Intel VMX never sets the 3880 * "has error code" flags on VM-Exit if the CPU is in Real Mode. 3881 */ 3882 if (ex->has_error_code && is_protmode(vcpu)) { 3883 /* 3884 * Intel CPUs do not generate error codes with bits 31:16 set, 3885 * and more importantly VMX disallows setting bits 31:16 in the 3886 * injected error code for VM-Entry. Drop the bits to mimic 3887 * hardware and avoid inducing failure on nested VM-Entry if L1 3888 * chooses to inject the exception back to L2. AMD CPUs _do_ 3889 * generate "full" 32-bit error codes, so KVM allows userspace 3890 * to inject exception error codes with bits 31:16 set. 3891 */ 3892 vmcs12->vm_exit_intr_error_code = (u16)ex->error_code; 3893 intr_info |= INTR_INFO_DELIVER_CODE_MASK; 3894 } 3895 3896 if (kvm_exception_is_soft(ex->vector)) 3897 intr_info |= INTR_TYPE_SOFT_EXCEPTION; 3898 else 3899 intr_info |= INTR_TYPE_HARD_EXCEPTION; 3900 3901 if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) && 3902 vmx_get_nmi_mask(vcpu)) 3903 intr_info |= INTR_INFO_UNBLOCK_NMI; 3904 3905 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual); 3906 } 3907 3908 /* 3909 * Returns true if a debug trap is (likely) pending delivery. Infer the class 3910 * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6). 3911 * Using the payload is flawed because code breakpoints (fault-like) and data 3912 * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e. 3913 * this will return false positives if a to-be-injected code breakpoint #DB is 3914 * pending (from KVM's perspective, but not "pending" across an instruction 3915 * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it 3916 * too is trap-like. 3917 * 3918 * KVM "works" despite these flaws as ICEBP isn't currently supported by the 3919 * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the 3920 * #DB has already happened), and MTF isn't marked pending on code breakpoints 3921 * from the emulator (because such #DBs are fault-like and thus don't trigger 3922 * actions that fire on instruction retire). 3923 */ 3924 static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex) 3925 { 3926 if (!ex->pending || ex->vector != DB_VECTOR) 3927 return 0; 3928 3929 /* General Detect #DBs are always fault-like. */ 3930 return ex->payload & ~DR6_BD; 3931 } 3932 3933 /* 3934 * Returns true if there's a pending #DB exception that is lower priority than 3935 * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by 3936 * KVM, but could theoretically be injected by userspace. Note, this code is 3937 * imperfect, see above. 3938 */ 3939 static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex) 3940 { 3941 return vmx_get_pending_dbg_trap(ex) & ~DR6_BT; 3942 } 3943 3944 /* 3945 * Certain VM-exits set the 'pending debug exceptions' field to indicate a 3946 * recognized #DB (data or single-step) that has yet to be delivered. 
Since KVM 3947 * represents these debug traps with a payload that is said to be compatible 3948 * with the 'pending debug exceptions' field, write the payload to the VMCS 3949 * field if a VM-exit is delivered before the debug trap. 3950 */ 3951 static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) 3952 { 3953 unsigned long pending_dbg; 3954 3955 pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception); 3956 if (pending_dbg) 3957 vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg); 3958 } 3959 3960 static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) 3961 { 3962 return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && 3963 to_vmx(vcpu)->nested.preemption_timer_expired; 3964 } 3965 3966 static bool vmx_has_nested_events(struct kvm_vcpu *vcpu, bool for_injection) 3967 { 3968 struct vcpu_vmx *vmx = to_vmx(vcpu); 3969 void *vapic = vmx->nested.virtual_apic_map.hva; 3970 int max_irr, vppr; 3971 3972 if (nested_vmx_preemption_timer_pending(vcpu) || 3973 vmx->nested.mtf_pending) 3974 return true; 3975 3976 /* 3977 * Virtual Interrupt Delivery doesn't require manual injection. Either 3978 * the interrupt is already in GUEST_RVI and will be recognized by CPU 3979 * at VM-Entry, or there is a KVM_REQ_EVENT pending and KVM will move 3980 * the interrupt from the PIR to RVI prior to entering the guest. 3981 */ 3982 if (for_injection) 3983 return false; 3984 3985 if (!nested_cpu_has_vid(get_vmcs12(vcpu)) || 3986 __vmx_interrupt_blocked(vcpu)) 3987 return false; 3988 3989 if (!vapic) 3990 return false; 3991 3992 vppr = *((u32 *)(vapic + APIC_PROCPRI)); 3993 3994 if (vmx->nested.pi_pending && vmx->nested.pi_desc && 3995 pi_test_on(vmx->nested.pi_desc)) { 3996 max_irr = pi_find_highest_vector(vmx->nested.pi_desc); 3997 if (max_irr > 0 && (max_irr & 0xf0) > (vppr & 0xf0)) 3998 return true; 3999 } 4000 4001 return false; 4002 } 4003 4004 /* 4005 * Per the Intel SDM's table "Priority Among Concurrent Events", with minor 4006 * edits to fill in missing examples, e.g. #DB due to split-lock accesses, 4007 * and less minor edits to splice in the priority of VMX Non-Root specific 4008 * events, e.g. MTF and NMI/INTR-window exiting. 
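 *
 * For reference, footnote [1] below applies to 3.5 (MTF), [2] to 4.3
 * (VMX-preemption timer), [3] to 4.6 (NMI-window exiting) and [4] to
 * 5.5 (interrupt-window exiting / virtual-interrupt delivery).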
4009 * 4010 * 1 Hardware Reset and Machine Checks 4011 * - RESET 4012 * - Machine Check 4013 * 4014 * 2 Trap on Task Switch 4015 * - T flag in TSS is set (on task switch) 4016 * 4017 * 3 External Hardware Interventions 4018 * - FLUSH 4019 * - STOPCLK 4020 * - SMI 4021 * - INIT 4022 * 4023 * 3.5 Monitor Trap Flag (MTF) VM-exit[1] 4024 * 4025 * 4 Traps on Previous Instruction 4026 * - Breakpoints 4027 * - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O 4028 * breakpoint, or #DB due to a split-lock access) 4029 * 4030 * 4.3 VMX-preemption timer expired VM-exit 4031 * 4032 * 4.6 NMI-window exiting VM-exit[2] 4033 * 4034 * 5 Nonmaskable Interrupts (NMI) 4035 * 4036 * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery 4037 * 4038 * 6 Maskable Hardware Interrupts 4039 * 4040 * 7 Code Breakpoint Fault 4041 * 4042 * 8 Faults from Fetching Next Instruction 4043 * - Code-Segment Limit Violation 4044 * - Code Page Fault 4045 * - Control protection exception (missing ENDBRANCH at target of indirect 4046 * call or jump) 4047 * 4048 * 9 Faults from Decoding Next Instruction 4049 * - Instruction length > 15 bytes 4050 * - Invalid Opcode 4051 * - Coprocessor Not Available 4052 * 4053 *10 Faults on Executing Instruction 4054 * - Overflow 4055 * - Bound error 4056 * - Invalid TSS 4057 * - Segment Not Present 4058 * - Stack fault 4059 * - General Protection 4060 * - Data Page Fault 4061 * - Alignment Check 4062 * - x86 FPU Floating-point exception 4063 * - SIMD floating-point exception 4064 * - Virtualization exception 4065 * - Control protection exception 4066 * 4067 * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs), 4068 * INIT signals, and higher priority events take priority over MTF VM exits. 4069 * MTF VM exits take priority over debug-trap exceptions and lower priority 4070 * events. 4071 * 4072 * [2] Debug-trap exceptions and higher priority events take priority over VM exits 4073 * caused by the VMX-preemption timer. VM exits caused by the VMX-preemption 4074 * timer take priority over VM exits caused by the "NMI-window exiting" 4075 * VM-execution control and lower priority events. 4076 * 4077 * [3] Debug-trap exceptions and higher priority events take priority over VM exits 4078 * caused by "NMI-window exiting". VM exits caused by this control take 4079 * priority over non-maskable interrupts (NMIs) and lower priority events. 4080 * 4081 * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to 4082 * the 1-setting of the "interrupt-window exiting" VM-execution control. Thus, 4083 * non-maskable interrupts (NMIs) and higher priority events take priority over 4084 * delivery of a virtual interrupt; delivery of a virtual interrupt takes 4085 * priority over external interrupts and lower priority events. 4086 */ 4087 static int vmx_check_nested_events(struct kvm_vcpu *vcpu) 4088 { 4089 struct kvm_lapic *apic = vcpu->arch.apic; 4090 struct vcpu_vmx *vmx = to_vmx(vcpu); 4091 /* 4092 * Only a pending nested run blocks a pending exception. If there is a 4093 * previously injected event, the pending exception occurred while said 4094 * event was being delivered and thus needs to be handled. 4095 */ 4096 bool block_nested_exceptions = vmx->nested.nested_run_pending; 4097 /* 4098 * New events (not exceptions) are only recognized at instruction 4099 * boundaries. 
If an event needs reinjection, then KVM is handling a 4100 * VM-Exit that occurred _during_ instruction execution; new events are 4101 * blocked until the instruction completes. 4102 */ 4103 bool block_nested_events = block_nested_exceptions || 4104 kvm_event_needs_reinjection(vcpu); 4105 4106 if (lapic_in_kernel(vcpu) && 4107 test_bit(KVM_APIC_INIT, &apic->pending_events)) { 4108 if (block_nested_events) 4109 return -EBUSY; 4110 nested_vmx_update_pending_dbg(vcpu); 4111 clear_bit(KVM_APIC_INIT, &apic->pending_events); 4112 if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED) 4113 nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0); 4114 4115 /* MTF is discarded if the vCPU is in WFS. */ 4116 vmx->nested.mtf_pending = false; 4117 return 0; 4118 } 4119 4120 if (lapic_in_kernel(vcpu) && 4121 test_bit(KVM_APIC_SIPI, &apic->pending_events)) { 4122 if (block_nested_events) 4123 return -EBUSY; 4124 4125 clear_bit(KVM_APIC_SIPI, &apic->pending_events); 4126 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 4127 nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0, 4128 apic->sipi_vector & 0xFFUL); 4129 return 0; 4130 } 4131 /* Fallthrough, the SIPI is completely ignored. */ 4132 } 4133 4134 /* 4135 * Process exceptions that are higher priority than Monitor Trap Flag: 4136 * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but 4137 * could theoretically come in from userspace), and ICEBP (INT1). 4138 * 4139 * TODO: SMIs have higher priority than MTF and trap-like #DBs (except 4140 * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF 4141 * across SMI/RSM as it should; that needs to be addressed in order to 4142 * prioritize SMI over MTF and trap-like #DBs. 4143 */ 4144 if (vcpu->arch.exception_vmexit.pending && 4145 !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) { 4146 if (block_nested_exceptions) 4147 return -EBUSY; 4148 4149 nested_vmx_inject_exception_vmexit(vcpu); 4150 return 0; 4151 } 4152 4153 if (vcpu->arch.exception.pending && 4154 !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) { 4155 if (block_nested_exceptions) 4156 return -EBUSY; 4157 goto no_vmexit; 4158 } 4159 4160 if (vmx->nested.mtf_pending) { 4161 if (block_nested_events) 4162 return -EBUSY; 4163 nested_vmx_update_pending_dbg(vcpu); 4164 nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0); 4165 return 0; 4166 } 4167 4168 if (vcpu->arch.exception_vmexit.pending) { 4169 if (block_nested_exceptions) 4170 return -EBUSY; 4171 4172 nested_vmx_inject_exception_vmexit(vcpu); 4173 return 0; 4174 } 4175 4176 if (vcpu->arch.exception.pending) { 4177 if (block_nested_exceptions) 4178 return -EBUSY; 4179 goto no_vmexit; 4180 } 4181 4182 if (nested_vmx_preemption_timer_pending(vcpu)) { 4183 if (block_nested_events) 4184 return -EBUSY; 4185 nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); 4186 return 0; 4187 } 4188 4189 if (vcpu->arch.smi_pending && !is_smm(vcpu)) { 4190 if (block_nested_events) 4191 return -EBUSY; 4192 goto no_vmexit; 4193 } 4194 4195 if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) { 4196 if (block_nested_events) 4197 return -EBUSY; 4198 if (!nested_exit_on_nmi(vcpu)) 4199 goto no_vmexit; 4200 4201 nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, 4202 NMI_VECTOR | INTR_TYPE_NMI_INTR | 4203 INTR_INFO_VALID_MASK, 0); 4204 /* 4205 * The NMI-triggered VM exit counts as injection: 4206 * clear this one and block further NMIs. 
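		 *
		 * Note the exit itself is reported to L1 as
		 * EXIT_REASON_EXCEPTION_NMI with NMI vectoring info; VMX
		 * has no dedicated exit reason for NMIs.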
4207 */ 4208 vcpu->arch.nmi_pending = 0; 4209 vmx_set_nmi_mask(vcpu, true); 4210 return 0; 4211 } 4212 4213 if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) { 4214 if (block_nested_events) 4215 return -EBUSY; 4216 if (!nested_exit_on_intr(vcpu)) 4217 goto no_vmexit; 4218 nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); 4219 return 0; 4220 } 4221 4222 no_vmexit: 4223 return vmx_complete_nested_posted_interrupt(vcpu); 4224 } 4225 4226 static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) 4227 { 4228 ktime_t remaining = 4229 hrtimer_get_remaining(&to_vmx(vcpu)->nested.preemption_timer); 4230 u64 value; 4231 4232 if (ktime_to_ns(remaining) <= 0) 4233 return 0; 4234 4235 value = ktime_to_ns(remaining) * vcpu->arch.virtual_tsc_khz; 4236 do_div(value, 1000000); 4237 return value >> VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; 4238 } 4239 4240 static bool is_vmcs12_ext_field(unsigned long field) 4241 { 4242 switch (field) { 4243 case GUEST_ES_SELECTOR: 4244 case GUEST_CS_SELECTOR: 4245 case GUEST_SS_SELECTOR: 4246 case GUEST_DS_SELECTOR: 4247 case GUEST_FS_SELECTOR: 4248 case GUEST_GS_SELECTOR: 4249 case GUEST_LDTR_SELECTOR: 4250 case GUEST_TR_SELECTOR: 4251 case GUEST_ES_LIMIT: 4252 case GUEST_CS_LIMIT: 4253 case GUEST_SS_LIMIT: 4254 case GUEST_DS_LIMIT: 4255 case GUEST_FS_LIMIT: 4256 case GUEST_GS_LIMIT: 4257 case GUEST_LDTR_LIMIT: 4258 case GUEST_TR_LIMIT: 4259 case GUEST_GDTR_LIMIT: 4260 case GUEST_IDTR_LIMIT: 4261 case GUEST_ES_AR_BYTES: 4262 case GUEST_DS_AR_BYTES: 4263 case GUEST_FS_AR_BYTES: 4264 case GUEST_GS_AR_BYTES: 4265 case GUEST_LDTR_AR_BYTES: 4266 case GUEST_TR_AR_BYTES: 4267 case GUEST_ES_BASE: 4268 case GUEST_CS_BASE: 4269 case GUEST_SS_BASE: 4270 case GUEST_DS_BASE: 4271 case GUEST_FS_BASE: 4272 case GUEST_GS_BASE: 4273 case GUEST_LDTR_BASE: 4274 case GUEST_TR_BASE: 4275 case GUEST_GDTR_BASE: 4276 case GUEST_IDTR_BASE: 4277 case GUEST_PENDING_DBG_EXCEPTIONS: 4278 case GUEST_BNDCFGS: 4279 return true; 4280 default: 4281 break; 4282 } 4283 4284 return false; 4285 } 4286 4287 static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4288 struct vmcs12 *vmcs12) 4289 { 4290 struct vcpu_vmx *vmx = to_vmx(vcpu); 4291 4292 vmcs12->guest_es_selector = vmcs_read16(GUEST_ES_SELECTOR); 4293 vmcs12->guest_cs_selector = vmcs_read16(GUEST_CS_SELECTOR); 4294 vmcs12->guest_ss_selector = vmcs_read16(GUEST_SS_SELECTOR); 4295 vmcs12->guest_ds_selector = vmcs_read16(GUEST_DS_SELECTOR); 4296 vmcs12->guest_fs_selector = vmcs_read16(GUEST_FS_SELECTOR); 4297 vmcs12->guest_gs_selector = vmcs_read16(GUEST_GS_SELECTOR); 4298 vmcs12->guest_ldtr_selector = vmcs_read16(GUEST_LDTR_SELECTOR); 4299 vmcs12->guest_tr_selector = vmcs_read16(GUEST_TR_SELECTOR); 4300 vmcs12->guest_es_limit = vmcs_read32(GUEST_ES_LIMIT); 4301 vmcs12->guest_cs_limit = vmcs_read32(GUEST_CS_LIMIT); 4302 vmcs12->guest_ss_limit = vmcs_read32(GUEST_SS_LIMIT); 4303 vmcs12->guest_ds_limit = vmcs_read32(GUEST_DS_LIMIT); 4304 vmcs12->guest_fs_limit = vmcs_read32(GUEST_FS_LIMIT); 4305 vmcs12->guest_gs_limit = vmcs_read32(GUEST_GS_LIMIT); 4306 vmcs12->guest_ldtr_limit = vmcs_read32(GUEST_LDTR_LIMIT); 4307 vmcs12->guest_tr_limit = vmcs_read32(GUEST_TR_LIMIT); 4308 vmcs12->guest_gdtr_limit = vmcs_read32(GUEST_GDTR_LIMIT); 4309 vmcs12->guest_idtr_limit = vmcs_read32(GUEST_IDTR_LIMIT); 4310 vmcs12->guest_es_ar_bytes = vmcs_read32(GUEST_ES_AR_BYTES); 4311 vmcs12->guest_ds_ar_bytes = vmcs_read32(GUEST_DS_AR_BYTES); 4312 vmcs12->guest_fs_ar_bytes = vmcs_read32(GUEST_FS_AR_BYTES); 4313 vmcs12->guest_gs_ar_bytes = 
vmcs_read32(GUEST_GS_AR_BYTES); 4314 vmcs12->guest_ldtr_ar_bytes = vmcs_read32(GUEST_LDTR_AR_BYTES); 4315 vmcs12->guest_tr_ar_bytes = vmcs_read32(GUEST_TR_AR_BYTES); 4316 vmcs12->guest_es_base = vmcs_readl(GUEST_ES_BASE); 4317 vmcs12->guest_cs_base = vmcs_readl(GUEST_CS_BASE); 4318 vmcs12->guest_ss_base = vmcs_readl(GUEST_SS_BASE); 4319 vmcs12->guest_ds_base = vmcs_readl(GUEST_DS_BASE); 4320 vmcs12->guest_fs_base = vmcs_readl(GUEST_FS_BASE); 4321 vmcs12->guest_gs_base = vmcs_readl(GUEST_GS_BASE); 4322 vmcs12->guest_ldtr_base = vmcs_readl(GUEST_LDTR_BASE); 4323 vmcs12->guest_tr_base = vmcs_readl(GUEST_TR_BASE); 4324 vmcs12->guest_gdtr_base = vmcs_readl(GUEST_GDTR_BASE); 4325 vmcs12->guest_idtr_base = vmcs_readl(GUEST_IDTR_BASE); 4326 vmcs12->guest_pending_dbg_exceptions = 4327 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); 4328 4329 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; 4330 } 4331 4332 static void copy_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, 4333 struct vmcs12 *vmcs12) 4334 { 4335 struct vcpu_vmx *vmx = to_vmx(vcpu); 4336 int cpu; 4337 4338 if (!vmx->nested.need_sync_vmcs02_to_vmcs12_rare) 4339 return; 4340 4341 4342 WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01); 4343 4344 cpu = get_cpu(); 4345 vmx->loaded_vmcs = &vmx->nested.vmcs02; 4346 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01); 4347 4348 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4349 4350 vmx->loaded_vmcs = &vmx->vmcs01; 4351 vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02); 4352 put_cpu(); 4353 } 4354 4355 /* 4356 * Update the guest state fields of vmcs12 to reflect changes that 4357 * occurred while L2 was running. (The "IA-32e mode guest" bit of the 4358 * VM-entry controls is also updated, since this is really a guest 4359 * state bit.) 4360 */ 4361 static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) 4362 { 4363 struct vcpu_vmx *vmx = to_vmx(vcpu); 4364 4365 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 4366 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 4367 4368 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = 4369 !evmptr_is_valid(vmx->nested.hv_evmcs_vmptr); 4370 4371 vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); 4372 vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); 4373 4374 vmcs12->guest_rsp = kvm_rsp_read(vcpu); 4375 vmcs12->guest_rip = kvm_rip_read(vcpu); 4376 vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); 4377 4378 vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES); 4379 vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES); 4380 4381 vmcs12->guest_interruptibility_info = 4382 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); 4383 4384 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) 4385 vmcs12->guest_activity_state = GUEST_ACTIVITY_HLT; 4386 else if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) 4387 vmcs12->guest_activity_state = GUEST_ACTIVITY_WAIT_SIPI; 4388 else 4389 vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; 4390 4391 if (nested_cpu_has_preemption_timer(vmcs12) && 4392 vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && 4393 !vmx->nested.nested_run_pending) 4394 vmcs12->vmx_preemption_timer_value = 4395 vmx_get_preemption_timer_value(vcpu); 4396 4397 /* 4398 * In some cases (usually, nested EPT), L2 is allowed to change its 4399 * own CR3 without exiting. If it has changed it, we must keep it. 4400 * Of course, if L0 is using shadow page tables, GUEST_CR3 was defined 4401 * by L0, not L1 or L2, so we mustn't unconditionally copy it to vmcs12. 4402 * 4403 * Additionally, restore L2's PDPTR to vmcs12. 
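	 *
	 * Hence the copy below is gated on enable_ept, and the PDPTRs are
	 * copied back only when L1 itself uses EPT for L2 and L2 is using
	 * PAE paging.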
4404 */ 4405 if (enable_ept) { 4406 vmcs12->guest_cr3 = vmcs_readl(GUEST_CR3); 4407 if (nested_cpu_has_ept(vmcs12) && is_pae_paging(vcpu)) { 4408 vmcs12->guest_pdptr0 = vmcs_read64(GUEST_PDPTR0); 4409 vmcs12->guest_pdptr1 = vmcs_read64(GUEST_PDPTR1); 4410 vmcs12->guest_pdptr2 = vmcs_read64(GUEST_PDPTR2); 4411 vmcs12->guest_pdptr3 = vmcs_read64(GUEST_PDPTR3); 4412 } 4413 } 4414 4415 vmcs12->guest_linear_address = vmcs_readl(GUEST_LINEAR_ADDRESS); 4416 4417 if (nested_cpu_has_vid(vmcs12)) 4418 vmcs12->guest_intr_status = vmcs_read16(GUEST_INTR_STATUS); 4419 4420 vmcs12->vm_entry_controls = 4421 (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | 4422 (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); 4423 4424 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) 4425 kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); 4426 4427 if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) 4428 vmcs12->guest_ia32_efer = vcpu->arch.efer; 4429 } 4430 4431 /* 4432 * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits 4433 * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12), 4434 * and this function updates it to reflect the changes to the guest state while 4435 * L2 was running (and perhaps made some exits which were handled directly by L0 4436 * without going back to L1), and to reflect the exit reason. 4437 * Note that we do not have to copy here all VMCS fields, just those that 4438 * could have changed by the L2 guest or the exit - i.e., the guest-state and 4439 * exit-information fields only. Other fields are modified by L1 with VMWRITE, 4440 * which already writes to vmcs12 directly. 4441 */ 4442 static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, 4443 u32 vm_exit_reason, u32 exit_intr_info, 4444 unsigned long exit_qualification) 4445 { 4446 /* update exit information fields: */ 4447 vmcs12->vm_exit_reason = vm_exit_reason; 4448 if (to_vmx(vcpu)->exit_reason.enclave_mode) 4449 vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE; 4450 vmcs12->exit_qualification = exit_qualification; 4451 4452 /* 4453 * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched 4454 * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other 4455 * exit info fields are unmodified. 4456 */ 4457 if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) { 4458 vmcs12->launch_state = 1; 4459 4460 /* vm_entry_intr_info_field is cleared on exit. Emulate this 4461 * instead of reading the real value. */ 4462 vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK; 4463 4464 /* 4465 * Transfer the event that L0 or L1 may wanted to inject into 4466 * L2 to IDT_VECTORING_INFO_FIELD. 4467 */ 4468 vmcs12_save_pending_event(vcpu, vmcs12, 4469 vm_exit_reason, exit_intr_info); 4470 4471 vmcs12->vm_exit_intr_info = exit_intr_info; 4472 vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4473 vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 4474 4475 /* 4476 * According to spec, there's no need to store the guest's 4477 * MSRs if the exit is due to a VM-entry failure that occurs 4478 * during or after loading the guest state. Since this exit 4479 * does not fall in that category, we need to save the MSRs. 
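		 *
		 * A failure while writing the VM-exit MSR-store area cannot
		 * be reported to L1 as a normal error at this point, so it
		 * is raised as a VMX abort instead.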
4480 */ 4481 if (nested_vmx_store_msr(vcpu, 4482 vmcs12->vm_exit_msr_store_addr, 4483 vmcs12->vm_exit_msr_store_count)) 4484 nested_vmx_abort(vcpu, 4485 VMX_ABORT_SAVE_GUEST_MSR_FAIL); 4486 } 4487 } 4488 4489 /* 4490 * A part of what we need to when the nested L2 guest exits and we want to 4491 * run its L1 parent, is to reset L1's guest state to the host state specified 4492 * in vmcs12. 4493 * This function is to be called not only on normal nested exit, but also on 4494 * a nested entry failure, as explained in Intel's spec, 3B.23.7 ("VM-Entry 4495 * Failures During or After Loading Guest State"). 4496 * This function should be called when the active VMCS is L1's (vmcs01). 4497 */ 4498 static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, 4499 struct vmcs12 *vmcs12) 4500 { 4501 enum vm_entry_failure_code ignored; 4502 struct kvm_segment seg; 4503 4504 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) 4505 vcpu->arch.efer = vmcs12->host_ia32_efer; 4506 else if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4507 vcpu->arch.efer |= (EFER_LMA | EFER_LME); 4508 else 4509 vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); 4510 vmx_set_efer(vcpu, vcpu->arch.efer); 4511 4512 kvm_rsp_write(vcpu, vmcs12->host_rsp); 4513 kvm_rip_write(vcpu, vmcs12->host_rip); 4514 vmx_set_rflags(vcpu, X86_EFLAGS_FIXED); 4515 vmx_set_interrupt_shadow(vcpu, 0); 4516 4517 /* 4518 * Note that calling vmx_set_cr0 is important, even if cr0 hasn't 4519 * actually changed, because vmx_set_cr0 refers to efer set above. 4520 * 4521 * CR0_GUEST_HOST_MASK is already set in the original vmcs01 4522 * (KVM doesn't change it); 4523 */ 4524 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4525 vmx_set_cr0(vcpu, vmcs12->host_cr0); 4526 4527 /* Same as above - no reason to call set_cr4_guest_host_mask(). */ 4528 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4529 vmx_set_cr4(vcpu, vmcs12->host_cr4); 4530 4531 nested_ept_uninit_mmu_context(vcpu); 4532 4533 /* 4534 * Only PDPTE load can fail as the value of cr3 was checked on entry and 4535 * couldn't have changed. 4536 */ 4537 if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, true, &ignored)) 4538 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); 4539 4540 nested_vmx_transition_tlb_flush(vcpu, vmcs12, false); 4541 4542 vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); 4543 vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); 4544 vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip); 4545 vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base); 4546 vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base); 4547 vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF); 4548 vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF); 4549 4550 /* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. 
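	 * Architecturally, the "clear BNDCFGS" VM-exit control loads
	 * IA32_BNDCFGS with zero on VM-exit; without it the MSR is left
	 * as L2 last wrote it.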
*/ 4551 if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS) 4552 vmcs_write64(GUEST_BNDCFGS, 0); 4553 4554 if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PAT) { 4555 vmcs_write64(GUEST_IA32_PAT, vmcs12->host_ia32_pat); 4556 vcpu->arch.pat = vmcs12->host_ia32_pat; 4557 } 4558 if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && 4559 kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) 4560 WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, 4561 vmcs12->host_ia32_perf_global_ctrl)); 4562 4563 /* Set L1 segment info according to Intel SDM 4564 27.5.2 Loading Host Segment and Descriptor-Table Registers */ 4565 seg = (struct kvm_segment) { 4566 .base = 0, 4567 .limit = 0xFFFFFFFF, 4568 .selector = vmcs12->host_cs_selector, 4569 .type = 11, 4570 .present = 1, 4571 .s = 1, 4572 .g = 1 4573 }; 4574 if (vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) 4575 seg.l = 1; 4576 else 4577 seg.db = 1; 4578 __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); 4579 seg = (struct kvm_segment) { 4580 .base = 0, 4581 .limit = 0xFFFFFFFF, 4582 .type = 3, 4583 .present = 1, 4584 .s = 1, 4585 .db = 1, 4586 .g = 1 4587 }; 4588 seg.selector = vmcs12->host_ds_selector; 4589 __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); 4590 seg.selector = vmcs12->host_es_selector; 4591 __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); 4592 seg.selector = vmcs12->host_ss_selector; 4593 __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); 4594 seg.selector = vmcs12->host_fs_selector; 4595 seg.base = vmcs12->host_fs_base; 4596 __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); 4597 seg.selector = vmcs12->host_gs_selector; 4598 seg.base = vmcs12->host_gs_base; 4599 __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); 4600 seg = (struct kvm_segment) { 4601 .base = vmcs12->host_tr_base, 4602 .limit = 0x67, 4603 .selector = vmcs12->host_tr_selector, 4604 .type = 11, 4605 .present = 1 4606 }; 4607 __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); 4608 4609 memset(&seg, 0, sizeof(seg)); 4610 seg.unusable = 1; 4611 __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); 4612 4613 kvm_set_dr(vcpu, 7, 0x400); 4614 vmcs_write64(GUEST_IA32_DEBUGCTL, 0); 4615 4616 if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, 4617 vmcs12->vm_exit_msr_load_count)) 4618 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4619 4620 to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu); 4621 } 4622 4623 static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx) 4624 { 4625 struct vmx_uret_msr *efer_msr; 4626 unsigned int i; 4627 4628 if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER) 4629 return vmcs_read64(GUEST_IA32_EFER); 4630 4631 if (cpu_has_load_ia32_efer()) 4632 return host_efer; 4633 4634 for (i = 0; i < vmx->msr_autoload.guest.nr; ++i) { 4635 if (vmx->msr_autoload.guest.val[i].index == MSR_EFER) 4636 return vmx->msr_autoload.guest.val[i].value; 4637 } 4638 4639 efer_msr = vmx_find_uret_msr(vmx, MSR_EFER); 4640 if (efer_msr) 4641 return efer_msr->data; 4642 4643 return host_efer; 4644 } 4645 4646 static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) 4647 { 4648 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4649 struct vcpu_vmx *vmx = to_vmx(vcpu); 4650 struct vmx_msr_entry g, h; 4651 gpa_t gpa; 4652 u32 i, j; 4653 4654 vcpu->arch.pat = vmcs_read64(GUEST_IA32_PAT); 4655 4656 if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { 4657 /* 4658 * L1's host DR7 is lost if KVM_GUESTDBG_USE_HW_BP is set 4659 * as vmcs01.GUEST_DR7 contains a userspace defined value 4660 * and vcpu->arch.dr7 is not squirreled away before 
the 4661 * nested VMENTER (not worth adding a variable in nested_vmx). 4662 */ 4663 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 4664 kvm_set_dr(vcpu, 7, DR7_FIXED_1); 4665 else 4666 WARN_ON(kvm_set_dr(vcpu, 7, vmcs_readl(GUEST_DR7))); 4667 } 4668 4669 /* 4670 * Note that calling vmx_set_{efer,cr0,cr4} is important as they 4671 * handle a variety of side effects to KVM's software model. 4672 */ 4673 vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx)); 4674 4675 vcpu->arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits(); 4676 vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW)); 4677 4678 vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); 4679 vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); 4680 4681 nested_ept_uninit_mmu_context(vcpu); 4682 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 4683 kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); 4684 4685 /* 4686 * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs 4687 * from vmcs01 (if necessary). The PDPTRs are not loaded on 4688 * VMFail, like everything else we just need to ensure our 4689 * software model is up-to-date. 4690 */ 4691 if (enable_ept && is_pae_paging(vcpu)) 4692 ept_save_pdptrs(vcpu); 4693 4694 kvm_mmu_reset_context(vcpu); 4695 4696 /* 4697 * This nasty bit of open coding is a compromise between blindly 4698 * loading L1's MSRs using the exit load lists (incorrect emulation 4699 * of VMFail), leaving the nested VM's MSRs in the software model 4700 * (incorrect behavior) and snapshotting the modified MSRs (too 4701 * expensive since the lists are unbound by hardware). For each 4702 * MSR that was (prematurely) loaded from the nested VMEntry load 4703 * list, reload it from the exit load list if it exists and differs 4704 * from the guest value. The intent is to stuff host state as 4705 * silently as possible, not to fully process the exit load list. 4706 */ 4707 for (i = 0; i < vmcs12->vm_entry_msr_load_count; i++) { 4708 gpa = vmcs12->vm_entry_msr_load_addr + (i * sizeof(g)); 4709 if (kvm_vcpu_read_guest(vcpu, gpa, &g, sizeof(g))) { 4710 pr_debug_ratelimited( 4711 "%s read MSR index failed (%u, 0x%08llx)\n", 4712 __func__, i, gpa); 4713 goto vmabort; 4714 } 4715 4716 for (j = 0; j < vmcs12->vm_exit_msr_load_count; j++) { 4717 gpa = vmcs12->vm_exit_msr_load_addr + (j * sizeof(h)); 4718 if (kvm_vcpu_read_guest(vcpu, gpa, &h, sizeof(h))) { 4719 pr_debug_ratelimited( 4720 "%s read MSR failed (%u, 0x%08llx)\n", 4721 __func__, j, gpa); 4722 goto vmabort; 4723 } 4724 if (h.index != g.index) 4725 continue; 4726 if (h.value == g.value) 4727 break; 4728 4729 if (nested_vmx_load_msr_check(vcpu, &h)) { 4730 pr_debug_ratelimited( 4731 "%s check failed (%u, 0x%x, 0x%x)\n", 4732 __func__, j, h.index, h.reserved); 4733 goto vmabort; 4734 } 4735 4736 if (kvm_set_msr(vcpu, h.index, h.value)) { 4737 pr_debug_ratelimited( 4738 "%s WRMSR failed (%u, 0x%x, 0x%llx)\n", 4739 __func__, j, h.index, h.value); 4740 goto vmabort; 4741 } 4742 } 4743 } 4744 4745 return; 4746 4747 vmabort: 4748 nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); 4749 } 4750 4751 /* 4752 * Emulate an exit from nested guest (L2) to L1, i.e., prepare to run L1 4753 * and modify vmcs12 to make it see what it would expect to see there if 4754 * L2 was its real guest. 
Must only be called when in L2 (is_guest_mode()) 4755 */ 4756 void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, 4757 u32 exit_intr_info, unsigned long exit_qualification) 4758 { 4759 struct vcpu_vmx *vmx = to_vmx(vcpu); 4760 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 4761 4762 /* Pending MTF traps are discarded on VM-Exit. */ 4763 vmx->nested.mtf_pending = false; 4764 4765 /* trying to cancel vmlaunch/vmresume is a bug */ 4766 WARN_ON_ONCE(vmx->nested.nested_run_pending); 4767 4768 if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { 4769 /* 4770 * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map 4771 * Enlightened VMCS after migration and we still need to 4772 * do that when something is forcing L2->L1 exit prior to 4773 * the first L2 run. 4774 */ 4775 (void)nested_get_evmcs_page(vcpu); 4776 } 4777 4778 /* Service pending TLB flush requests for L2 before switching to L1. */ 4779 kvm_service_local_tlb_flush_requests(vcpu); 4780 4781 /* 4782 * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between 4783 * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are 4784 * up-to-date before switching to L1. 4785 */ 4786 if (enable_ept && is_pae_paging(vcpu)) 4787 vmx_ept_load_pdptrs(vcpu); 4788 4789 leave_guest_mode(vcpu); 4790 4791 if (nested_cpu_has_preemption_timer(vmcs12)) 4792 hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer); 4793 4794 if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING)) { 4795 vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset; 4796 if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING)) 4797 vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio; 4798 } 4799 4800 if (likely(!vmx->fail)) { 4801 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 4802 4803 if (vm_exit_reason != -1) 4804 prepare_vmcs12(vcpu, vmcs12, vm_exit_reason, 4805 exit_intr_info, exit_qualification); 4806 4807 /* 4808 * Must happen outside of sync_vmcs02_to_vmcs12() as it will 4809 * also be used to capture vmcs12 cache as part of 4810 * capturing nVMX state for snapshot (migration). 4811 * 4812 * Otherwise, this flush will dirty guest memory at a 4813 * point it is already assumed by user-space to be 4814 * immutable. 4815 */ 4816 nested_flush_cached_shadow_vmcs12(vcpu, vmcs12); 4817 } else { 4818 /* 4819 * The only expected VM-instruction error is "VM entry with 4820 * invalid control field(s)." Anything else indicates a 4821 * problem with L0. And we should never get here with a 4822 * VMFail of any type if early consistency checks are enabled. 4823 */ 4824 WARN_ON_ONCE(vmcs_read32(VM_INSTRUCTION_ERROR) != 4825 VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4826 WARN_ON_ONCE(nested_early_check); 4827 } 4828 4829 /* 4830 * Drop events/exceptions that were queued for re-injection to L2 4831 * (picked up via vmx_complete_interrupts()), as well as exceptions 4832 * that were pending for L2. Note, this must NOT be hoisted above 4833 * prepare_vmcs12(), events/exceptions queued for re-injection need to 4834 * be captured in vmcs12 (see vmcs12_save_pending_event()). 4835 */ 4836 vcpu->arch.nmi_injected = false; 4837 kvm_clear_exception_queue(vcpu); 4838 kvm_clear_interrupt_queue(vcpu); 4839 4840 vmx_switch_vmcs(vcpu, &vmx->vmcs01); 4841 4842 /* 4843 * If IBRS is advertised to the vCPU, KVM must flush the indirect 4844 * branch predictors when transitioning from L2 to L1, as L1 expects 4845 * hardware (KVM in this case) to provide separate predictor modes. 
4846 * Bare metal isolates VMX root (host) from VMX non-root (guest), but 4847 * doesn't isolate different VMCSs, i.e. in this case, doesn't provide 4848 * separate modes for L2 vs L1. 4849 */ 4850 if (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) 4851 indirect_branch_prediction_barrier(); 4852 4853 /* Update any VMCS fields that might have changed while L2 ran */ 4854 vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr); 4855 vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr); 4856 vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); 4857 if (kvm_caps.has_tsc_control) 4858 vmcs_write64(TSC_MULTIPLIER, vcpu->arch.tsc_scaling_ratio); 4859 4860 if (vmx->nested.l1_tpr_threshold != -1) 4861 vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold); 4862 4863 if (vmx->nested.change_vmcs01_virtual_apic_mode) { 4864 vmx->nested.change_vmcs01_virtual_apic_mode = false; 4865 vmx_set_virtual_apic_mode(vcpu); 4866 } 4867 4868 if (vmx->nested.update_vmcs01_cpu_dirty_logging) { 4869 vmx->nested.update_vmcs01_cpu_dirty_logging = false; 4870 vmx_update_cpu_dirty_logging(vcpu); 4871 } 4872 4873 /* Unpin physical memory we referred to in vmcs02 */ 4874 kvm_vcpu_unmap(vcpu, &vmx->nested.apic_access_page_map, false); 4875 kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true); 4876 kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true); 4877 vmx->nested.pi_desc = NULL; 4878 4879 if (vmx->nested.reload_vmcs01_apic_access_page) { 4880 vmx->nested.reload_vmcs01_apic_access_page = false; 4881 kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); 4882 } 4883 4884 if (vmx->nested.update_vmcs01_apicv_status) { 4885 vmx->nested.update_vmcs01_apicv_status = false; 4886 kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu); 4887 } 4888 4889 if ((vm_exit_reason != -1) && 4890 (enable_shadow_vmcs || evmptr_is_valid(vmx->nested.hv_evmcs_vmptr))) 4891 vmx->nested.need_vmcs12_to_shadow_sync = true; 4892 4893 /* in case we halted in L2 */ 4894 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4895 4896 if (likely(!vmx->fail)) { 4897 if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && 4898 nested_exit_intr_ack_set(vcpu)) { 4899 int irq = kvm_cpu_get_interrupt(vcpu); 4900 WARN_ON(irq < 0); 4901 vmcs12->vm_exit_intr_info = irq | 4902 INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR; 4903 } 4904 4905 if (vm_exit_reason != -1) 4906 trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason, 4907 vmcs12->exit_qualification, 4908 vmcs12->idt_vectoring_info_field, 4909 vmcs12->vm_exit_intr_info, 4910 vmcs12->vm_exit_intr_error_code, 4911 KVM_ISA_VMX); 4912 4913 load_vmcs12_host_state(vcpu, vmcs12); 4914 4915 return; 4916 } 4917 4918 /* 4919 * After an early L2 VM-entry failure, we're now back 4920 * in L1 which thinks it just finished a VMLAUNCH or 4921 * VMRESUME instruction, so we need to set the failure 4922 * flag and the VM-instruction error field of the VMCS 4923 * accordingly, and skip the emulated instruction. 4924 */ 4925 (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); 4926 4927 /* 4928 * Restore L1's host state to KVM's software model. We're here 4929 * because a consistency check was caught by hardware, which 4930 * means some amount of guest state has been propagated to KVM's 4931 * model and needs to be unwound to the host's state. 
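 *
 * Roughly, the path being unwound here is:
 *
 *	prepare_vmcs02()               - L2 state loaded into KVM's model
 *	VMLAUNCH/VMRESUME of vmcs02    - fails a hardware consistency check
 *	nested_vmx_fail() (above)      - VMfailValid reported to L1
 *	nested_vmx_restore_host_state()
 *
 * The CPU never entered VMX non-root operation, so only KVM's software
 * model needs to be rolled back to L1's host state.
 *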
4932 */ 4933 nested_vmx_restore_host_state(vcpu); 4934 4935 vmx->fail = 0; 4936 } 4937 4938 static void nested_vmx_triple_fault(struct kvm_vcpu *vcpu) 4939 { 4940 kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); 4941 nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, 0, 0); 4942 } 4943 4944 /* 4945 * Decode the memory-address operand of a vmx instruction, as recorded on an 4946 * exit caused by such an instruction (run by a guest hypervisor). 4947 * On success, returns 0. When the operand is invalid, returns 1 and throws 4948 * #UD, #GP, or #SS. 4949 */ 4950 int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, 4951 u32 vmx_instruction_info, bool wr, int len, gva_t *ret) 4952 { 4953 gva_t off; 4954 bool exn; 4955 struct kvm_segment s; 4956 4957 /* 4958 * According to Vol. 3B, "Information for VM Exits Due to Instruction 4959 * Execution", on an exit, vmx_instruction_info holds most of the 4960 * addressing components of the operand. Only the displacement part 4961 * is put in exit_qualification (see 3B, "Basic VM-Exit Information"). 4962 * For how an actual address is calculated from all these components, 4963 * refer to Vol. 1, "Operand Addressing". 4964 */ 4965 int scaling = vmx_instruction_info & 3; 4966 int addr_size = (vmx_instruction_info >> 7) & 7; 4967 bool is_reg = vmx_instruction_info & (1u << 10); 4968 int seg_reg = (vmx_instruction_info >> 15) & 7; 4969 int index_reg = (vmx_instruction_info >> 18) & 0xf; 4970 bool index_is_valid = !(vmx_instruction_info & (1u << 22)); 4971 int base_reg = (vmx_instruction_info >> 23) & 0xf; 4972 bool base_is_valid = !(vmx_instruction_info & (1u << 27)); 4973 4974 if (is_reg) { 4975 kvm_queue_exception(vcpu, UD_VECTOR); 4976 return 1; 4977 } 4978 4979 /* Addr = segment_base + offset */ 4980 /* offset = base + [index * scale] + displacement */ 4981 off = exit_qualification; /* holds the displacement */ 4982 if (addr_size == 1) 4983 off = (gva_t)sign_extend64(off, 31); 4984 else if (addr_size == 0) 4985 off = (gva_t)sign_extend64(off, 15); 4986 if (base_is_valid) 4987 off += kvm_register_read(vcpu, base_reg); 4988 if (index_is_valid) 4989 off += kvm_register_read(vcpu, index_reg) << scaling; 4990 vmx_get_segment(vcpu, &s, seg_reg); 4991 4992 /* 4993 * The effective address, i.e. @off, of a memory operand is truncated 4994 * based on the address size of the instruction. Note that this is 4995 * the *effective address*, i.e. the address prior to accounting for 4996 * the segment's base. 4997 */ 4998 if (addr_size == 1) /* 32 bit */ 4999 off &= 0xffffffff; 5000 else if (addr_size == 0) /* 16 bit */ 5001 off &= 0xffff; 5002 5003 /* Checks for #GP/#SS exceptions. */ 5004 exn = false; 5005 if (is_long_mode(vcpu)) { 5006 /* 5007 * The virtual/linear address is never truncated in 64-bit 5008 * mode, e.g. a 32-bit address size can yield a 64-bit virtual 5009 * address when using FS/GS with a non-zero base. 5010 */ 5011 if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) 5012 *ret = s.base + off; 5013 else 5014 *ret = off; 5015 5016 /* Long mode: #GP(0)/#SS(0) if the memory address is in a 5017 * non-canonical form. This is the only check on the memory 5018 * destination for long mode! 5019 */ 5020 exn = is_noncanonical_address(*ret, vcpu); 5021 } else { 5022 /* 5023 * When not in long mode, the virtual/linear address is 5024 * unconditionally truncated to 32 bits regardless of the 5025 * address size. 
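 *
 * E.g. (illustrative values only) with s.base == 0xfff00000 and
 * off == 0x00200000, the address computed below wraps to 0x00100000
 * instead of extending to 0x100100000.
 *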
5026 */ 5027 *ret = (s.base + off) & 0xffffffff; 5028 5029 /* Protected mode: apply checks for segment validity in the 5030 * following order: 5031 * - segment type check (#GP(0) may be thrown) 5032 * - usability check (#GP(0)/#SS(0)) 5033 * - limit check (#GP(0)/#SS(0)) 5034 */ 5035 if (wr) 5036 /* #GP(0) if the destination operand is located in a 5037 * read-only data segment or any code segment. 5038 */ 5039 exn = ((s.type & 0xa) == 0 || (s.type & 8)); 5040 else 5041 /* #GP(0) if the source operand is located in an 5042 * execute-only code segment 5043 */ 5044 exn = ((s.type & 0xa) == 8); 5045 if (exn) { 5046 kvm_queue_exception_e(vcpu, GP_VECTOR, 0); 5047 return 1; 5048 } 5049 /* Protected mode: #GP(0)/#SS(0) if the segment is unusable. 5050 */ 5051 exn = (s.unusable != 0); 5052 5053 /* 5054 * Protected mode: #GP(0)/#SS(0) if the memory operand is 5055 * outside the segment limit. All CPUs that support VMX ignore 5056 * limit checks for flat segments, i.e. segments with base==0, 5057 * limit==0xffffffff and of type expand-up data or code. 5058 */ 5059 if (!(s.base == 0 && s.limit == 0xffffffff && 5060 ((s.type & 8) || !(s.type & 4)))) 5061 exn = exn || ((u64)off + len - 1 > s.limit); 5062 } 5063 if (exn) { 5064 kvm_queue_exception_e(vcpu, 5065 seg_reg == VCPU_SREG_SS ? 5066 SS_VECTOR : GP_VECTOR, 5067 0); 5068 return 1; 5069 } 5070 5071 return 0; 5072 } 5073 5074 static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer, 5075 int *ret) 5076 { 5077 gva_t gva; 5078 struct x86_exception e; 5079 int r; 5080 5081 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5082 vmcs_read32(VMX_INSTRUCTION_INFO), false, 5083 sizeof(*vmpointer), &gva)) { 5084 *ret = 1; 5085 return -EINVAL; 5086 } 5087 5088 r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e); 5089 if (r != X86EMUL_CONTINUE) { 5090 *ret = kvm_handle_memory_failure(vcpu, r, &e); 5091 return -EINVAL; 5092 } 5093 5094 return 0; 5095 } 5096 5097 /* 5098 * Allocate a shadow VMCS and associate it with the currently loaded 5099 * VMCS, unless such a shadow VMCS already exists. The newly allocated 5100 * VMCS is also VMCLEARed, so that it is ready for use. 5101 */ 5102 static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu) 5103 { 5104 struct vcpu_vmx *vmx = to_vmx(vcpu); 5105 struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs; 5106 5107 /* 5108 * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it 5109 * when L1 executes VMXOFF or the vCPU is forced out of nested 5110 * operation. VMXON faults if the CPU is already post-VMXON, so it 5111 * should be impossible to already have an allocated shadow VMCS. KVM 5112 * doesn't support virtualization of VMCS shadowing, so vmcs01 should 5113 * always be the loaded VMCS. 
5114 */ 5115 if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs)) 5116 return loaded_vmcs->shadow_vmcs; 5117 5118 loaded_vmcs->shadow_vmcs = alloc_vmcs(true); 5119 if (loaded_vmcs->shadow_vmcs) 5120 vmcs_clear(loaded_vmcs->shadow_vmcs); 5121 5122 return loaded_vmcs->shadow_vmcs; 5123 } 5124 5125 static int enter_vmx_operation(struct kvm_vcpu *vcpu) 5126 { 5127 struct vcpu_vmx *vmx = to_vmx(vcpu); 5128 int r; 5129 5130 r = alloc_loaded_vmcs(&vmx->nested.vmcs02); 5131 if (r < 0) 5132 goto out_vmcs02; 5133 5134 vmx->nested.cached_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5135 if (!vmx->nested.cached_vmcs12) 5136 goto out_cached_vmcs12; 5137 5138 vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA; 5139 vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT); 5140 if (!vmx->nested.cached_shadow_vmcs12) 5141 goto out_cached_shadow_vmcs12; 5142 5143 if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu)) 5144 goto out_shadow_vmcs; 5145 5146 hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, 5147 HRTIMER_MODE_ABS_PINNED); 5148 vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; 5149 5150 vmx->nested.vpid02 = allocate_vpid(); 5151 5152 vmx->nested.vmcs02_initialized = false; 5153 vmx->nested.vmxon = true; 5154 5155 if (vmx_pt_mode_is_host_guest()) { 5156 vmx->pt_desc.guest.ctl = 0; 5157 pt_update_intercept_for_msr(vcpu); 5158 } 5159 5160 return 0; 5161 5162 out_shadow_vmcs: 5163 kfree(vmx->nested.cached_shadow_vmcs12); 5164 5165 out_cached_shadow_vmcs12: 5166 kfree(vmx->nested.cached_vmcs12); 5167 5168 out_cached_vmcs12: 5169 free_loaded_vmcs(&vmx->nested.vmcs02); 5170 5171 out_vmcs02: 5172 return -ENOMEM; 5173 } 5174 5175 /* Emulate the VMXON instruction. */ 5176 static int handle_vmxon(struct kvm_vcpu *vcpu) 5177 { 5178 int ret; 5179 gpa_t vmptr; 5180 uint32_t revision; 5181 struct vcpu_vmx *vmx = to_vmx(vcpu); 5182 const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED 5183 | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX; 5184 5185 /* 5186 * Manually check CR4.VMXE checks, KVM must force CR4.VMXE=1 to enter 5187 * the guest and so cannot rely on hardware to perform the check, 5188 * which has higher priority than VM-Exit (see Intel SDM's pseudocode 5189 * for VMXON). 5190 * 5191 * Rely on hardware for the other pre-VM-Exit checks, CR0.PE=1, !VM86 5192 * and !COMPATIBILITY modes. For an unrestricted guest, KVM doesn't 5193 * force any of the relevant guest state. For a restricted guest, KVM 5194 * does force CR0.PE=1, but only to also force VM86 in order to emulate 5195 * Real Mode, and so there's no need to check CR0.PE manually. 5196 */ 5197 if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_VMXE)) { 5198 kvm_queue_exception(vcpu, UD_VECTOR); 5199 return 1; 5200 } 5201 5202 /* 5203 * The CPL is checked for "not in VMX operation" and for "in VMX root", 5204 * and has higher priority than the VM-Fail due to being post-VMXON, 5205 * i.e. VMXON #GPs outside of VMX non-root if CPL!=0. In VMX non-root, 5206 * VMXON causes VM-Exit and KVM unconditionally forwards VMXON VM-Exits 5207 * from L2 to L1, i.e. there's no need to check for the vCPU being in 5208 * VMX non-root. 5209 * 5210 * Forwarding the VM-Exit unconditionally, i.e. without performing the 5211 * #UD checks (see above), is functionally ok because KVM doesn't allow 5212 * L1 to run L2 without CR4.VMXE=0, and because KVM never modifies L2's 5213 * CR0 or CR4, i.e. it's L2's responsibility to emulate #UDs that are 5214 * missed by hardware due to shadowing CR0 and/or CR4. 
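 *
 * Putting the above together, the emulation below applies the remaining
 * checks in this order:
 *
 *	#GP(0)	if CPL > 0
 *	VMFail	if the vCPU is already post-VMXON
 *	#GP(0)	if CR0/CR4 are invalid for VMX operation, or if the
 *		required VMXON_NEEDED_FEATURES bits aren't set
 *
 * with the CR4.VMXE #UD check already handled above.
 *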
5215 */ 5216 if (vmx_get_cpl(vcpu)) { 5217 kvm_inject_gp(vcpu, 0); 5218 return 1; 5219 } 5220 5221 if (vmx->nested.vmxon) 5222 return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); 5223 5224 /* 5225 * Invalid CR0/CR4 generates #GP. These checks are performed if and 5226 * only if the vCPU isn't already in VMX operation, i.e. effectively 5227 * have lower priority than the VM-Fail above. 5228 */ 5229 if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) || 5230 !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) { 5231 kvm_inject_gp(vcpu, 0); 5232 return 1; 5233 } 5234 5235 if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) 5236 != VMXON_NEEDED_FEATURES) { 5237 kvm_inject_gp(vcpu, 0); 5238 return 1; 5239 } 5240 5241 if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret)) 5242 return ret; 5243 5244 /* 5245 * SDM 3: 24.11.5 5246 * The first 4 bytes of VMXON region contain the supported 5247 * VMCS revision identifier 5248 * 5249 * Note - IA32_VMX_BASIC[48] will never be 1 for the nested case; 5250 * which replaces physical address width with 32 5251 */ 5252 if (!page_address_valid(vcpu, vmptr)) 5253 return nested_vmx_failInvalid(vcpu); 5254 5255 if (kvm_read_guest(vcpu->kvm, vmptr, &revision, sizeof(revision)) || 5256 revision != VMCS12_REVISION) 5257 return nested_vmx_failInvalid(vcpu); 5258 5259 vmx->nested.vmxon_ptr = vmptr; 5260 ret = enter_vmx_operation(vcpu); 5261 if (ret) 5262 return ret; 5263 5264 return nested_vmx_succeed(vcpu); 5265 } 5266 5267 static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu) 5268 { 5269 struct vcpu_vmx *vmx = to_vmx(vcpu); 5270 5271 if (vmx->nested.current_vmptr == INVALID_GPA) 5272 return; 5273 5274 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 5275 5276 if (enable_shadow_vmcs) { 5277 /* copy to memory all shadowed fields in case 5278 they were modified */ 5279 copy_shadow_to_vmcs12(vmx); 5280 vmx_disable_shadow_vmcs(vmx); 5281 } 5282 vmx->nested.posted_intr_nv = -1; 5283 5284 /* Flush VMCS12 to guest memory */ 5285 kvm_vcpu_write_guest_page(vcpu, 5286 vmx->nested.current_vmptr >> PAGE_SHIFT, 5287 vmx->nested.cached_vmcs12, 0, VMCS12_SIZE); 5288 5289 kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL); 5290 5291 vmx->nested.current_vmptr = INVALID_GPA; 5292 } 5293 5294 /* Emulate the VMXOFF instruction */ 5295 static int handle_vmxoff(struct kvm_vcpu *vcpu) 5296 { 5297 if (!nested_vmx_check_permission(vcpu)) 5298 return 1; 5299 5300 free_nested(vcpu); 5301 5302 if (kvm_apic_has_pending_init_or_sipi(vcpu)) 5303 kvm_make_request(KVM_REQ_EVENT, vcpu); 5304 5305 return nested_vmx_succeed(vcpu); 5306 } 5307 5308 /* Emulate the VMCLEAR instruction */ 5309 static int handle_vmclear(struct kvm_vcpu *vcpu) 5310 { 5311 struct vcpu_vmx *vmx = to_vmx(vcpu); 5312 u32 zero = 0; 5313 gpa_t vmptr; 5314 int r; 5315 5316 if (!nested_vmx_check_permission(vcpu)) 5317 return 1; 5318 5319 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5320 return r; 5321 5322 if (!page_address_valid(vcpu, vmptr)) 5323 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); 5324 5325 if (vmptr == vmx->nested.vmxon_ptr) 5326 return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); 5327 5328 /* 5329 * When Enlightened VMEntry is enabled on the calling CPU we treat 5330 * memory area pointer by vmptr as Enlightened VMCS (as there's no good 5331 * way to distinguish it from VMCS12) and we must not corrupt it by 5332 * writing to the non-existent 'launch_state' field. 
The area doesn't 5333 * have to be the currently active EVMCS on the calling CPU and there's 5334 * nothing KVM has to do to transition it from 'active' to 'non-active' 5335 * state. It is possible that the area will stay mapped as 5336 * vmx->nested.hv_evmcs but this shouldn't be a problem. 5337 */ 5338 if (likely(!guest_cpuid_has_evmcs(vcpu) || 5339 !evmptr_is_valid(nested_get_evmptr(vcpu)))) { 5340 if (vmptr == vmx->nested.current_vmptr) 5341 nested_release_vmcs12(vcpu); 5342 5343 /* 5344 * Silently ignore memory errors on VMCLEAR, Intel's pseudocode 5345 * for VMCLEAR includes a "ensure that data for VMCS referenced 5346 * by the operand is in memory" clause that guards writes to 5347 * memory, i.e. doing nothing for I/O is architecturally valid. 5348 * 5349 * FIXME: Suppress failures if and only if no memslot is found, 5350 * i.e. exit to userspace if __copy_to_user() fails. 5351 */ 5352 (void)kvm_vcpu_write_guest(vcpu, 5353 vmptr + offsetof(struct vmcs12, 5354 launch_state), 5355 &zero, sizeof(zero)); 5356 } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) { 5357 nested_release_evmcs(vcpu); 5358 } 5359 5360 return nested_vmx_succeed(vcpu); 5361 } 5362 5363 /* Emulate the VMLAUNCH instruction */ 5364 static int handle_vmlaunch(struct kvm_vcpu *vcpu) 5365 { 5366 return nested_vmx_run(vcpu, true); 5367 } 5368 5369 /* Emulate the VMRESUME instruction */ 5370 static int handle_vmresume(struct kvm_vcpu *vcpu) 5371 { 5372 5373 return nested_vmx_run(vcpu, false); 5374 } 5375 5376 static int handle_vmread(struct kvm_vcpu *vcpu) 5377 { 5378 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5379 : get_vmcs12(vcpu); 5380 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5381 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5382 struct vcpu_vmx *vmx = to_vmx(vcpu); 5383 struct x86_exception e; 5384 unsigned long field; 5385 u64 value; 5386 gva_t gva = 0; 5387 short offset; 5388 int len, r; 5389 5390 if (!nested_vmx_check_permission(vcpu)) 5391 return 1; 5392 5393 /* Decode instruction info and find the field to read */ 5394 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5395 5396 if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) { 5397 /* 5398 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5399 * any VMREAD sets the ALU flags for VMfailInvalid. 5400 */ 5401 if (vmx->nested.current_vmptr == INVALID_GPA || 5402 (is_guest_mode(vcpu) && 5403 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5404 return nested_vmx_failInvalid(vcpu); 5405 5406 offset = get_vmcs12_field_offset(field); 5407 if (offset < 0) 5408 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5409 5410 if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) 5411 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5412 5413 /* Read the field, zero-extended to a u64 value */ 5414 value = vmcs12_read_any(vmcs12, field, offset); 5415 } else { 5416 /* 5417 * Hyper-V TLFS (as of 6.0b) explicitly states, that while an 5418 * enlightened VMCS is active VMREAD/VMWRITE instructions are 5419 * unsupported. Unfortunately, certain versions of Windows 11 5420 * don't comply with this requirement which is not enforced in 5421 * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a 5422 * workaround, as misbehaving guests will panic on VM-Fail. 5423 * Note, enlightened VMCS is incompatible with shadow VMCS so 5424 * all VMREADs from L2 should go to L1. 
5425 */ 5426 if (WARN_ON_ONCE(is_guest_mode(vcpu))) 5427 return nested_vmx_failInvalid(vcpu); 5428 5429 offset = evmcs_field_offset(field, NULL); 5430 if (offset < 0) 5431 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5432 5433 /* Read the field, zero-extended to a u64 value */ 5434 value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset); 5435 } 5436 5437 /* 5438 * Now copy part of this value to register or memory, as requested. 5439 * Note that the number of bits actually copied is 32 or 64 depending 5440 * on the guest's mode (32 or 64 bit), not on the given field's length. 5441 */ 5442 if (instr_info & BIT(10)) { 5443 kvm_register_write(vcpu, (((instr_info) >> 3) & 0xf), value); 5444 } else { 5445 len = is_64_bit_mode(vcpu) ? 8 : 4; 5446 if (get_vmx_mem_address(vcpu, exit_qualification, 5447 instr_info, true, len, &gva)) 5448 return 1; 5449 /* _system ok, nested_vmx_check_permission has verified cpl=0 */ 5450 r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e); 5451 if (r != X86EMUL_CONTINUE) 5452 return kvm_handle_memory_failure(vcpu, r, &e); 5453 } 5454 5455 return nested_vmx_succeed(vcpu); 5456 } 5457 5458 static bool is_shadow_field_rw(unsigned long field) 5459 { 5460 switch (field) { 5461 #define SHADOW_FIELD_RW(x, y) case x: 5462 #include "vmcs_shadow_fields.h" 5463 return true; 5464 default: 5465 break; 5466 } 5467 return false; 5468 } 5469 5470 static bool is_shadow_field_ro(unsigned long field) 5471 { 5472 switch (field) { 5473 #define SHADOW_FIELD_RO(x, y) case x: 5474 #include "vmcs_shadow_fields.h" 5475 return true; 5476 default: 5477 break; 5478 } 5479 return false; 5480 } 5481 5482 static int handle_vmwrite(struct kvm_vcpu *vcpu) 5483 { 5484 struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) 5485 : get_vmcs12(vcpu); 5486 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 5487 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5488 struct vcpu_vmx *vmx = to_vmx(vcpu); 5489 struct x86_exception e; 5490 unsigned long field; 5491 short offset; 5492 gva_t gva; 5493 int len, r; 5494 5495 /* 5496 * The value to write might be 32 or 64 bits, depending on L1's long 5497 * mode, and eventually we need to write that into a field of several 5498 * possible lengths. The code below first zero-extends the value to 64 5499 * bit (value), and then copies only the appropriate number of 5500 * bits into the vmcs12 field. 5501 */ 5502 u64 value = 0; 5503 5504 if (!nested_vmx_check_permission(vcpu)) 5505 return 1; 5506 5507 /* 5508 * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA, 5509 * any VMWRITE sets the ALU flags for VMfailInvalid. 5510 */ 5511 if (vmx->nested.current_vmptr == INVALID_GPA || 5512 (is_guest_mode(vcpu) && 5513 get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA)) 5514 return nested_vmx_failInvalid(vcpu); 5515 5516 if (instr_info & BIT(10)) 5517 value = kvm_register_read(vcpu, (((instr_info) >> 3) & 0xf)); 5518 else { 5519 len = is_64_bit_mode(vcpu) ? 
8 : 4; 5520 if (get_vmx_mem_address(vcpu, exit_qualification, 5521 instr_info, false, len, &gva)) 5522 return 1; 5523 r = kvm_read_guest_virt(vcpu, gva, &value, len, &e); 5524 if (r != X86EMUL_CONTINUE) 5525 return kvm_handle_memory_failure(vcpu, r, &e); 5526 } 5527 5528 field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf)); 5529 5530 offset = get_vmcs12_field_offset(field); 5531 if (offset < 0) 5532 return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); 5533 5534 /* 5535 * If the vCPU supports "VMWRITE to any supported field in the 5536 * VMCS," then the "read-only" fields are actually read/write. 5537 */ 5538 if (vmcs_field_readonly(field) && 5539 !nested_cpu_has_vmwrite_any_field(vcpu)) 5540 return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); 5541 5542 /* 5543 * Ensure vmcs12 is up-to-date before any VMWRITE that dirties 5544 * vmcs12, else we may crush a field or consume a stale value. 5545 */ 5546 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) 5547 copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 5548 5549 /* 5550 * Some Intel CPUs intentionally drop the reserved bits of the AR byte 5551 * fields on VMWRITE. Emulate this behavior to ensure consistent KVM 5552 * behavior regardless of the underlying hardware, e.g. if an AR_BYTE 5553 * field is intercepted for VMWRITE but not VMREAD (in L1), then VMREAD 5554 * from L1 will return a different value than VMREAD from L2 (L1 sees 5555 * the stripped down value, L2 sees the full value as stored by KVM). 5556 */ 5557 if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES) 5558 value &= 0x1f0ff; 5559 5560 vmcs12_write_any(vmcs12, field, offset, value); 5561 5562 /* 5563 * Do not track vmcs12 dirty-state if in guest-mode as we actually 5564 * dirty shadow vmcs12 instead of vmcs12. Fields that can be updated 5565 * by L1 without a vmexit are always updated in the vmcs02, i.e. don't 5566 * "dirty" vmcs12, all others go down the prepare_vmcs02() slow path. 5567 */ 5568 if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field)) { 5569 /* 5570 * L1 can read these fields without exiting, ensure the 5571 * shadow VMCS is up-to-date. 
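 *
 * Sketch of the block below: temporarily vmcs_load() the shadow VMCS so
 * the field can be written with __vmcs_writel(), then vmcs_clear() it
 * and vmcs_load() the previously loaded VMCS again.  Preemption is
 * disabled across the switch so the vCPU cannot be scheduled out while
 * a VMCS other than loaded_vmcs->vmcs is current.
 *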
5572 */ 5573 if (enable_shadow_vmcs && is_shadow_field_ro(field)) { 5574 preempt_disable(); 5575 vmcs_load(vmx->vmcs01.shadow_vmcs); 5576 5577 __vmcs_writel(field, value); 5578 5579 vmcs_clear(vmx->vmcs01.shadow_vmcs); 5580 vmcs_load(vmx->loaded_vmcs->vmcs); 5581 preempt_enable(); 5582 } 5583 vmx->nested.dirty_vmcs12 = true; 5584 } 5585 5586 return nested_vmx_succeed(vcpu); 5587 } 5588 5589 static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) 5590 { 5591 vmx->nested.current_vmptr = vmptr; 5592 if (enable_shadow_vmcs) { 5593 secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_SHADOW_VMCS); 5594 vmcs_write64(VMCS_LINK_POINTER, 5595 __pa(vmx->vmcs01.shadow_vmcs)); 5596 vmx->nested.need_vmcs12_to_shadow_sync = true; 5597 } 5598 vmx->nested.dirty_vmcs12 = true; 5599 vmx->nested.force_msr_bitmap_recalc = true; 5600 } 5601 5602 /* Emulate the VMPTRLD instruction */ 5603 static int handle_vmptrld(struct kvm_vcpu *vcpu) 5604 { 5605 struct vcpu_vmx *vmx = to_vmx(vcpu); 5606 gpa_t vmptr; 5607 int r; 5608 5609 if (!nested_vmx_check_permission(vcpu)) 5610 return 1; 5611 5612 if (nested_vmx_get_vmptr(vcpu, &vmptr, &r)) 5613 return r; 5614 5615 if (!page_address_valid(vcpu, vmptr)) 5616 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); 5617 5618 if (vmptr == vmx->nested.vmxon_ptr) 5619 return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); 5620 5621 /* Forbid normal VMPTRLD if Enlightened version was used */ 5622 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 5623 return 1; 5624 5625 if (vmx->nested.current_vmptr != vmptr) { 5626 struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache; 5627 struct vmcs_hdr hdr; 5628 5629 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) { 5630 /* 5631 * Reads from an unbacked page return all 1s, 5632 * which means that the 32 bits located at the 5633 * given physical address won't match the required 5634 * VMCS12_REVISION identifier. 5635 */ 5636 return nested_vmx_fail(vcpu, 5637 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5638 } 5639 5640 if (kvm_read_guest_offset_cached(vcpu->kvm, ghc, &hdr, 5641 offsetof(struct vmcs12, hdr), 5642 sizeof(hdr))) { 5643 return nested_vmx_fail(vcpu, 5644 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5645 } 5646 5647 if (hdr.revision_id != VMCS12_REVISION || 5648 (hdr.shadow_vmcs && 5649 !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { 5650 return nested_vmx_fail(vcpu, 5651 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5652 } 5653 5654 nested_release_vmcs12(vcpu); 5655 5656 /* 5657 * Load VMCS12 from guest memory since it is not already 5658 * cached. 
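 *
 * Note that a read failure below is reported to L1 the same way as the
 * header checks above, i.e. as VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID,
 * consistent with the "reads from an unbacked page return all 1s"
 * reasoning behind the revision-ID check.
 *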
5659 */ 5660 if (kvm_read_guest_cached(vcpu->kvm, ghc, vmx->nested.cached_vmcs12, 5661 VMCS12_SIZE)) { 5662 return nested_vmx_fail(vcpu, 5663 VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); 5664 } 5665 5666 set_current_vmptr(vmx, vmptr); 5667 } 5668 5669 return nested_vmx_succeed(vcpu); 5670 } 5671 5672 /* Emulate the VMPTRST instruction */ 5673 static int handle_vmptrst(struct kvm_vcpu *vcpu) 5674 { 5675 unsigned long exit_qual = vmx_get_exit_qual(vcpu); 5676 u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5677 gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; 5678 struct x86_exception e; 5679 gva_t gva; 5680 int r; 5681 5682 if (!nested_vmx_check_permission(vcpu)) 5683 return 1; 5684 5685 if (unlikely(evmptr_is_valid(to_vmx(vcpu)->nested.hv_evmcs_vmptr))) 5686 return 1; 5687 5688 if (get_vmx_mem_address(vcpu, exit_qual, instr_info, 5689 true, sizeof(gpa_t), &gva)) 5690 return 1; 5691 /* *_system ok, nested_vmx_check_permission has verified cpl=0 */ 5692 r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr, 5693 sizeof(gpa_t), &e); 5694 if (r != X86EMUL_CONTINUE) 5695 return kvm_handle_memory_failure(vcpu, r, &e); 5696 5697 return nested_vmx_succeed(vcpu); 5698 } 5699 5700 /* Emulate the INVEPT instruction */ 5701 static int handle_invept(struct kvm_vcpu *vcpu) 5702 { 5703 struct vcpu_vmx *vmx = to_vmx(vcpu); 5704 u32 vmx_instruction_info, types; 5705 unsigned long type, roots_to_free; 5706 struct kvm_mmu *mmu; 5707 gva_t gva; 5708 struct x86_exception e; 5709 struct { 5710 u64 eptp, gpa; 5711 } operand; 5712 int i, r, gpr_index; 5713 5714 if (!(vmx->nested.msrs.secondary_ctls_high & 5715 SECONDARY_EXEC_ENABLE_EPT) || 5716 !(vmx->nested.msrs.ept_caps & VMX_EPT_INVEPT_BIT)) { 5717 kvm_queue_exception(vcpu, UD_VECTOR); 5718 return 1; 5719 } 5720 5721 if (!nested_vmx_check_permission(vcpu)) 5722 return 1; 5723 5724 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5725 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5726 type = kvm_register_read(vcpu, gpr_index); 5727 5728 types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; 5729 5730 if (type >= 32 || !(types & (1 << type))) 5731 return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5732 5733 /* According to the Intel VMX instruction reference, the memory 5734 * operand is read even if it isn't needed (e.g., for type==global) 5735 */ 5736 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5737 vmx_instruction_info, false, sizeof(operand), &gva)) 5738 return 1; 5739 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5740 if (r != X86EMUL_CONTINUE) 5741 return kvm_handle_memory_failure(vcpu, r, &e); 5742 5743 /* 5744 * Nested EPT roots are always held through guest_mmu, 5745 * not root_mmu. 
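 *
 * E.g. a single-context INVEPT below frees only the guest_mmu roots
 * whose cached EPTP matches operand.eptp (the current root plus any
 * matching prev_roots entries), while a global INVEPT frees them all
 * via KVM_MMU_ROOTS_ALL.
 *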
5746 */ 5747 mmu = &vcpu->arch.guest_mmu; 5748 5749 switch (type) { 5750 case VMX_EPT_EXTENT_CONTEXT: 5751 if (!nested_vmx_check_eptp(vcpu, operand.eptp)) 5752 return nested_vmx_fail(vcpu, 5753 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5754 5755 roots_to_free = 0; 5756 if (nested_ept_root_matches(mmu->root.hpa, mmu->root.pgd, 5757 operand.eptp)) 5758 roots_to_free |= KVM_MMU_ROOT_CURRENT; 5759 5760 for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { 5761 if (nested_ept_root_matches(mmu->prev_roots[i].hpa, 5762 mmu->prev_roots[i].pgd, 5763 operand.eptp)) 5764 roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); 5765 } 5766 break; 5767 case VMX_EPT_EXTENT_GLOBAL: 5768 roots_to_free = KVM_MMU_ROOTS_ALL; 5769 break; 5770 default: 5771 BUG(); 5772 break; 5773 } 5774 5775 if (roots_to_free) 5776 kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free); 5777 5778 return nested_vmx_succeed(vcpu); 5779 } 5780 5781 static int handle_invvpid(struct kvm_vcpu *vcpu) 5782 { 5783 struct vcpu_vmx *vmx = to_vmx(vcpu); 5784 u32 vmx_instruction_info; 5785 unsigned long type, types; 5786 gva_t gva; 5787 struct x86_exception e; 5788 struct { 5789 u64 vpid; 5790 u64 gla; 5791 } operand; 5792 u16 vpid02; 5793 int r, gpr_index; 5794 5795 if (!(vmx->nested.msrs.secondary_ctls_high & 5796 SECONDARY_EXEC_ENABLE_VPID) || 5797 !(vmx->nested.msrs.vpid_caps & VMX_VPID_INVVPID_BIT)) { 5798 kvm_queue_exception(vcpu, UD_VECTOR); 5799 return 1; 5800 } 5801 5802 if (!nested_vmx_check_permission(vcpu)) 5803 return 1; 5804 5805 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 5806 gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); 5807 type = kvm_register_read(vcpu, gpr_index); 5808 5809 types = (vmx->nested.msrs.vpid_caps & 5810 VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; 5811 5812 if (type >= 32 || !(types & (1 << type))) 5813 return nested_vmx_fail(vcpu, 5814 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5815 5816 /* according to the intel vmx instruction reference, the memory 5817 * operand is read even if it isn't needed (e.g., for type==global) 5818 */ 5819 if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), 5820 vmx_instruction_info, false, sizeof(operand), &gva)) 5821 return 1; 5822 r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e); 5823 if (r != X86EMUL_CONTINUE) 5824 return kvm_handle_memory_failure(vcpu, r, &e); 5825 5826 if (operand.vpid >> 16) 5827 return nested_vmx_fail(vcpu, 5828 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5829 5830 vpid02 = nested_get_vpid02(vcpu); 5831 switch (type) { 5832 case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: 5833 if (!operand.vpid || 5834 is_noncanonical_address(operand.gla, vcpu)) 5835 return nested_vmx_fail(vcpu, 5836 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5837 vpid_sync_vcpu_addr(vpid02, operand.gla); 5838 break; 5839 case VMX_VPID_EXTENT_SINGLE_CONTEXT: 5840 case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: 5841 if (!operand.vpid) 5842 return nested_vmx_fail(vcpu, 5843 VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); 5844 vpid_sync_context(vpid02); 5845 break; 5846 case VMX_VPID_EXTENT_ALL_CONTEXT: 5847 vpid_sync_context(vpid02); 5848 break; 5849 default: 5850 WARN_ON_ONCE(1); 5851 return kvm_skip_emulated_instruction(vcpu); 5852 } 5853 5854 /* 5855 * Sync the shadow page tables if EPT is disabled, L1 is invalidating 5856 * linear mappings for L2 (tagged with L2's VPID). Free all guest 5857 * roots as VPIDs are not tracked in the MMU role. 5858 * 5859 * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share 5860 * an MMU when EPT is disabled. 
5861 * 5862 * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR. 5863 */ 5864 if (!enable_ept) 5865 kvm_mmu_free_guest_mode_roots(vcpu->kvm, &vcpu->arch.root_mmu); 5866 5867 return nested_vmx_succeed(vcpu); 5868 } 5869 5870 static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, 5871 struct vmcs12 *vmcs12) 5872 { 5873 u32 index = kvm_rcx_read(vcpu); 5874 u64 new_eptp; 5875 5876 if (WARN_ON_ONCE(!nested_cpu_has_ept(vmcs12))) 5877 return 1; 5878 if (index >= VMFUNC_EPTP_ENTRIES) 5879 return 1; 5880 5881 if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, 5882 &new_eptp, index * 8, 8)) 5883 return 1; 5884 5885 /* 5886 * If the (L2) guest does a vmfunc to the currently 5887 * active ept pointer, we don't have to do anything else 5888 */ 5889 if (vmcs12->ept_pointer != new_eptp) { 5890 if (!nested_vmx_check_eptp(vcpu, new_eptp)) 5891 return 1; 5892 5893 vmcs12->ept_pointer = new_eptp; 5894 nested_ept_new_eptp(vcpu); 5895 5896 if (!nested_cpu_has_vpid(vmcs12)) 5897 kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); 5898 } 5899 5900 return 0; 5901 } 5902 5903 static int handle_vmfunc(struct kvm_vcpu *vcpu) 5904 { 5905 struct vcpu_vmx *vmx = to_vmx(vcpu); 5906 struct vmcs12 *vmcs12; 5907 u32 function = kvm_rax_read(vcpu); 5908 5909 /* 5910 * VMFUNC should never execute cleanly while L1 is active; KVM supports 5911 * VMFUNC for nested VMs, but not for L1. 5912 */ 5913 if (WARN_ON_ONCE(!is_guest_mode(vcpu))) { 5914 kvm_queue_exception(vcpu, UD_VECTOR); 5915 return 1; 5916 } 5917 5918 vmcs12 = get_vmcs12(vcpu); 5919 5920 /* 5921 * #UD on out-of-bounds function has priority over VM-Exit, and VMFUNC 5922 * is enabled in vmcs02 if and only if it's enabled in vmcs12. 5923 */ 5924 if (WARN_ON_ONCE((function > 63) || !nested_cpu_has_vmfunc(vmcs12))) { 5925 kvm_queue_exception(vcpu, UD_VECTOR); 5926 return 1; 5927 } 5928 5929 if (!(vmcs12->vm_function_control & BIT_ULL(function))) 5930 goto fail; 5931 5932 switch (function) { 5933 case 0: 5934 if (nested_vmx_eptp_switching(vcpu, vmcs12)) 5935 goto fail; 5936 break; 5937 default: 5938 goto fail; 5939 } 5940 return kvm_skip_emulated_instruction(vcpu); 5941 5942 fail: 5943 /* 5944 * This is effectively a reflected VM-Exit, as opposed to a synthesized 5945 * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode 5946 * EXIT_REASON_VMFUNC as the exit reason. 5947 */ 5948 nested_vmx_vmexit(vcpu, vmx->exit_reason.full, 5949 vmx_get_intr_info(vcpu), 5950 vmx_get_exit_qual(vcpu)); 5951 return 1; 5952 } 5953 5954 /* 5955 * Return true if an IO instruction with the specified port and size should cause 5956 * a VM-exit into L1. 
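 *
 * E.g. (illustrative port) a one-byte access to port 0x3f8 consults
 * io_bitmap_a (port < 0x8000) at byte offset 0x3f8 / 8 = 0x7f, bit
 * 0x3f8 & 7 = 0; the access is reflected to L1 iff that bit is set.
 *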
5957 */ 5958 bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, 5959 int size) 5960 { 5961 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 5962 gpa_t bitmap, last_bitmap; 5963 u8 b; 5964 5965 last_bitmap = INVALID_GPA; 5966 b = -1; 5967 5968 while (size > 0) { 5969 if (port < 0x8000) 5970 bitmap = vmcs12->io_bitmap_a; 5971 else if (port < 0x10000) 5972 bitmap = vmcs12->io_bitmap_b; 5973 else 5974 return true; 5975 bitmap += (port & 0x7fff) / 8; 5976 5977 if (last_bitmap != bitmap) 5978 if (kvm_vcpu_read_guest(vcpu, bitmap, &b, 1)) 5979 return true; 5980 if (b & (1 << (port & 7))) 5981 return true; 5982 5983 port++; 5984 size--; 5985 last_bitmap = bitmap; 5986 } 5987 5988 return false; 5989 } 5990 5991 static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, 5992 struct vmcs12 *vmcs12) 5993 { 5994 unsigned long exit_qualification; 5995 unsigned short port; 5996 int size; 5997 5998 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) 5999 return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); 6000 6001 exit_qualification = vmx_get_exit_qual(vcpu); 6002 6003 port = exit_qualification >> 16; 6004 size = (exit_qualification & 7) + 1; 6005 6006 return nested_vmx_check_io_bitmaps(vcpu, port, size); 6007 } 6008 6009 /* 6010 * Return 1 if we should exit from L2 to L1 to handle an MSR access, 6011 * rather than handle it ourselves in L0. I.e., check whether L1 expressed 6012 * disinterest in the current event (read or write a specific MSR) by using an 6013 * MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps. 6014 */ 6015 static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, 6016 struct vmcs12 *vmcs12, 6017 union vmx_exit_reason exit_reason) 6018 { 6019 u32 msr_index = kvm_rcx_read(vcpu); 6020 gpa_t bitmap; 6021 6022 if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) 6023 return true; 6024 6025 /* 6026 * The MSR_BITMAP page is divided into four 1024-byte bitmaps, 6027 * for the four combinations of read/write and low/high MSR numbers. 6028 * First we need to figure out which of the four to use: 6029 */ 6030 bitmap = vmcs12->msr_bitmap; 6031 if (exit_reason.basic == EXIT_REASON_MSR_WRITE) 6032 bitmap += 2048; 6033 if (msr_index >= 0xc0000000) { 6034 msr_index -= 0xc0000000; 6035 bitmap += 1024; 6036 } 6037 6038 /* Then read the msr_index'th bit from this bitmap: */ 6039 if (msr_index < 1024*8) { 6040 unsigned char b; 6041 if (kvm_vcpu_read_guest(vcpu, bitmap + msr_index/8, &b, 1)) 6042 return true; 6043 return 1 & (b >> (msr_index & 7)); 6044 } else 6045 return true; /* let L1 handle the wrong parameter */ 6046 } 6047 6048 /* 6049 * Return 1 if we should exit from L2 to L1 to handle a CR access exit, 6050 * rather than handle it ourselves in L0. I.e., check if L1 wanted to 6051 * intercept (via guest_host_mask etc.) the current event. 
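 *
 * E.g. for a "mov to cr0" below: the exit is reflected to L1 only if L2
 * writes a value that differs from vmcs12->cr0_read_shadow in at least
 * one bit L1 owns via vmcs12->cr0_guest_host_mask (say X86_CR0_TS);
 * bits left out of the mask are guest-owned and never trigger an exit
 * to L1 here.
 *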
6052 */ 6053 static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, 6054 struct vmcs12 *vmcs12) 6055 { 6056 unsigned long exit_qualification = vmx_get_exit_qual(vcpu); 6057 int cr = exit_qualification & 15; 6058 int reg; 6059 unsigned long val; 6060 6061 switch ((exit_qualification >> 4) & 3) { 6062 case 0: /* mov to cr */ 6063 reg = (exit_qualification >> 8) & 15; 6064 val = kvm_register_read(vcpu, reg); 6065 switch (cr) { 6066 case 0: 6067 if (vmcs12->cr0_guest_host_mask & 6068 (val ^ vmcs12->cr0_read_shadow)) 6069 return true; 6070 break; 6071 case 3: 6072 if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING)) 6073 return true; 6074 break; 6075 case 4: 6076 if (vmcs12->cr4_guest_host_mask & 6077 (vmcs12->cr4_read_shadow ^ val)) 6078 return true; 6079 break; 6080 case 8: 6081 if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING)) 6082 return true; 6083 break; 6084 } 6085 break; 6086 case 2: /* clts */ 6087 if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) && 6088 (vmcs12->cr0_read_shadow & X86_CR0_TS)) 6089 return true; 6090 break; 6091 case 1: /* mov from cr */ 6092 switch (cr) { 6093 case 3: 6094 if (vmcs12->cpu_based_vm_exec_control & 6095 CPU_BASED_CR3_STORE_EXITING) 6096 return true; 6097 break; 6098 case 8: 6099 if (vmcs12->cpu_based_vm_exec_control & 6100 CPU_BASED_CR8_STORE_EXITING) 6101 return true; 6102 break; 6103 } 6104 break; 6105 case 3: /* lmsw */ 6106 /* 6107 * lmsw can change bits 1..3 of cr0, and only set bit 0 of 6108 * cr0. Other attempted changes are ignored, with no exit. 6109 */ 6110 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; 6111 if (vmcs12->cr0_guest_host_mask & 0xe & 6112 (val ^ vmcs12->cr0_read_shadow)) 6113 return true; 6114 if ((vmcs12->cr0_guest_host_mask & 0x1) && 6115 !(vmcs12->cr0_read_shadow & 0x1) && 6116 (val & 0x1)) 6117 return true; 6118 break; 6119 } 6120 return false; 6121 } 6122 6123 static bool nested_vmx_exit_handled_encls(struct kvm_vcpu *vcpu, 6124 struct vmcs12 *vmcs12) 6125 { 6126 u32 encls_leaf; 6127 6128 if (!guest_cpuid_has(vcpu, X86_FEATURE_SGX) || 6129 !nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENCLS_EXITING)) 6130 return false; 6131 6132 encls_leaf = kvm_rax_read(vcpu); 6133 if (encls_leaf > 62) 6134 encls_leaf = 63; 6135 return vmcs12->encls_exiting_bitmap & BIT_ULL(encls_leaf); 6136 } 6137 6138 static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu, 6139 struct vmcs12 *vmcs12, gpa_t bitmap) 6140 { 6141 u32 vmx_instruction_info; 6142 unsigned long field; 6143 u8 b; 6144 6145 if (!nested_cpu_has_shadow_vmcs(vmcs12)) 6146 return true; 6147 6148 /* Decode instruction info and find the field to access */ 6149 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 6150 field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); 6151 6152 /* Out-of-range fields always cause a VM exit from L2 to L1 */ 6153 if (field >> 15) 6154 return true; 6155 6156 if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1)) 6157 return true; 6158 6159 return 1 & (b >> (field & 7)); 6160 } 6161 6162 static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12) 6163 { 6164 u32 entry_intr_info = vmcs12->vm_entry_intr_info_field; 6165 6166 if (nested_cpu_has_mtf(vmcs12)) 6167 return true; 6168 6169 /* 6170 * An MTF VM-exit may be injected into the guest by setting the 6171 * interruption-type to 7 (other event) and the vector field to 0. Such 6172 * is the case regardless of the 'monitor trap flag' VM-execution 6173 * control. 
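 *
 * Concretely, the value checked below is the interruption-info encoding
 * with the valid bit (31) set, type 7 (other event) in bits 10:8 and
 * vector 0 in bits 7:0, i.e. 0x80000700.
 *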
6174 */ 6175 return entry_intr_info == (INTR_INFO_VALID_MASK 6176 | INTR_TYPE_OTHER_EVENT); 6177 } 6178 6179 /* 6180 * Return true if L0 wants to handle an exit from L2 regardless of whether or not 6181 * L1 wants the exit. Only call this when in is_guest_mode (L2). 6182 */ 6183 static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, 6184 union vmx_exit_reason exit_reason) 6185 { 6186 u32 intr_info; 6187 6188 switch ((u16)exit_reason.basic) { 6189 case EXIT_REASON_EXCEPTION_NMI: 6190 intr_info = vmx_get_intr_info(vcpu); 6191 if (is_nmi(intr_info)) 6192 return true; 6193 else if (is_page_fault(intr_info)) 6194 return vcpu->arch.apf.host_apf_flags || 6195 vmx_need_pf_intercept(vcpu); 6196 else if (is_debug(intr_info) && 6197 vcpu->guest_debug & 6198 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 6199 return true; 6200 else if (is_breakpoint(intr_info) && 6201 vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 6202 return true; 6203 else if (is_alignment_check(intr_info) && 6204 !vmx_guest_inject_ac(vcpu)) 6205 return true; 6206 return false; 6207 case EXIT_REASON_EXTERNAL_INTERRUPT: 6208 return true; 6209 case EXIT_REASON_MCE_DURING_VMENTRY: 6210 return true; 6211 case EXIT_REASON_EPT_VIOLATION: 6212 /* 6213 * L0 always deals with the EPT violation. If nested EPT is 6214 * used, and the nested mmu code discovers that the address is 6215 * missing in the guest EPT table (EPT12), the EPT violation 6216 * will be injected with nested_ept_inject_page_fault() 6217 */ 6218 return true; 6219 case EXIT_REASON_EPT_MISCONFIG: 6220 /* 6221 * L2 never uses directly L1's EPT, but rather L0's own EPT 6222 * table (shadow on EPT) or a merged EPT table that L0 built 6223 * (EPT on EPT). So any problems with the structure of the 6224 * table is L0's fault. 6225 */ 6226 return true; 6227 case EXIT_REASON_PREEMPTION_TIMER: 6228 return true; 6229 case EXIT_REASON_PML_FULL: 6230 /* 6231 * PML is emulated for an L1 VMM and should never be enabled in 6232 * vmcs02, always "handle" PML_FULL by exiting to userspace. 6233 */ 6234 return true; 6235 case EXIT_REASON_VMFUNC: 6236 /* VM functions are emulated through L2->L0 vmexits. */ 6237 return true; 6238 case EXIT_REASON_BUS_LOCK: 6239 /* 6240 * At present, bus lock VM exit is never exposed to L1. 6241 * Handle L2's bus locks in L0 directly. 6242 */ 6243 return true; 6244 case EXIT_REASON_VMCALL: 6245 /* Hyper-V L2 TLB flush hypercall is handled by L0 */ 6246 return guest_hv_cpuid_has_l2_tlb_flush(vcpu) && 6247 nested_evmcs_l2_tlb_flush_enabled(vcpu) && 6248 kvm_hv_is_tlb_flush_hcall(vcpu); 6249 default: 6250 break; 6251 } 6252 return false; 6253 } 6254 6255 /* 6256 * Return 1 if L1 wants to intercept an exit from L2. Only call this when in 6257 * is_guest_mode (L2). 
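 *
 * As a usage sketch: nested_vmx_reflect_vmexit() below consults this
 * function only after nested_vmx_l0_wants_exit() has declined the exit,
 * so a true return here means the exit really is forwarded to L1.
 *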
6258 */ 6259 static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, 6260 union vmx_exit_reason exit_reason) 6261 { 6262 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6263 u32 intr_info; 6264 6265 switch ((u16)exit_reason.basic) { 6266 case EXIT_REASON_EXCEPTION_NMI: 6267 intr_info = vmx_get_intr_info(vcpu); 6268 if (is_nmi(intr_info)) 6269 return true; 6270 else if (is_page_fault(intr_info)) 6271 return true; 6272 return vmcs12->exception_bitmap & 6273 (1u << (intr_info & INTR_INFO_VECTOR_MASK)); 6274 case EXIT_REASON_EXTERNAL_INTERRUPT: 6275 return nested_exit_on_intr(vcpu); 6276 case EXIT_REASON_TRIPLE_FAULT: 6277 return true; 6278 case EXIT_REASON_INTERRUPT_WINDOW: 6279 return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING); 6280 case EXIT_REASON_NMI_WINDOW: 6281 return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING); 6282 case EXIT_REASON_TASK_SWITCH: 6283 return true; 6284 case EXIT_REASON_CPUID: 6285 return true; 6286 case EXIT_REASON_HLT: 6287 return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING); 6288 case EXIT_REASON_INVD: 6289 return true; 6290 case EXIT_REASON_INVLPG: 6291 return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6292 case EXIT_REASON_RDPMC: 6293 return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); 6294 case EXIT_REASON_RDRAND: 6295 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING); 6296 case EXIT_REASON_RDSEED: 6297 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING); 6298 case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: 6299 return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); 6300 case EXIT_REASON_VMREAD: 6301 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6302 vmcs12->vmread_bitmap); 6303 case EXIT_REASON_VMWRITE: 6304 return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12, 6305 vmcs12->vmwrite_bitmap); 6306 case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: 6307 case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD: 6308 case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME: 6309 case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: 6310 case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: 6311 /* 6312 * VMX instructions trap unconditionally. This allows L1 to 6313 * emulate them for its L2 guest, i.e., allows 3-level nesting! 
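 *
 * E.g. an L2 VMLAUNCH is reflected to L1, which may then emulate a
 * nested VM-Entry of its own in order to run an L3 guest.
 *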
6314 */ 6315 return true; 6316 case EXIT_REASON_CR_ACCESS: 6317 return nested_vmx_exit_handled_cr(vcpu, vmcs12); 6318 case EXIT_REASON_DR_ACCESS: 6319 return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); 6320 case EXIT_REASON_IO_INSTRUCTION: 6321 return nested_vmx_exit_handled_io(vcpu, vmcs12); 6322 case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: 6323 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); 6324 case EXIT_REASON_MSR_READ: 6325 case EXIT_REASON_MSR_WRITE: 6326 return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); 6327 case EXIT_REASON_INVALID_STATE: 6328 return true; 6329 case EXIT_REASON_MWAIT_INSTRUCTION: 6330 return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); 6331 case EXIT_REASON_MONITOR_TRAP_FLAG: 6332 return nested_vmx_exit_handled_mtf(vmcs12); 6333 case EXIT_REASON_MONITOR_INSTRUCTION: 6334 return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); 6335 case EXIT_REASON_PAUSE_INSTRUCTION: 6336 return nested_cpu_has(vmcs12, CPU_BASED_PAUSE_EXITING) || 6337 nested_cpu_has2(vmcs12, 6338 SECONDARY_EXEC_PAUSE_LOOP_EXITING); 6339 case EXIT_REASON_MCE_DURING_VMENTRY: 6340 return true; 6341 case EXIT_REASON_TPR_BELOW_THRESHOLD: 6342 return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW); 6343 case EXIT_REASON_APIC_ACCESS: 6344 case EXIT_REASON_APIC_WRITE: 6345 case EXIT_REASON_EOI_INDUCED: 6346 /* 6347 * The controls for "virtualize APIC accesses," "APIC- 6348 * register virtualization," and "virtual-interrupt 6349 * delivery" only come from vmcs12. 6350 */ 6351 return true; 6352 case EXIT_REASON_INVPCID: 6353 return 6354 nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) && 6355 nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); 6356 case EXIT_REASON_WBINVD: 6357 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING); 6358 case EXIT_REASON_XSETBV: 6359 return true; 6360 case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS: 6361 /* 6362 * This should never happen, since it is not possible to 6363 * set XSS to a non-zero value---neither in L1 nor in L2. 6364 * If if it were, XSS would have to be checked against 6365 * the XSS exit bitmap in vmcs12. 6366 */ 6367 return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_XSAVES); 6368 case EXIT_REASON_UMWAIT: 6369 case EXIT_REASON_TPAUSE: 6370 return nested_cpu_has2(vmcs12, 6371 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE); 6372 case EXIT_REASON_ENCLS: 6373 return nested_vmx_exit_handled_encls(vcpu, vmcs12); 6374 case EXIT_REASON_NOTIFY: 6375 /* Notify VM exit is not exposed to L1 */ 6376 return false; 6377 default: 6378 return true; 6379 } 6380 } 6381 6382 /* 6383 * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was 6384 * reflected into L1. 6385 */ 6386 bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) 6387 { 6388 struct vcpu_vmx *vmx = to_vmx(vcpu); 6389 union vmx_exit_reason exit_reason = vmx->exit_reason; 6390 unsigned long exit_qual; 6391 u32 exit_intr_info; 6392 6393 WARN_ON_ONCE(vmx->nested.nested_run_pending); 6394 6395 /* 6396 * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM 6397 * has already loaded L2's state. 6398 */ 6399 if (unlikely(vmx->fail)) { 6400 trace_kvm_nested_vmenter_failed( 6401 "hardware VM-instruction error: ", 6402 vmcs_read32(VM_INSTRUCTION_ERROR)); 6403 exit_intr_info = 0; 6404 exit_qual = 0; 6405 goto reflect_vmexit; 6406 } 6407 6408 trace_kvm_nested_vmexit(vcpu, KVM_ISA_VMX); 6409 6410 /* If L0 (KVM) wants the exit, it trumps L1's desires. 
*/ 6411 if (nested_vmx_l0_wants_exit(vcpu, exit_reason)) 6412 return false; 6413 6414 /* If L1 doesn't want the exit, handle it in L0. */ 6415 if (!nested_vmx_l1_wants_exit(vcpu, exit_reason)) 6416 return false; 6417 6418 /* 6419 * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For 6420 * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would 6421 * need to be synthesized by querying the in-kernel LAPIC, but external 6422 * interrupts are never reflected to L1 so it's a non-issue. 6423 */ 6424 exit_intr_info = vmx_get_intr_info(vcpu); 6425 if (is_exception_with_error_code(exit_intr_info)) { 6426 struct vmcs12 *vmcs12 = get_vmcs12(vcpu); 6427 6428 vmcs12->vm_exit_intr_error_code = 6429 vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 6430 } 6431 exit_qual = vmx_get_exit_qual(vcpu); 6432 6433 reflect_vmexit: 6434 nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual); 6435 return true; 6436 } 6437 6438 static int vmx_get_nested_state(struct kvm_vcpu *vcpu, 6439 struct kvm_nested_state __user *user_kvm_nested_state, 6440 u32 user_data_size) 6441 { 6442 struct vcpu_vmx *vmx; 6443 struct vmcs12 *vmcs12; 6444 struct kvm_nested_state kvm_state = { 6445 .flags = 0, 6446 .format = KVM_STATE_NESTED_FORMAT_VMX, 6447 .size = sizeof(kvm_state), 6448 .hdr.vmx.flags = 0, 6449 .hdr.vmx.vmxon_pa = INVALID_GPA, 6450 .hdr.vmx.vmcs12_pa = INVALID_GPA, 6451 .hdr.vmx.preemption_timer_deadline = 0, 6452 }; 6453 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6454 &user_kvm_nested_state->data.vmx[0]; 6455 6456 if (!vcpu) 6457 return kvm_state.size + sizeof(*user_vmx_nested_state); 6458 6459 vmx = to_vmx(vcpu); 6460 vmcs12 = get_vmcs12(vcpu); 6461 6462 if (guest_can_use(vcpu, X86_FEATURE_VMX) && 6463 (vmx->nested.vmxon || vmx->nested.smm.vmxon)) { 6464 kvm_state.hdr.vmx.vmxon_pa = vmx->nested.vmxon_ptr; 6465 kvm_state.hdr.vmx.vmcs12_pa = vmx->nested.current_vmptr; 6466 6467 if (vmx_has_valid_vmcs12(vcpu)) { 6468 kvm_state.size += sizeof(user_vmx_nested_state->vmcs12); 6469 6470 /* 'hv_evmcs_vmptr' can also be EVMPTR_MAP_PENDING here */ 6471 if (vmx->nested.hv_evmcs_vmptr != EVMPTR_INVALID) 6472 kvm_state.flags |= KVM_STATE_NESTED_EVMCS; 6473 6474 if (is_guest_mode(vcpu) && 6475 nested_cpu_has_shadow_vmcs(vmcs12) && 6476 vmcs12->vmcs_link_pointer != INVALID_GPA) 6477 kvm_state.size += sizeof(user_vmx_nested_state->shadow_vmcs12); 6478 } 6479 6480 if (vmx->nested.smm.vmxon) 6481 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON; 6482 6483 if (vmx->nested.smm.guest_mode) 6484 kvm_state.hdr.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE; 6485 6486 if (is_guest_mode(vcpu)) { 6487 kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; 6488 6489 if (vmx->nested.nested_run_pending) 6490 kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; 6491 6492 if (vmx->nested.mtf_pending) 6493 kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; 6494 6495 if (nested_cpu_has_preemption_timer(vmcs12) && 6496 vmx->nested.has_preemption_timer_deadline) { 6497 kvm_state.hdr.vmx.flags |= 6498 KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; 6499 kvm_state.hdr.vmx.preemption_timer_deadline = 6500 vmx->nested.preemption_timer_deadline; 6501 } 6502 } 6503 } 6504 6505 if (user_data_size < kvm_state.size) 6506 goto out; 6507 6508 if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) 6509 return -EFAULT; 6510 6511 if (!vmx_has_valid_vmcs12(vcpu)) 6512 goto out; 6513 6514 /* 6515 * When running L2, the authoritative vmcs12 state is in the 6516 * vmcs02. 
When running L1, the authoritative vmcs12 state is 6517 * in the shadow or enlightened vmcs linked to vmcs01, unless 6518 * need_vmcs12_to_shadow_sync is set, in which case, the authoritative 6519 * vmcs12 state is in the vmcs12 already. 6520 */ 6521 if (is_guest_mode(vcpu)) { 6522 sync_vmcs02_to_vmcs12(vcpu, vmcs12); 6523 sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12); 6524 } else { 6525 copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu)); 6526 if (!vmx->nested.need_vmcs12_to_shadow_sync) { 6527 if (evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) 6528 /* 6529 * L1 hypervisor is not obliged to keep eVMCS 6530 * clean fields data always up-to-date while 6531 * not in guest mode, 'hv_clean_fields' is only 6532 * supposed to be actual upon vmentry so we need 6533 * to ignore it here and do full copy. 6534 */ 6535 copy_enlightened_to_vmcs12(vmx, 0); 6536 else if (enable_shadow_vmcs) 6537 copy_shadow_to_vmcs12(vmx); 6538 } 6539 } 6540 6541 BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE); 6542 BUILD_BUG_ON(sizeof(user_vmx_nested_state->shadow_vmcs12) < VMCS12_SIZE); 6543 6544 /* 6545 * Copy over the full allocated size of vmcs12 rather than just the size 6546 * of the struct. 6547 */ 6548 if (copy_to_user(user_vmx_nested_state->vmcs12, vmcs12, VMCS12_SIZE)) 6549 return -EFAULT; 6550 6551 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6552 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6553 if (copy_to_user(user_vmx_nested_state->shadow_vmcs12, 6554 get_shadow_vmcs12(vcpu), VMCS12_SIZE)) 6555 return -EFAULT; 6556 } 6557 out: 6558 return kvm_state.size; 6559 } 6560 6561 void vmx_leave_nested(struct kvm_vcpu *vcpu) 6562 { 6563 if (is_guest_mode(vcpu)) { 6564 to_vmx(vcpu)->nested.nested_run_pending = 0; 6565 nested_vmx_vmexit(vcpu, -1, 0, 0); 6566 } 6567 free_nested(vcpu); 6568 } 6569 6570 static int vmx_set_nested_state(struct kvm_vcpu *vcpu, 6571 struct kvm_nested_state __user *user_kvm_nested_state, 6572 struct kvm_nested_state *kvm_state) 6573 { 6574 struct vcpu_vmx *vmx = to_vmx(vcpu); 6575 struct vmcs12 *vmcs12; 6576 enum vm_entry_failure_code ignored; 6577 struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = 6578 &user_kvm_nested_state->data.vmx[0]; 6579 int ret; 6580 6581 if (kvm_state->format != KVM_STATE_NESTED_FORMAT_VMX) 6582 return -EINVAL; 6583 6584 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) { 6585 if (kvm_state->hdr.vmx.smm.flags) 6586 return -EINVAL; 6587 6588 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) 6589 return -EINVAL; 6590 6591 /* 6592 * KVM_STATE_NESTED_EVMCS used to signal that KVM should 6593 * enable eVMCS capability on vCPU. However, since then 6594 * code was changed such that flag signals vmcs12 should 6595 * be copied into eVMCS in guest memory. 6596 * 6597 * To preserve backwards compatability, allow user 6598 * to set this flag even when there is no VMXON region. 
6599 */ 6600 if (kvm_state->flags & ~KVM_STATE_NESTED_EVMCS) 6601 return -EINVAL; 6602 } else { 6603 if (!guest_can_use(vcpu, X86_FEATURE_VMX)) 6604 return -EINVAL; 6605 6606 if (!page_address_valid(vcpu, kvm_state->hdr.vmx.vmxon_pa)) 6607 return -EINVAL; 6608 } 6609 6610 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6611 (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6612 return -EINVAL; 6613 6614 if (kvm_state->hdr.vmx.smm.flags & 6615 ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON)) 6616 return -EINVAL; 6617 6618 if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) 6619 return -EINVAL; 6620 6621 /* 6622 * SMM temporarily disables VMX, so we cannot be in guest mode, 6623 * nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags 6624 * must be zero. 6625 */ 6626 if (is_smm(vcpu) ? 6627 (kvm_state->flags & 6628 (KVM_STATE_NESTED_GUEST_MODE | KVM_STATE_NESTED_RUN_PENDING)) 6629 : kvm_state->hdr.vmx.smm.flags) 6630 return -EINVAL; 6631 6632 if ((kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) && 6633 !(kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON)) 6634 return -EINVAL; 6635 6636 if ((kvm_state->flags & KVM_STATE_NESTED_EVMCS) && 6637 (!guest_can_use(vcpu, X86_FEATURE_VMX) || 6638 !vmx->nested.enlightened_vmcs_enabled)) 6639 return -EINVAL; 6640 6641 vmx_leave_nested(vcpu); 6642 6643 if (kvm_state->hdr.vmx.vmxon_pa == INVALID_GPA) 6644 return 0; 6645 6646 vmx->nested.vmxon_ptr = kvm_state->hdr.vmx.vmxon_pa; 6647 ret = enter_vmx_operation(vcpu); 6648 if (ret) 6649 return ret; 6650 6651 /* Empty 'VMXON' state is permitted if no VMCS loaded */ 6652 if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) { 6653 /* See vmx_has_valid_vmcs12. */ 6654 if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) || 6655 (kvm_state->flags & KVM_STATE_NESTED_EVMCS) || 6656 (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA)) 6657 return -EINVAL; 6658 else 6659 return 0; 6660 } 6661 6662 if (kvm_state->hdr.vmx.vmcs12_pa != INVALID_GPA) { 6663 if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa || 6664 !page_address_valid(vcpu, kvm_state->hdr.vmx.vmcs12_pa)) 6665 return -EINVAL; 6666 6667 set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); 6668 } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { 6669 /* 6670 * nested_vmx_handle_enlightened_vmptrld() cannot be called 6671 * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be 6672 * restored yet. EVMCS will be mapped from 6673 * nested_get_vmcs12_pages(). 
6674 */ 6675 vmx->nested.hv_evmcs_vmptr = EVMPTR_MAP_PENDING; 6676 kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); 6677 } else { 6678 return -EINVAL; 6679 } 6680 6681 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) { 6682 vmx->nested.smm.vmxon = true; 6683 vmx->nested.vmxon = false; 6684 6685 if (kvm_state->hdr.vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) 6686 vmx->nested.smm.guest_mode = true; 6687 } 6688 6689 vmcs12 = get_vmcs12(vcpu); 6690 if (copy_from_user(vmcs12, user_vmx_nested_state->vmcs12, sizeof(*vmcs12))) 6691 return -EFAULT; 6692 6693 if (vmcs12->hdr.revision_id != VMCS12_REVISION) 6694 return -EINVAL; 6695 6696 if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) 6697 return 0; 6698 6699 vmx->nested.nested_run_pending = 6700 !!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING); 6701 6702 vmx->nested.mtf_pending = 6703 !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING); 6704 6705 ret = -EINVAL; 6706 if (nested_cpu_has_shadow_vmcs(vmcs12) && 6707 vmcs12->vmcs_link_pointer != INVALID_GPA) { 6708 struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); 6709 6710 if (kvm_state->size < 6711 sizeof(*kvm_state) + 6712 sizeof(user_vmx_nested_state->vmcs12) + sizeof(*shadow_vmcs12)) 6713 goto error_guest_mode; 6714 6715 if (copy_from_user(shadow_vmcs12, 6716 user_vmx_nested_state->shadow_vmcs12, 6717 sizeof(*shadow_vmcs12))) { 6718 ret = -EFAULT; 6719 goto error_guest_mode; 6720 } 6721 6722 if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION || 6723 !shadow_vmcs12->hdr.shadow_vmcs) 6724 goto error_guest_mode; 6725 } 6726 6727 vmx->nested.has_preemption_timer_deadline = false; 6728 if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { 6729 vmx->nested.has_preemption_timer_deadline = true; 6730 vmx->nested.preemption_timer_deadline = 6731 kvm_state->hdr.vmx.preemption_timer_deadline; 6732 } 6733 6734 if (nested_vmx_check_controls(vcpu, vmcs12) || 6735 nested_vmx_check_host_state(vcpu, vmcs12) || 6736 nested_vmx_check_guest_state(vcpu, vmcs12, &ignored)) 6737 goto error_guest_mode; 6738 6739 vmx->nested.dirty_vmcs12 = true; 6740 vmx->nested.force_msr_bitmap_recalc = true; 6741 ret = nested_vmx_enter_non_root_mode(vcpu, false); 6742 if (ret) 6743 goto error_guest_mode; 6744 6745 if (vmx->nested.mtf_pending) 6746 kvm_make_request(KVM_REQ_EVENT, vcpu); 6747 6748 return 0; 6749 6750 error_guest_mode: 6751 vmx->nested.nested_run_pending = 0; 6752 return ret; 6753 } 6754 6755 void nested_vmx_set_vmcs_shadowing_bitmap(void) 6756 { 6757 if (enable_shadow_vmcs) { 6758 vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); 6759 vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); 6760 } 6761 } 6762 6763 /* 6764 * Indexing into the vmcs12 uses the VMCS encoding rotated left by 6. Undo 6765 * that madness to get the encoding for comparison. 6766 */ 6767 #define VMCS12_IDX_TO_ENC(idx) ((u16)(((u16)(idx) >> 6) | ((u16)(idx) << 10))) 6768 6769 static u64 nested_vmx_calc_vmcs_enum_msr(void) 6770 { 6771 /* 6772 * Note these are the so called "index" of the VMCS field encoding, not 6773 * the index into vmcs12. 6774 */ 6775 unsigned int max_idx, idx; 6776 int i; 6777 6778 /* 6779 * For better or worse, KVM allows VMREAD/VMWRITE to all fields in 6780 * vmcs12, regardless of whether or not the associated feature is 6781 * exposed to L1. Simply find the field with the highest index. 6782 */ 6783 max_idx = 0; 6784 for (i = 0; i < nr_vmcs12_fields; i++) { 6785 /* The vmcs12 table is very, very sparsely populated. 
*/ 6786 if (!vmcs12_field_offsets[i]) 6787 continue; 6788 6789 idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i)); 6790 if (idx > max_idx) 6791 max_idx = idx; 6792 } 6793 6794 return (u64)max_idx << VMCS_FIELD_INDEX_SHIFT; 6795 } 6796 6797 static void nested_vmx_setup_pinbased_ctls(struct vmcs_config *vmcs_conf, 6798 struct nested_vmx_msrs *msrs) 6799 { 6800 msrs->pinbased_ctls_low = 6801 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 6802 6803 msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl; 6804 msrs->pinbased_ctls_high &= 6805 PIN_BASED_EXT_INTR_MASK | 6806 PIN_BASED_NMI_EXITING | 6807 PIN_BASED_VIRTUAL_NMIS | 6808 (enable_apicv ? PIN_BASED_POSTED_INTR : 0); 6809 msrs->pinbased_ctls_high |= 6810 PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 6811 PIN_BASED_VMX_PREEMPTION_TIMER; 6812 } 6813 6814 static void nested_vmx_setup_exit_ctls(struct vmcs_config *vmcs_conf, 6815 struct nested_vmx_msrs *msrs) 6816 { 6817 msrs->exit_ctls_low = 6818 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; 6819 6820 msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl; 6821 msrs->exit_ctls_high &= 6822 #ifdef CONFIG_X86_64 6823 VM_EXIT_HOST_ADDR_SPACE_SIZE | 6824 #endif 6825 VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT | 6826 VM_EXIT_CLEAR_BNDCFGS; 6827 msrs->exit_ctls_high |= 6828 VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR | 6829 VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER | 6830 VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT | 6831 VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL; 6832 6833 /* We support free control of debug control saving. */ 6834 msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; 6835 } 6836 6837 static void nested_vmx_setup_entry_ctls(struct vmcs_config *vmcs_conf, 6838 struct nested_vmx_msrs *msrs) 6839 { 6840 msrs->entry_ctls_low = 6841 VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; 6842 6843 msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl; 6844 msrs->entry_ctls_high &= 6845 #ifdef CONFIG_X86_64 6846 VM_ENTRY_IA32E_MODE | 6847 #endif 6848 VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS; 6849 msrs->entry_ctls_high |= 6850 (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER | 6851 VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL); 6852 6853 /* We support free control of debug control loading. */ 6854 msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; 6855 } 6856 6857 static void nested_vmx_setup_cpubased_ctls(struct vmcs_config *vmcs_conf, 6858 struct nested_vmx_msrs *msrs) 6859 { 6860 msrs->procbased_ctls_low = 6861 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; 6862 6863 msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl; 6864 msrs->procbased_ctls_high &= 6865 CPU_BASED_INTR_WINDOW_EXITING | 6866 CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING | 6867 CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING | 6868 CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING | 6869 CPU_BASED_CR3_STORE_EXITING | 6870 #ifdef CONFIG_X86_64 6871 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | 6872 #endif 6873 CPU_BASED_MOV_DR_EXITING | CPU_BASED_UNCOND_IO_EXITING | 6874 CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MONITOR_TRAP_FLAG | 6875 CPU_BASED_MONITOR_EXITING | CPU_BASED_RDPMC_EXITING | 6876 CPU_BASED_RDTSC_EXITING | CPU_BASED_PAUSE_EXITING | 6877 CPU_BASED_TPR_SHADOW | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 6878 /* 6879 * We can allow some features even when not supported by the 6880 * hardware. For example, L1 can specify an MSR bitmap - and we 6881 * can use it to avoid exits to L1 - even when L0 runs L2 6882 * without MSR bitmaps. 
6883 */ 6884 msrs->procbased_ctls_high |= 6885 CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | 6886 CPU_BASED_USE_MSR_BITMAPS; 6887 6888 /* We support free control of CR3 access interception. */ 6889 msrs->procbased_ctls_low &= 6890 ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); 6891 } 6892 6893 static void nested_vmx_setup_secondary_ctls(u32 ept_caps, 6894 struct vmcs_config *vmcs_conf, 6895 struct nested_vmx_msrs *msrs) 6896 { 6897 msrs->secondary_ctls_low = 0; 6898 6899 msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl; 6900 msrs->secondary_ctls_high &= 6901 SECONDARY_EXEC_DESC | 6902 SECONDARY_EXEC_ENABLE_RDTSCP | 6903 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | 6904 SECONDARY_EXEC_WBINVD_EXITING | 6905 SECONDARY_EXEC_APIC_REGISTER_VIRT | 6906 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | 6907 SECONDARY_EXEC_RDRAND_EXITING | 6908 SECONDARY_EXEC_ENABLE_INVPCID | 6909 SECONDARY_EXEC_ENABLE_VMFUNC | 6910 SECONDARY_EXEC_RDSEED_EXITING | 6911 SECONDARY_EXEC_ENABLE_XSAVES | 6912 SECONDARY_EXEC_TSC_SCALING | 6913 SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; 6914 6915 /* 6916 * We can emulate "VMCS shadowing," even if the hardware 6917 * doesn't support it. 6918 */ 6919 msrs->secondary_ctls_high |= 6920 SECONDARY_EXEC_SHADOW_VMCS; 6921 6922 if (enable_ept) { 6923 /* nested EPT: emulate EPT also to L1 */ 6924 msrs->secondary_ctls_high |= 6925 SECONDARY_EXEC_ENABLE_EPT; 6926 msrs->ept_caps = 6927 VMX_EPT_PAGE_WALK_4_BIT | 6928 VMX_EPT_PAGE_WALK_5_BIT | 6929 VMX_EPTP_WB_BIT | 6930 VMX_EPT_INVEPT_BIT | 6931 VMX_EPT_EXECUTE_ONLY_BIT; 6932 6933 msrs->ept_caps &= ept_caps; 6934 msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | 6935 VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | 6936 VMX_EPT_1GB_PAGE_BIT; 6937 if (enable_ept_ad_bits) { 6938 msrs->secondary_ctls_high |= 6939 SECONDARY_EXEC_ENABLE_PML; 6940 msrs->ept_caps |= VMX_EPT_AD_BIT; 6941 } 6942 6943 /* 6944 * Advertise EPTP switching irrespective of hardware support, 6945 * KVM emulates it in software so long as VMFUNC is supported. 6946 */ 6947 if (cpu_has_vmx_vmfunc()) 6948 msrs->vmfunc_controls = VMX_VMFUNC_EPTP_SWITCHING; 6949 } 6950 6951 /* 6952 * Old versions of KVM use the single-context version without 6953 * checking for support, so declare that it is supported even 6954 * though it is treated as global context. The alternative is 6955 * not failing the single-context invvpid, and it is worse. 6956 */ 6957 if (enable_vpid) { 6958 msrs->secondary_ctls_high |= 6959 SECONDARY_EXEC_ENABLE_VPID; 6960 msrs->vpid_caps = VMX_VPID_INVVPID_BIT | 6961 VMX_VPID_EXTENT_SUPPORTED_MASK; 6962 } 6963 6964 if (enable_unrestricted_guest) 6965 msrs->secondary_ctls_high |= 6966 SECONDARY_EXEC_UNRESTRICTED_GUEST; 6967 6968 if (flexpriority_enabled) 6969 msrs->secondary_ctls_high |= 6970 SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 6971 6972 if (enable_sgx) 6973 msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING; 6974 } 6975 6976 static void nested_vmx_setup_misc_data(struct vmcs_config *vmcs_conf, 6977 struct nested_vmx_msrs *msrs) 6978 { 6979 msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA; 6980 msrs->misc_low |= 6981 MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS | 6982 VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | 6983 VMX_MISC_ACTIVITY_HLT | 6984 VMX_MISC_ACTIVITY_WAIT_SIPI; 6985 msrs->misc_high = 0; 6986 } 6987 6988 static void nested_vmx_setup_basic(struct nested_vmx_msrs *msrs) 6989 { 6990 /* 6991 * This MSR reports some information about VMX support. 
We
6992 * should return information about the VMX we emulate for the
6993 * guest, and the VMCS structure we give it - not about the
6994 * VMX support of the underlying hardware.
6995 */
6996 msrs->basic =
6997 VMCS12_REVISION |
6998 VMX_BASIC_TRUE_CTLS |
6999 ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) |
7000 (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT);
7001
7002 if (cpu_has_vmx_basic_inout())
7003 msrs->basic |= VMX_BASIC_INOUT;
7004 }
7005
7006 static void nested_vmx_setup_cr_fixed(struct nested_vmx_msrs *msrs)
7007 {
7008 /*
7009 * These MSRs specify bits which the guest must keep fixed on
7010 * while L1 is in VMXON mode (in L1's root mode, or running an L2).
7011 * We picked the standard core2 setting.
7012 */
7013 #define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE)
7014 #define VMXON_CR4_ALWAYSON X86_CR4_VMXE
7015 msrs->cr0_fixed0 = VMXON_CR0_ALWAYSON;
7016 msrs->cr4_fixed0 = VMXON_CR4_ALWAYSON;
7017
7018 /* These MSRs specify bits which the guest must keep fixed off. */
7019 rdmsrl(MSR_IA32_VMX_CR0_FIXED1, msrs->cr0_fixed1);
7020 rdmsrl(MSR_IA32_VMX_CR4_FIXED1, msrs->cr4_fixed1);
7021
7022 if (vmx_umip_emulated())
7023 msrs->cr4_fixed1 |= X86_CR4_UMIP;
7024 }
7025
7026 /*
7027 * nested_vmx_setup_ctls_msrs() sets up variables containing the values to be
7028 * returned for the various VMX controls MSRs when nested VMX is enabled.
7029 * The same values should also be used to verify that vmcs12 control fields are
7030 * valid during nested entry from L1 to L2.
7031 * Each of these control msrs has a low and high 32-bit half: A low bit is on
7032 * if the corresponding bit in the (32-bit) control field *must* be on, and a
7033 * bit in the high half is on if the corresponding bit in the control field
7034 * may be on. See also vmx_control_verify().
7035 */
7036 void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
7037 {
7038 struct nested_vmx_msrs *msrs = &vmcs_conf->nested;
7039
7040 /*
7041 * Note that as a general rule, the high half of the MSRs (bits in
7042 * the control fields which may be 1) should be initialized by the
7043 * intersection of the underlying hardware's MSR (i.e., features which
7044 * can be supported) and the list of features we want to expose -
7045 * because they are known to be properly supported in our code.
7046 * Also, usually, the low half of the MSRs (bits which must be 1) can
7047 * be set to 0, meaning that L1 may turn off any of these bits. The
7048 * reason is that if one of these bits is necessary, it will appear
7049 * in vmcs01, and prepare_vmcs02, which bitwise-or's the control
7050 * fields of vmcs01 and vmcs12, will keep it set in vmcs02 - and
7051 * nested_vmx_l1_wants_exit() will not pass the related exits to L1.
7052 * These rules have exceptions below.
7053 */ 7054 nested_vmx_setup_pinbased_ctls(vmcs_conf, msrs); 7055 7056 nested_vmx_setup_exit_ctls(vmcs_conf, msrs); 7057 7058 nested_vmx_setup_entry_ctls(vmcs_conf, msrs); 7059 7060 nested_vmx_setup_cpubased_ctls(vmcs_conf, msrs); 7061 7062 nested_vmx_setup_secondary_ctls(ept_caps, vmcs_conf, msrs); 7063 7064 nested_vmx_setup_misc_data(vmcs_conf, msrs); 7065 7066 nested_vmx_setup_basic(msrs); 7067 7068 nested_vmx_setup_cr_fixed(msrs); 7069 7070 msrs->vmcs_enum = nested_vmx_calc_vmcs_enum_msr(); 7071 } 7072 7073 void nested_vmx_hardware_unsetup(void) 7074 { 7075 int i; 7076 7077 if (enable_shadow_vmcs) { 7078 for (i = 0; i < VMX_BITMAP_NR; i++) 7079 free_page((unsigned long)vmx_bitmap[i]); 7080 } 7081 } 7082 7083 __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) 7084 { 7085 int i; 7086 7087 if (!cpu_has_vmx_shadow_vmcs()) 7088 enable_shadow_vmcs = 0; 7089 if (enable_shadow_vmcs) { 7090 for (i = 0; i < VMX_BITMAP_NR; i++) { 7091 /* 7092 * The vmx_bitmap is not tied to a VM and so should 7093 * not be charged to a memcg. 7094 */ 7095 vmx_bitmap[i] = (unsigned long *) 7096 __get_free_page(GFP_KERNEL); 7097 if (!vmx_bitmap[i]) { 7098 nested_vmx_hardware_unsetup(); 7099 return -ENOMEM; 7100 } 7101 } 7102 7103 init_vmcs_shadow_fields(); 7104 } 7105 7106 exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear; 7107 exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch; 7108 exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld; 7109 exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst; 7110 exit_handlers[EXIT_REASON_VMREAD] = handle_vmread; 7111 exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume; 7112 exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite; 7113 exit_handlers[EXIT_REASON_VMOFF] = handle_vmxoff; 7114 exit_handlers[EXIT_REASON_VMON] = handle_vmxon; 7115 exit_handlers[EXIT_REASON_INVEPT] = handle_invept; 7116 exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; 7117 exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; 7118 7119 return 0; 7120 } 7121 7122 struct kvm_x86_nested_ops vmx_nested_ops = { 7123 .leave_nested = vmx_leave_nested, 7124 .is_exception_vmexit = nested_vmx_is_exception_vmexit, 7125 .check_events = vmx_check_nested_events, 7126 .has_events = vmx_has_nested_events, 7127 .triple_fault = nested_vmx_triple_fault, 7128 .get_state = vmx_get_nested_state, 7129 .set_state = vmx_set_nested_state, 7130 .get_nested_state_pages = vmx_get_nested_state_pages, 7131 .write_log_dirty = nested_vmx_write_pml_buffer, 7132 .enable_evmcs = nested_enable_evmcs, 7133 .get_evmcs_version = nested_get_evmcs_version, 7134 .hv_inject_synthetic_vmexit_post_tlb_flush = vmx_hv_inject_synthetic_vmexit_post_tlb_flush, 7135 }; 7136
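
/*
 * Illustrative sketch, not from the original file: the comment above
 * VMCS12_IDX_TO_ENC() says the vmcs12 offset table is indexed by the VMCS
 * field encoding rotated left by 6 bits, and the macro rotates right by 6
 * (">> 6 | << 10" on a u16) to undo that.  A concrete round trip, assuming
 * the SDM encoding 0x2000 (IO_BITMAP_A): rotating 0x2000 left by 6 on 16
 * bits gives index 0x0008, and VMCS12_IDX_TO_ENC(0x0008) yields 0x2000
 * again.  The compile-time check below only restates that arithmetic.
 */
static_assert(VMCS12_IDX_TO_ENC(0x0008) == 0x2000,
	      "rotate-right-by-6 must undo the table's rotate-left-by-6 index");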
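
/*
 * Illustrative sketch, not from the original file: the low/high convention
 * described above nested_vmx_setup_ctls_msrs() boils down to "every bit set
 * in the low half must be set in the control, and no bit outside the high
 * half may be set".  vmx_control_verify() expresses the same rule; the
 * hypothetical helper below merely restates it in isolation.
 */
static inline bool nested_vmx_ctl_ok_example(u32 control, u32 low, u32 high)
{
	/* All required (low) bits present and no disallowed (~high) bits set. */
	return (control & low) == low && (control & ~high) == 0;
}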